mel_filter/lib.rs
1use itertools_num::linspace;
2use num_traits::{AsPrimitive, Float, NumCast, NumOps};
3
4pub enum NormalizationFactor {
5 /// Leave all the triangles aiming for a peak value of 1.0
6 None,
7 /// divide the triangular mel weights by the width of the mel band (area normalization).
8 One,
9 /// Leave all the triangles aiming for a peak value of 1.0
10 Inf,
11}
12
13/// Implementation of `librosa.hz_to_mel`
14///
15/// Convert Hz to Mels
16///
17/// # Parameters
18///
19/// `frequencies` : number or &[..] , float
20/// scalar or slice of frequencies
21/// `htk` : use HTK formula instead of Slaney
22///
23/// # Returns
24/// Vec![..], input frequencies in Mels
25///
26/// # Examples
27///
28/// ```
29/// use mel_filter::hz_to_mel;
30/// assert_eq!(vec![0.8999999999999999], hz_to_mel(&[60.], false));
31/// assert_eq!(vec![0.8999999999999999], hz_to_mel(&[60.], false));
32/// assert_eq!(vec![0.], hz_to_mel(&[0.], false));
33/// assert_eq!(vec![49.91059448015905], hz_to_mel(&[11025.], false));
34/// assert_eq!(vec![ 1.65, 3.3 , 6.6 ], hz_to_mel(vec![110., 220., 440.], false));
35/// ```
36///
37/// # See Also
38///
39/// [mel_to_hz]
40///
41pub fn hz_to_mel<T, A>(frequencies: T, htk: bool) -> Vec<A>
42where
43 T: AsRef<[A]>,
44 A: Copy + Float + NumOps + NumCast,
45{
46 if htk {
47 let n: A = NumCast::from(2595.0).unwrap();
48 return frequencies
49 .as_ref()
50 .iter()
51 .map(|v| n * (*v / NumCast::from(700.0).unwrap() + NumCast::from(1.0).unwrap()).log10())
52 .collect();
53 }
54
55 // Fill in the linear part
56 let f_min = A::zero();
57 let f_sp = NumCast::from(200.0 / 3.).unwrap();
58
59 let mut mels: Vec<A> = frequencies
60 .as_ref()
61 .iter()
62 .map(|v| (*v - f_min) / f_sp)
63 .collect();
64
65 // Fill in the log-scale part
66
67 let min_log_hz = NumCast::from(1000.0).unwrap(); // beginning of log region (Hz)
68 let min_log_mel = (min_log_hz - f_min) / f_sp; // same (Mels)
69 let logstep = NumCast::from((6.4).ln() / 27.0).unwrap(); // step size for log region
70 // If we have array data, vectorize
71 for (idx, val) in frequencies.as_ref().iter().enumerate() {
72 if val >= &min_log_hz {
73 mels[idx] = min_log_mel + (*val / min_log_hz).ln() / logstep;
74 }
75 }
76 // min_log_mel + np.log(frequencies[log_t] / min_log_hz) / logstep
77 mels
78 //let log_t = frequencies >= min_log_hz;
79 //mels[log_t] = min_log_mel + np.log(frequencies[log_t] / min_log_hz) / logstep;
80 //
81 // If we have scalar data, heck directly
82 //min_log_mel + (frequencies / min_log_hz).ln() / logstep;
83}
84
85/// Implementation of `librosa.mel_to_hz`
86///
87/// Convert mel bin numbers to frequencies
88///
89/// # Parameters
90///
91/// `mels` : Vec [shape=(n,)], float mel bins to convert
92/// `htk` : use HTK formula instead of Slaney
93///
94/// # Returns
95///
96/// frequencies : np.ndarray [shape=(n,)]
97/// input mels in Hz
98///
99/// # Examples
100///
101/// ```
102/// use mel_filter::mel_to_hz;
103/// assert_eq!(vec![200.], mel_to_hz(&[3.], false));
104/// assert_eq!(vec![200.], mel_to_hz(&[3.], false));
105///
106/// assert_eq!(vec![ 66.66666666666667, 133.33333333333334, 200. , 266.6666666666667, 333.33333333333337], mel_to_hz(&[1.,2.,3.,4.,5.], false));
107/// assert_eq!(vec![ 66.66666666666667, 133.33333333333334, 200. , 266.6666666666667, 333.33333333333337], mel_to_hz(&[1.,2.,3.,4.,5.], false));
108/// ```
109///
110/// # See Also
111///
112/// [hz_to_mel]
113///
114pub fn mel_to_hz<T, A>(mels: T, htk: bool) -> Vec<A>
115where
116 T: AsRef<[A]>,
117 A: Copy + Float + NumOps + NumCast,
118{
119 if htk {
120 let base: A = NumCast::from(10.0).unwrap();
121 let seven_hundred: A = NumCast::from(700.0).unwrap();
122 return mels
123 .as_ref()
124 .iter()
125 .map(|v| {
126 seven_hundred
127 * (base.powf(*v / NumCast::from(2595.0).unwrap()) - NumCast::from(1.0).unwrap())
128 })
129 .collect();
130 }
131
132 let mels = mels.as_ref();
133
134 // Fill in the linear scale
135 let f_min = A::zero();
136 let f_sp: A = A::from(200.0 / 3.0).unwrap();
137
138 let mut freqs: Vec<A> = mels.iter().map(|v| f_min + f_sp * *v).collect();
139
140 // And now the nonlinear scale
141 let min_log_hz = A::from(1000.0).unwrap(); // beginning of log region (Hz)
142 let min_log_mel = (min_log_hz - f_min) / f_sp; // same (Mels)
143 let logstep = A::from((6.4).ln() / 27.0).unwrap(); // step size for log region
144
145 // If we have vector data, vectorize
146 for (idx, val) in mels.iter().enumerate() {
147 if val >= &min_log_mel {
148 // min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
149 freqs[idx] = min_log_hz * A::exp(logstep * (*val - min_log_mel))
150 }
151 }
152 freqs
153 /*
154 let freqs = f_min + f_sp * mels;
155 if mels >= min_log_mel {
156 // If we have scalar data, check directly
157 freqs = min_log_hz * (logstep * (mels - min_log_mel)).exp();
158 }
159 freqs
160 */
161}
162
163/// Implementation of `librosa.mel_frequencies`
164///
165/// Compute an array of acoustic frequencies tuned to the mel scale.
166///
167/// The mel scale is a quasi-logarithmic function of acoustic frequency
168/// designed such that perceptually similar pitch intervals (e.g. octaves)
169/// appear equal in width over the full hearing range.
170///
171/// Because the definition of the mel scale is conditioned by a finite number
172/// of subjective psychoaoustical experiments, several implementations coexist
173/// in the audio signal processing literature [#]_. By default, librosa replicates
174/// the behavior of the well-established MATLAB Auditory Toolbox of Slaney [#]_.
175/// According to this default implementation, the conversion from Hertz to mel is
176/// linear below 1 kHz and logarithmic above 1 kHz. Another available implementation
177/// replicates the Hidden Markov Toolkit [#]_ (HTK) according to the following formula::
178///
179/// mel = 2595.0 * (1.0 + f / 700.0).log10().
180///
181/// The choice of implementation is determined by the `htk` keyword argument: setting
182/// `htk=false` leads to the Auditory toolbox implementation, whereas setting it `htk=true`
183/// leads to the HTK implementation.
184///
185/// - [#] Umesh, S., Cohen, L., & Nelson, D. Fitting the mel scale.
186/// In Proc. International Conference on Acoustics, Speech, and Signal Processing
187/// (ICASSP), vol. 1, pp. 217-220, 1998.
188///
189/// - [#] Slaney, M. Auditory Toolbox: A MATLAB Toolbox for Auditory
190/// Modeling Work. Technical Report, version 2, Interval Research Corporation, 1998.
191///
192/// - [#] Young, S., Evermann, G., Gales, M., Hain, T., Kershaw, D., Liu, X.,
193/// Moore, G., Odell, J., Ollason, D., Povey, D., Valtchev, V., & Woodland, P.
194/// The HTK book, version 3.4. Cambridge University, March 2009.
195///
196///
197/// # See Also
198/// [hz_to_mel]
199///
200/// [mel_to_hz]
201///
202/// librosa.feature.melspectrogram
203///
204/// librosa.feature.mfcc
205///
206/// # Parameters
207///
208/// `n_mels` : Number of mel bins.
209///
210/// `fmin` : float >= 0, Minimum frequency (Hz).
211///
212/// `fmax` : float >= 0, Maximum frequency (Hz).
213///
214/// `htk` : bool
215/// If True, use HTK formula to convert Hz to mel.
216/// Otherwise (False), use Slaney's Auditory Toolbox.
217///
218/// # Returns
219/// `bin_frequencies` : Vec[shape=(n_mels,)]
220/// Vector of `n_mels` frequencies in Hz which are uniformly spaced on the Mel
221/// axis.
222///
223/// # Examples
224///
225/// ```
226/// use mel_filter::mel_frequencies;
227/// let freqs = mel_frequencies::<f64>(Some(40), None, None, false); // n_mels=40
228/// println!("{:?}", freqs);
229/// let expected = vec![ 0.0 , 85.31725552163941, 170.63451104327882,
230/// 255.95176656491824, 341.26902208655764, 426.586277608197 ,
231/// 511.9035331298365 , 597.2207886514759 , 682.5380441731153 ,
232/// 767.8552996947546 , 853.172555216394 , 938.4898107380334 ,
233/// 1024.8555458780081 , 1119.1140732107583 , 1222.0417930074345 ,
234/// 1334.436032577335 , 1457.1674514162094 , 1591.1867857508237 ,
235/// 1737.532213396153 , 1897.3373959769085 , 2071.840260812287 ,
236/// 2262.3925904926227 , 2470.4704944333835 , 2697.6858435241707 ,
237/// 2945.7987564509885 , 3216.731234416783 , 3512.5820498813423 ,
238/// 3835.64300465582 , 4188.416683294875 , 4573.635839312682 ,
239/// 4994.284564397706 , 5453.621404613084 , 5955.204602651788 ,
240/// 6502.919661685081 , 7101.009444327076 , 7754.107039876304 ,
241/// 8467.271654439703 , 9246.027801961029 , 10096.408099746186 ,
242/// 11025.0];
243/// assert_eq!(freqs, expected);
244/// ```
245///
246pub fn mel_frequencies<T: Float + NumOps>(
247 n_mels: Option<usize>,
248 fmin: Option<T>,
249 fmax: Option<T>,
250 htk: bool,
251) -> Vec<T> {
252 let n_mels = n_mels.unwrap_or(128);
253
254 let fmin = fmin.unwrap_or_else(T::zero);
255
256 let fmax = fmax.unwrap_or_else(|| NumCast::from(11025.0).unwrap());
257
258 // 'Center freqs' of mel bands - uniformly spaced between limits
259 let min_mel = hz_to_mel(&[fmin], htk)[0];
260 let max_mel = hz_to_mel(&[fmax], htk)[0];
261
262 let mels: Vec<_> = linspace::<T>(min_mel, max_mel, n_mels).collect();
263
264 mel_to_hz(mels, htk)
265}
266
267/// Implementation of `librosa.fft_frequencies`
268///
269/// Returns Vec of frequencies lenth is `1 + n_fft/2`.
270/// Frequencies ``(0, sr/n_fft, 2*sr/n_fft, ..., sr/2)``
271///
272/// # Parameters
273///
274/// `sr` : Audio sampling rate
275///
276/// `n_fft` : FFT window size
277///
278/// # Examples
279///
280/// ```
281/// use mel_filter::fft_frequencies;
282/// assert_eq!(fft_frequencies::<f32>(Some(22050), Some(16)), vec![ 0. , 1378.125, 2756.25 , 4134.375,
283/// 5512.5 , 6890.625, 8268.75 , 9646.875, 11025.]);
284/// assert_eq!(fft_frequencies::<f64>(Some(22050), Some(16)), vec![ 0. , 1378.125, 2756.25 , 4134.375,
285/// 5512.5 , 6890.625, 8268.75 , 9646.875, 11025.]);
286/// // array([ 0. , 1378.125, 2756.25 , 4134.375,
287/// // 5512.5 , 6890.625, 8268.75 , 9646.875, 11025. ]));
288/// ```
289///
290pub fn fft_frequencies<T: Float + NumOps>(sr: Option<usize>, n_fft: Option<usize>) -> Vec<T> {
291 let sr: f64 = sr.unwrap_or(22050).as_();
292 let n_fft = n_fft.unwrap_or(2048);
293
294 linspace(T::zero(), NumCast::from(sr / 2.).unwrap(), 1 + n_fft / 2).collect()
295}
296
297/// `librosa.filters.mel`.
298/// Returns 2D array, of shape `n_mels * (1 + n_fft/2)`, Currently is `Vec<Vec<T>>`.
299///
300/// # Arguments
301///
302/// * `sr` - Sampling rate of the incoming signal
303/// * `n_fft` - number of FFT components
304/// * `n_mels` - number of Mel bands to generate, default 128
305/// * `fmin` - lowest frequency (in Hz), default 0.0
306/// * `fmax` - highest frequency (in Hz)
307/// If `None`, use `fmax = sr / 2.0`
308/// * `htk` - use HTK formula instead of Slaney
309/// * `norm` if [NormalizationFactor::One], divide the triangular mel weights by the width of the mel band
310/// (area normalization). Otherwise, leave all the triangles aiming for
311/// a peak value of 1.0
312///
313/// # Examples
314///
315/// ```
316/// use mel_filter::{mel, NormalizationFactor};
317/// let melfb = mel::<f64>(22050, 2048, None, None, None, false, NormalizationFactor::One);
318/// assert_eq!(melfb.len(), 128);
319/// assert_eq!(melfb[0].len(), 1025);
320/// assert_eq!(&melfb[0][..6], &[0f64, 0.016182853208219942, 0.032365706416439884, 0.028990088037379964, 0.012807234829160026, 0.][..], "begin six element");
321/// assert_eq!(&melfb[1][..9], &[0f64, 0., 0., 0.009779235793639925, 0.025962089001859864, 0.035393705451959974, 0.01921085224374004, 0.003027999035520103, 0.][..], "second nine element");
322/// // melfb = [[ 0. , 0.016, ..., 0. , 0. ],
323/// // [ 0. , 0. , ..., 0. , 0. ],
324/// // ...,
325/// // [ 0. , 0. , ..., 0. , 0. ],
326/// // [ 0. , 0. , ..., 0. , 0. ]]
327/// // Clip the maximum frequency to 8KHz
328/// let melfb = mel(22050, 2048, None, None, Some(8000.), false, NormalizationFactor::One);
329/// println!("{:?}", &melfb[0][..10]);
330/// assert_eq!(melfb.len(), 128);
331/// assert_eq!(melfb[0].len(), 1025);
332/// assert_eq!(&melfb[0][..6], &[0f64, 0.01969187633619885, 0.0393837526723977, 0.026457473399387796, 0.006765597063188946, 0.][..], "begin six element");
333/// assert_eq!(&melfb[1][..9], &[0f64, 0., 0., 0.016309077804604378, 0.036000954140803225, 0.029840271930982275, 0.010148395594783432, 0., 0.][..], "second nine element");
334/// // melfb = [[ 0. , 0.02, ..., 0. , 0. ],
335/// // [ 0. , 0. , ..., 0. , 0. ],
336/// // ...,
337/// // [ 0. , 0. , ..., 0. , 0. ],
338/// // [ 0. , 0. , ..., 0. , 0. ]])
339/// ```
340pub fn mel<T: Float + NumOps>(
341 sr: usize,
342 n_fft: usize,
343 n_mels: Option<usize>,
344 fmin: Option<T>,
345 fmax: Option<T>,
346 htk: bool,
347 norm: NormalizationFactor,
348) -> Vec<Vec<T>> {
349 let fmax: T = fmax.unwrap_or_else(|| {
350 let sr: f64 = sr.as_();
351 T::from(sr / 2.).unwrap()
352 });
353
354 // Initialize the weights
355 let n_mels = n_mels.unwrap_or(128);
356 let width = 1 + n_fft / 2;
357 let mut weights = vec![vec![T::zero(); width]; n_mels];
358
359 // Center freqs of each FFT bin
360 let fftfreqs = fft_frequencies(Some(sr), Some(n_fft));
361
362 // 'Center freqs' of mel bands - uniformly spaced between limits
363 let mel_f = mel_frequencies(Some(n_mels + 2), fmin, Some(fmax), htk);
364
365 let fdiff: Vec<_> = mel_f[..mel_f.len() - 1]
366 .iter()
367 .zip(mel_f[1..].iter())
368 .map(|(x, y)| *y - *x)
369 .collect();
370
371 // ramps = np.subtract.outer(mel_f, fftfreqs);
372 let mut ramps = vec![vec![T::zero(); fftfreqs.len()]; mel_f.len()];
373
374 let _ = mel_f
375 .iter()
376 .enumerate()
377 .map(|(i, m)| {
378 fftfreqs
379 .iter()
380 .enumerate()
381 .map(|(j, f)| {
382 ramps[i][j] = *m - *f;
383 })
384 .collect::<Vec<_>>()
385 })
386 .collect::<Vec<_>>();
387
388 for i in 0..n_mels {
389 for j in 0..width {
390 // lower and upper slopes for all bins
391 let lower = -ramps[i][j] / fdiff[i];
392 let upper = ramps[i + 2][j] / fdiff[i + 1]; // +2 is safe since we create `mel_f`
393 // with `n_mels + 2` size
394
395 // .. then intersect them with each other and zero
396 weights[i][j] = T::zero().max(lower.min(upper));
397 }
398 }
399
400 match norm {
401 NormalizationFactor::One => {
402 // Slaney-style mel is scaled to be approx constant energy per channel
403 // enorm = 2.0 / (mel_f[2:n_mels+2] - mel_f[:n_mels])
404 // weights *= enorm[:, np.newaxis]
405
406 let two: T = NumCast::from(2.).unwrap();
407 for (idx, enorm) in mel_f[2..n_mels + 2]
408 .iter()
409 .zip(mel_f[..n_mels].iter())
410 .map(|(x, y)| two / (*x - *y))
411 .enumerate()
412 {
413 let _: Vec<_> = weights[idx].iter_mut().map(|v| *v = (*v) * enorm).collect();
414 }
415 //let enorm = 2.0 / (mel_f[2..n_mels + 2] - mel_f[..n_mels]);
416 // weights *= enorm[.., np.newaxis];
417 }
418 _ => {}
419 }
420
421 // Only check weights if f_mel[0] is positive
422 //if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)) {
423 // // This means we have an empty channel somewhere
424 // warn!("Empty filters detected in mel frequency basis. Some channels will produce empty responses. Try increasing your sampling rate (and fmax) or reducing n_mels.");
425 //}
426
427 weights
428}