Skip to main content

mel_filter/
lib.rs

1use itertools_num::linspace;
2use num_traits::{AsPrimitive, Float, NumCast, NumOps};
3
4pub enum NormalizationFactor {
5    /// Leave all the triangles aiming for a peak value of 1.0
6    None,
7    /// divide the triangular mel weights by the width of the mel band (area normalization).
8    One,
9    /// Leave all the triangles aiming for a peak value of 1.0
10    Inf,
11}
12
13/// Implementation of `librosa.hz_to_mel`
14///
15/// Convert Hz to Mels
16///
17/// # Parameters
18///
19/// `frequencies` : number or &[..] , float
20///     scalar or slice of frequencies
21/// `htk`         : use HTK formula instead of Slaney
22///
23/// # Returns
24/// Vec![..], input frequencies in Mels
25///
26/// # Examples
27///
28/// ```
29/// use mel_filter::hz_to_mel;
30/// assert_eq!(vec![0.8999999999999999], hz_to_mel(&[60.], false));
31/// assert_eq!(vec![0.8999999999999999], hz_to_mel(&[60.], false));
32/// assert_eq!(vec![0.], hz_to_mel(&[0.], false));
33/// assert_eq!(vec![49.91059448015905], hz_to_mel(&[11025.], false));
34/// assert_eq!(vec![ 1.65,  3.3 ,  6.6 ], hz_to_mel(vec![110., 220., 440.], false));
35/// ```
36///
37/// # See Also
38///
39/// [mel_to_hz]
40///
41pub fn hz_to_mel<T, A>(frequencies: T, htk: bool) -> Vec<A>
42where
43    T: AsRef<[A]>,
44    A: Copy + Float + NumOps + NumCast,
45{
46    if htk {
47        let n: A = NumCast::from(2595.0).unwrap();
48        return frequencies
49            .as_ref()
50            .iter()
51            .map(|v| n * (*v / NumCast::from(700.0).unwrap() + NumCast::from(1.0).unwrap()).log10())
52            .collect();
53    }
54
55    // Fill in the linear part
56    let f_min = A::zero();
57    let f_sp = NumCast::from(200.0 / 3.).unwrap();
58
59    let mut mels: Vec<A> = frequencies
60        .as_ref()
61        .iter()
62        .map(|v| (*v - f_min) / f_sp)
63        .collect();
64
65    // Fill in the log-scale part
66
67    let min_log_hz = NumCast::from(1000.0).unwrap(); // beginning of log region (Hz)
68    let min_log_mel = (min_log_hz - f_min) / f_sp; // same (Mels)
69    let logstep = NumCast::from((6.4).ln() / 27.0).unwrap(); // step size for log region
70                                                             // If we have array data, vectorize
71    for (idx, val) in frequencies.as_ref().iter().enumerate() {
72        if val >= &min_log_hz {
73            mels[idx] = min_log_mel + (*val / min_log_hz).ln() / logstep;
74        }
75    }
76    // min_log_mel + np.log(frequencies[log_t] / min_log_hz) / logstep
77    mels
78    //let log_t = frequencies >= min_log_hz;
79    //mels[log_t] = min_log_mel + np.log(frequencies[log_t] / min_log_hz) / logstep;
80    //
81    // If we have scalar data, heck directly
82    //min_log_mel + (frequencies / min_log_hz).ln() / logstep;
83}
84
85/// Implementation of `librosa.mel_to_hz`
86///
87/// Convert mel bin numbers to frequencies
88///
89/// # Parameters
90///
91/// `mels`          : Vec [shape=(n,)], float mel bins to convert
92/// `htk`           : use HTK formula instead of Slaney
93///
94/// # Returns
95///
96/// frequencies   : np.ndarray [shape=(n,)]
97///     input mels in Hz
98///
99/// # Examples
100///
101/// ```
102/// use mel_filter::mel_to_hz;
103/// assert_eq!(vec![200.], mel_to_hz(&[3.], false));
104/// assert_eq!(vec![200.], mel_to_hz(&[3.], false));
105///
106/// assert_eq!(vec![  66.66666666666667,  133.33333333333334,  200.   ,  266.6666666666667,  333.33333333333337], mel_to_hz(&[1.,2.,3.,4.,5.], false));
107/// assert_eq!(vec![  66.66666666666667,  133.33333333333334,  200.   ,  266.6666666666667,  333.33333333333337], mel_to_hz(&[1.,2.,3.,4.,5.], false));
108/// ```
109///
110/// # See Also
111///
112/// [hz_to_mel]
113///
114pub fn mel_to_hz<T, A>(mels: T, htk: bool) -> Vec<A>
115where
116    T: AsRef<[A]>,
117    A: Copy + Float + NumOps + NumCast,
118{
119    if htk {
120        let base: A = NumCast::from(10.0).unwrap();
121        let seven_hundred: A = NumCast::from(700.0).unwrap();
122        return mels
123            .as_ref()
124            .iter()
125            .map(|v| {
126                seven_hundred
127                    * (base.powf(*v / NumCast::from(2595.0).unwrap()) - NumCast::from(1.0).unwrap())
128            })
129            .collect();
130    }
131
132    let mels = mels.as_ref();
133
134    // Fill in the linear scale
135    let f_min = A::zero();
136    let f_sp: A = A::from(200.0 / 3.0).unwrap();
137
138    let mut freqs: Vec<A> = mels.iter().map(|v| f_min + f_sp * *v).collect();
139
140    // And now the nonlinear scale
141    let min_log_hz = A::from(1000.0).unwrap(); // beginning of log region (Hz)
142    let min_log_mel = (min_log_hz - f_min) / f_sp; // same (Mels)
143    let logstep = A::from((6.4).ln() / 27.0).unwrap(); // step size for log region
144
145    // If we have vector data, vectorize
146    for (idx, val) in mels.iter().enumerate() {
147        if val >= &min_log_mel {
148            // min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
149            freqs[idx] = min_log_hz * A::exp(logstep * (*val - min_log_mel))
150        }
151    }
152    freqs
153    /*
154    let freqs = f_min + f_sp * mels;
155    if mels >= min_log_mel {
156        // If we have scalar data, check directly
157        freqs = min_log_hz * (logstep * (mels - min_log_mel)).exp();
158    }
159    freqs
160    */
161}
162
163/// Implementation of `librosa.mel_frequencies`
164///
165/// Compute an array of acoustic frequencies tuned to the mel scale.
166///
167/// The mel scale is a quasi-logarithmic function of acoustic frequency
168/// designed such that perceptually similar pitch intervals (e.g. octaves)
169/// appear equal in width over the full hearing range.
170///
171/// Because the definition of the mel scale is conditioned by a finite number
172/// of subjective psychoaoustical experiments, several implementations coexist
173/// in the audio signal processing literature [#]_. By default, librosa replicates
174/// the behavior of the well-established MATLAB Auditory Toolbox of Slaney [#]_.
175/// According to this default implementation,  the conversion from Hertz to mel is
176/// linear below 1 kHz and logarithmic above 1 kHz. Another available implementation
177/// replicates the Hidden Markov Toolkit [#]_ (HTK) according to the following formula::
178///
179///     mel = 2595.0 * (1.0 + f / 700.0).log10().
180///
181/// The choice of implementation is determined by the `htk` keyword argument: setting
182/// `htk=false` leads to the Auditory toolbox implementation, whereas setting it `htk=true`
183/// leads to the HTK implementation.
184///
185/// - [#] Umesh, S., Cohen, L., & Nelson, D. Fitting the mel scale.
186///     In Proc. International Conference on Acoustics, Speech, and Signal Processing
187///     (ICASSP), vol. 1, pp. 217-220, 1998.
188///
189/// - [#] Slaney, M. Auditory Toolbox: A MATLAB Toolbox for Auditory
190///     Modeling Work. Technical Report, version 2, Interval Research Corporation, 1998.
191///
192/// - [#] Young, S., Evermann, G., Gales, M., Hain, T., Kershaw, D., Liu, X.,
193///     Moore, G., Odell, J., Ollason, D., Povey, D., Valtchev, V., & Woodland, P.
194///     The HTK book, version 3.4. Cambridge University, March 2009.
195///
196///
197/// # See Also
198/// [hz_to_mel]
199///
200/// [mel_to_hz]
201///
202/// librosa.feature.melspectrogram
203///
204/// librosa.feature.mfcc
205///
206/// # Parameters
207///
208/// `n_mels`    : Number of mel bins.
209///
210/// `fmin`      : float >= 0, Minimum frequency (Hz).
211///
212/// `fmax`      : float >= 0, Maximum frequency (Hz).
213///
214/// `htk`       : bool
215///     If True, use HTK formula to convert Hz to mel.
216///     Otherwise (False), use Slaney's Auditory Toolbox.
217///
218/// # Returns
219/// `bin_frequencies` : Vec[shape=(n_mels,)]
220///     Vector of `n_mels` frequencies in Hz which are uniformly spaced on the Mel
221///     axis.
222///
223/// # Examples
224///
225/// ```
226/// use mel_filter::mel_frequencies;
227/// let freqs = mel_frequencies::<f64>(Some(40), None, None, false); // n_mels=40
228/// println!("{:?}", freqs);
229/// let expected = vec![  0.0             ,   85.31725552163941,  170.63451104327882,
230///                     255.95176656491824,  341.26902208655764,  426.586277608197  ,
231///                     511.9035331298365 ,  597.2207886514759 ,  682.5380441731153 ,
232///                     767.8552996947546 ,  853.172555216394  ,  938.4898107380334 ,
233///                    1024.8555458780081 , 1119.1140732107583 , 1222.0417930074345 ,
234///                    1334.436032577335  , 1457.1674514162094 , 1591.1867857508237 ,
235///                    1737.532213396153  , 1897.3373959769085 , 2071.840260812287  ,
236///                    2262.3925904926227 , 2470.4704944333835 , 2697.6858435241707 ,
237///                    2945.7987564509885 , 3216.731234416783  , 3512.5820498813423 ,
238///                    3835.64300465582   , 4188.416683294875  , 4573.635839312682  ,
239///                    4994.284564397706  , 5453.621404613084  , 5955.204602651788  ,
240///                    6502.919661685081  , 7101.009444327076  , 7754.107039876304  ,
241///                    8467.271654439703  , 9246.027801961029  , 10096.408099746186 ,
242///                    11025.0];
243/// assert_eq!(freqs, expected);
244/// ```
245///
246pub fn mel_frequencies<T: Float + NumOps>(
247    n_mels: Option<usize>,
248    fmin: Option<T>,
249    fmax: Option<T>,
250    htk: bool,
251) -> Vec<T> {
252    let n_mels = n_mels.unwrap_or(128);
253
254    let fmin = fmin.unwrap_or_else(T::zero);
255
256    let fmax = fmax.unwrap_or_else(|| NumCast::from(11025.0).unwrap());
257
258    // 'Center freqs' of mel bands - uniformly spaced between limits
259    let min_mel = hz_to_mel(&[fmin], htk)[0];
260    let max_mel = hz_to_mel(&[fmax], htk)[0];
261
262    let mels: Vec<_> = linspace::<T>(min_mel, max_mel, n_mels).collect();
263
264    mel_to_hz(mels, htk)
265}
266
267/// Implementation of `librosa.fft_frequencies`
268///
269/// Returns Vec of frequencies lenth is `1 + n_fft/2`.
270///     Frequencies ``(0, sr/n_fft, 2*sr/n_fft, ..., sr/2)``
271///
272/// # Parameters
273///
274/// `sr` : Audio sampling rate
275///
276/// `n_fft` : FFT window size
277///
278/// # Examples
279///
280/// ```
281/// use mel_filter::fft_frequencies;
282/// assert_eq!(fft_frequencies::<f32>(Some(22050), Some(16)), vec![   0.   ,   1378.125,   2756.25 ,   4134.375,
283///                                                                5512.5  ,   6890.625,   8268.75 ,   9646.875,  11025.]);
284/// assert_eq!(fft_frequencies::<f64>(Some(22050), Some(16)), vec![   0.   ,   1378.125,   2756.25 ,   4134.375,
285///                                                                5512.5  ,   6890.625,   8268.75 ,   9646.875,  11025.]);
286/// // array([     0.   ,   1378.125,   2756.25 ,   4134.375,
287/// //          5512.5  ,   6890.625,   8268.75 ,   9646.875,  11025.   ]));
288/// ```
289///
290pub fn fft_frequencies<T: Float + NumOps>(sr: Option<usize>, n_fft: Option<usize>) -> Vec<T> {
291    let sr: f64 = sr.unwrap_or(22050).as_();
292    let n_fft = n_fft.unwrap_or(2048);
293
294    linspace(T::zero(), NumCast::from(sr / 2.).unwrap(), 1 + n_fft / 2).collect()
295}
296
297/// `librosa.filters.mel`.
298/// Returns 2D array, of shape `n_mels * (1 + n_fft/2)`, Currently is `Vec<Vec<T>>`.
299///
300/// # Arguments
301///
302/// * `sr` - Sampling rate of the incoming signal
303/// * `n_fft` - number of FFT components
304/// * `n_mels` - number of Mel bands to generate, default 128
305/// * `fmin` - lowest frequency (in Hz), default 0.0
306/// * `fmax` - highest frequency (in Hz)
307///     If `None`, use `fmax = sr / 2.0`
308/// * `htk` - use HTK formula instead of Slaney
309/// * `norm` if [NormalizationFactor::One], divide the triangular mel weights by the width of the mel band
310///     (area normalization).  Otherwise, leave all the triangles aiming for
311///     a peak value of 1.0
312///
313/// # Examples
314///
315/// ```
316/// use mel_filter::{mel, NormalizationFactor};
317/// let melfb = mel::<f64>(22050, 2048, None, None, None, false, NormalizationFactor::One);
318/// assert_eq!(melfb.len(), 128);
319/// assert_eq!(melfb[0].len(), 1025);
320/// assert_eq!(&melfb[0][..6], &[0f64, 0.016182853208219942, 0.032365706416439884, 0.028990088037379964, 0.012807234829160026, 0.][..], "begin six element");
321/// assert_eq!(&melfb[1][..9], &[0f64, 0., 0., 0.009779235793639925, 0.025962089001859864, 0.035393705451959974, 0.01921085224374004, 0.003027999035520103, 0.][..], "second nine element");
322/// // melfb = [[ 0.   ,  0.016, ...,  0.   ,  0.   ],
323/// //          [ 0.   ,  0.   , ...,  0.   ,  0.   ],
324/// //          ...,
325/// //          [ 0.   ,  0.   , ...,  0.   ,  0.   ],
326/// //          [ 0.   ,  0.   , ...,  0.   ,  0.   ]]
327/// // Clip the maximum frequency to 8KHz
328/// let melfb = mel(22050, 2048, None, None, Some(8000.), false, NormalizationFactor::One);
329/// println!("{:?}", &melfb[0][..10]);
330/// assert_eq!(melfb.len(), 128);
331/// assert_eq!(melfb[0].len(), 1025);
332/// assert_eq!(&melfb[0][..6], &[0f64, 0.01969187633619885, 0.0393837526723977, 0.026457473399387796, 0.006765597063188946, 0.][..], "begin six element");
333/// assert_eq!(&melfb[1][..9], &[0f64, 0., 0., 0.016309077804604378, 0.036000954140803225, 0.029840271930982275, 0.010148395594783432, 0., 0.][..], "second nine element");
334/// // melfb = [[ 0.  ,  0.02, ...,  0.  ,  0.  ],
335/// //          [ 0.  ,  0.  , ...,  0.  ,  0.  ],
336/// //          ...,
337/// //          [ 0.  ,  0.  , ...,  0.  ,  0.  ],
338/// //          [ 0.  ,  0.  , ...,  0.  ,  0.  ]])
339/// ```
340pub fn mel<T: Float + NumOps>(
341    sr: usize,
342    n_fft: usize,
343    n_mels: Option<usize>,
344    fmin: Option<T>,
345    fmax: Option<T>,
346    htk: bool,
347    norm: NormalizationFactor,
348) -> Vec<Vec<T>> {
349    let fmax: T = fmax.unwrap_or_else(|| {
350        let sr: f64 = sr.as_();
351        T::from(sr / 2.).unwrap()
352    });
353
354    // Initialize the weights
355    let n_mels = n_mels.unwrap_or(128);
356    let width = 1 + n_fft / 2;
357    let mut weights = vec![vec![T::zero(); width]; n_mels];
358
359    // Center freqs of each FFT bin
360    let fftfreqs = fft_frequencies(Some(sr), Some(n_fft));
361
362    // 'Center freqs' of mel bands - uniformly spaced between limits
363    let mel_f = mel_frequencies(Some(n_mels + 2), fmin, Some(fmax), htk);
364
365    let fdiff: Vec<_> = mel_f[..mel_f.len() - 1]
366        .iter()
367        .zip(mel_f[1..].iter())
368        .map(|(x, y)| *y - *x)
369        .collect();
370
371    // ramps = np.subtract.outer(mel_f, fftfreqs);
372    let mut ramps = vec![vec![T::zero(); fftfreqs.len()]; mel_f.len()];
373
374    let _ = mel_f
375        .iter()
376        .enumerate()
377        .map(|(i, m)| {
378            fftfreqs
379                .iter()
380                .enumerate()
381                .map(|(j, f)| {
382                    ramps[i][j] = *m - *f;
383                })
384                .collect::<Vec<_>>()
385        })
386        .collect::<Vec<_>>();
387
388    for i in 0..n_mels {
389        for j in 0..width {
390            // lower and upper slopes for all bins
391            let lower = -ramps[i][j] / fdiff[i];
392            let upper = ramps[i + 2][j] / fdiff[i + 1]; // +2 is safe since we create `mel_f`
393                                                        // with `n_mels + 2` size
394
395            // .. then intersect them with each other and zero
396            weights[i][j] = T::zero().max(lower.min(upper));
397        }
398    }
399
400    match norm {
401        NormalizationFactor::One => {
402            // Slaney-style mel is scaled to be approx constant energy per channel
403            // enorm = 2.0 / (mel_f[2:n_mels+2] - mel_f[:n_mels])
404            // weights *= enorm[:, np.newaxis]
405
406            let two: T = NumCast::from(2.).unwrap();
407            for (idx, enorm) in mel_f[2..n_mels + 2]
408                .iter()
409                .zip(mel_f[..n_mels].iter())
410                .map(|(x, y)| two / (*x - *y))
411                .enumerate()
412            {
413                let _: Vec<_> = weights[idx].iter_mut().map(|v| *v = (*v) * enorm).collect();
414            }
415            //let enorm = 2.0 / (mel_f[2..n_mels + 2] - mel_f[..n_mels]);
416            // weights *= enorm[.., np.newaxis];
417        }
418        _ => {}
419    }
420
421    // Only check weights if f_mel[0] is positive
422    //if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)) {
423    //    // This means we have an empty channel somewhere
424    //    warn!("Empty filters detected in mel frequency basis. Some channels will produce empty responses. Try increasing your sampling rate (and fmax) or reducing n_mels.");
425    //}
426
427    weights
428}