mecomp_analysis/
timbral.rs

1//! Timbral feature extraction module.
2//!
3//! Contains functions to extract & summarize the zero-crossing rate,
4//! spectral centroid, spectral flatness and spectral roll-off of
5//! a given Song.
6
7use bliss_audio_aubio_rs::vec::CVec;
8use bliss_audio_aubio_rs::{PVoc, SpecDesc, SpecShape, bin_to_freq};
9use likely_stable::{LikelyResult, unlikely};
10use ndarray::{Axis, arr1};
11
12use crate::Feature;
13
14use super::SAMPLE_RATE;
15use super::errors::{AnalysisError, AnalysisResult};
16use super::utils::{Normalize, geometric_mean, mean, number_crossings};
17
18/**
19 * General object holding all the spectral descriptor.
20 *
21 * Holds 3 spectral descriptors together. It would be better conceptually
22 * to have 3 different spectral descriptor objects, but this avoids re-computing
23 * the same FFT three times.
24 *
25 * Current spectral descriptors are spectral centroid, spectral rolloff and
26 * spectral flatness (see `values_object` for a further description of the
27 * object.
28 *
29 * All descriptors are currently summarized by their mean only.
30 */
31pub struct SpectralDesc {
32    phase_vocoder: PVoc,
33    sample_rate: u32,
34
35    centroid_aubio_desc: SpecDesc,
36    rolloff_aubio_desc: SpecDesc,
37    values_centroid: Vec<f32>,
38    values_rolloff: Vec<f32>,
39    values_flatness: Vec<f32>,
40}
41
42impl SpectralDesc {
43    pub const WINDOW_SIZE: usize = 512;
44    pub const HOP_SIZE: usize = Self::WINDOW_SIZE / 4;
45
46    /**
47     * Compute score related to the
48     * [spectral centroid](https://en.wikipedia.org/wiki/Spectral_centroid) values,
49     * obtained after repeatedly calling `do_` on all of the song's chunks.
50     *
51     * Spectral centroid is used to determine the "brightness" of a sound, i.e.
52     * how much high frequency there is in an audio signal.
53     *
54     * It of course depends of the instrument used: a piano-only track that makes
55     * use of high frequencies will still score less than a song using a lot of
56     * percussive sound, because the piano frequency range is lower.
57     *
58     * The value range is between 0 and `sample_rate / 2`.
59     */
60    #[inline]
61    pub fn get_centroid(&mut self) -> [Feature; 2] {
62        [
63            self.normalize(Feature::from(mean(&self.values_centroid))),
64            self.normalize(Feature::from(
65                arr1(&self.values_centroid)
66                    .std_axis(Axis(0), 0.)
67                    .into_scalar(),
68            )),
69        ]
70    }
71
72    /**
73     * Compute score related to the spectral roll-off values, obtained
74     * after repeatedly calling `do_` on all of the song's chunks.
75     *
76     * Spectral roll-off is the bin frequency number below which a certain
77     * percentage of the spectral energy is found, here, 95%.
78     *
79     * It can be used to distinguish voiced speech (low roll-off) and unvoiced
80     * speech (high roll-off). It is also a good indication of the energy
81     * repartition of a song.
82     *
83     * The value range is between 0 and `sample_rate / 2`
84     */
85    #[inline]
86    pub fn get_rolloff(&mut self) -> [Feature; 2] {
87        [
88            self.normalize(Feature::from(mean(&self.values_rolloff))),
89            self.normalize(Feature::from(
90                arr1(&self.values_rolloff)
91                    .std_axis(Axis(0), 0.)
92                    .into_scalar(),
93            )),
94        ]
95    }
96
97    /**
98     * Compute score related to the
99     * [spectral flatness](https://en.wikipedia.org/wiki/Spectral_flatness) values,
100     * obtained after repeatedly calling `do_` on all of the song's chunks.
101     *
102     * Spectral flatness is the ratio between the geometric mean of the spectrum
103     * and its arithmetic mean.
104     *
105     * It is used to distinguish between tone-like and noise-like signals.
106     * Tone-like audio is f.ex. a piano key, something that has one or more
107     * specific frequencies, while (white) noise has an equal distribution
108     * of intensity among all frequencies.
109     *
110     * The value range is between 0 and 1, since the geometric mean is always less
111     * than the arithmetic mean.
112     */
113    #[inline]
114    pub fn get_flatness(&mut self) -> [Feature; 2] {
115        let max_value = 1.;
116        let min_value = 0.;
117        // Range is different from the other spectral algorithms, so normalizing
118        // manually here.
119        [
120            2. * (Feature::from(mean(&self.values_flatness)) - min_value) / (max_value - min_value)
121                - 1.,
122            2. * (Feature::from(
123                arr1(&self.values_flatness)
124                    .std_axis(Axis(0), 0.)
125                    .into_scalar(),
126            ) - min_value)
127                / (max_value - min_value)
128                - 1.,
129        ]
130    }
131
132    /// # Errors
133    ///
134    /// This function will return an error if there is an error loading the aubio objects
135    #[inline]
136    pub fn new(sample_rate: u32) -> AnalysisResult<Self> {
137        Ok(Self {
138            centroid_aubio_desc: SpecDesc::new(SpecShape::Centroid, Self::WINDOW_SIZE)
139                .map_err_unlikely(|e| {
140                    AnalysisError::AnalysisError(format!(
141                        "error while loading aubio centroid object: {e}",
142                    ))
143                })?,
144            rolloff_aubio_desc: SpecDesc::new(SpecShape::Rolloff, Self::WINDOW_SIZE)
145                .map_err_unlikely(|e| {
146                    AnalysisError::AnalysisError(format!(
147                        "error while loading aubio rolloff object: {e}",
148                    ))
149                })?,
150            phase_vocoder: PVoc::new(Self::WINDOW_SIZE, Self::HOP_SIZE).map_err_unlikely(|e| {
151                AnalysisError::AnalysisError(format!("error while loading aubio pvoc object: {e}",))
152            })?,
153            values_centroid: Vec::new(),
154            values_rolloff: Vec::new(),
155            values_flatness: Vec::new(),
156            sample_rate,
157        })
158    }
159
160    /**
161    Compute all the descriptors' value for the given chunk.
162
163    After using this on all the song's chunks, you can call
164    `get_centroid`, `get_flatness` and `get_rolloff` to get the respective
165    descriptors' values.
166    */
167    #[allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]
168    #[allow(clippy::missing_inline_in_public_items)]
169    pub fn do_(&mut self, chunk: &[f32]) -> AnalysisResult<()> {
170        let mut fftgrain: Vec<f32> = vec![0.0; Self::WINDOW_SIZE];
171        self.phase_vocoder
172            .do_(chunk, fftgrain.as_mut_slice())
173            .map_err_unlikely(|e| {
174                AnalysisError::AnalysisError(format!("error while processing aubio pv object: {e}"))
175            })?;
176
177        let bin = self
178            .centroid_aubio_desc
179            .do_result(fftgrain.as_slice())
180            .map_err_unlikely(|e| {
181                AnalysisError::AnalysisError(format!(
182                    "error while processing aubio centroid object: {e}",
183                ))
184            })?;
185
186        #[allow(clippy::cast_precision_loss)]
187        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
188        self.values_centroid.push(freq);
189
190        let mut bin = self
191            .rolloff_aubio_desc
192            .do_result(fftgrain.as_slice())
193            .map_err_unlikely(|e| {
194                AnalysisError::AnalysisError(format!(
195                    "error while processing aubio rolloff object: {e}",
196                ))
197            })?;
198
199        // Until https://github.com/aubio/aubio/pull/318 is in
200        #[allow(clippy::cast_precision_loss)]
201        if unlikely(bin > Self::WINDOW_SIZE as f32 / 2.) {
202            // only happens on the last chunk
203            bin = Self::WINDOW_SIZE as f32 / 2.;
204        }
205
206        #[allow(clippy::cast_precision_loss)]
207        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
208        self.values_rolloff.push(freq);
209
210        let cvec: CVec<'_> = fftgrain.as_slice().into();
211        let geo_mean = geometric_mean(cvec.norm());
212        if unlikely(geo_mean == 0.0) {
213            self.values_flatness.push(0.0);
214            return Ok(());
215        }
216        let flatness = geo_mean / mean(cvec.norm());
217        self.values_flatness.push(flatness);
218        Ok(())
219    }
220}
221
222impl Normalize for SpectralDesc {
223    #[allow(clippy::cast_precision_loss)]
224    const MAX_VALUE: Feature = SAMPLE_RATE as Feature / 2.;
225    const MIN_VALUE: Feature = 0.;
226}
227
228/**
229 * [Zero-crossing rate](https://en.wikipedia.org/wiki/Zero-crossing_rate)
230 * detection object.
231 *
232 * Zero-crossing rate is mostly used to detect percussive sounds in an audio
233 * signal, as well as whether an audio signal contains speech or not.
234 *
235 * It is a good metric to differentiate between songs with people speaking clearly,
236 * (e.g. slam) and instrumental songs.
237 *
238 * The value range is between 0 and 1.
239 */
240#[derive(Default, Clone)]
241pub struct ZeroCrossingRateDesc {
242    crossings_sum: u32,
243    samples_checked: usize,
244}
245
246impl ZeroCrossingRateDesc {
247    #[must_use]
248    #[inline]
249    pub fn new(_sample_rate: u32) -> Self {
250        Self::default()
251    }
252
253    /// Count the number of zero-crossings for the current `chunk`.
254    #[inline]
255    pub fn do_(&mut self, chunk: &[f32]) {
256        self.crossings_sum += number_crossings(chunk);
257        self.samples_checked += chunk.len();
258    }
259
260    /// Sum the number of zero-crossings witnessed and divide by
261    /// the total number of samples.
262    #[allow(clippy::cast_precision_loss)]
263    #[inline]
264    pub fn get_value(&mut self) -> Feature {
265        self.normalize(Feature::from(self.crossings_sum) / self.samples_checked as Feature)
266    }
267}
268
269impl Normalize for ZeroCrossingRateDesc {
270    const MAX_VALUE: Feature = 1.;
271    const MIN_VALUE: Feature = 0.;
272}
273
274#[cfg(test)]
275mod tests {
276    use super::*;
277    use crate::decoder::{Decoder as DecoderTrait, MecompDecoder as Decoder};
278    use std::path::Path;
279
280    #[test]
281    fn test_zcr_boundaries() {
282        let mut zcr_desc = ZeroCrossingRateDesc::default();
283        let chunk = vec![0.; 1024];
284        zcr_desc.do_(&chunk);
285        let value = zcr_desc.get_value();
286        assert!(f64::EPSILON > (-1. - value).abs(), "{value} !~= -1");
287
288        let one_chunk = [-1., 1.];
289        let chunks = std::iter::repeat_n(one_chunk.iter(), 512)
290            .flatten()
291            .copied()
292            .collect::<Vec<f32>>();
293        let mut zcr_desc = ZeroCrossingRateDesc::default();
294        zcr_desc.do_(&chunks);
295        let value = zcr_desc.get_value();
296        assert!(0.001 > (0.998_046_9 - value).abs(), "{value} !~= 0.9980469");
297    }
298
299    #[test]
300    fn test_zcr() {
301        let song = Decoder::new()
302            .unwrap()
303            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
304            .unwrap();
305        let mut zcr_desc = ZeroCrossingRateDesc::default();
306        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
307            zcr_desc.do_(chunk);
308        }
309        let value = zcr_desc.get_value();
310        assert!(0.001 > (-0.85036 - value).abs(), "{value} !~= -0.85036");
311    }
312
313    #[test]
314    fn test_spectral_flatness_boundaries() {
315        let mut spectral_desc = SpectralDesc::new(10).unwrap();
316        let chunk = vec![0.; 1024];
317
318        let expected_values = [-1., -1.];
319        spectral_desc.do_(&chunk).unwrap();
320        for (expected, actual) in expected_values
321            .iter()
322            .zip(spectral_desc.get_flatness().iter())
323        {
324            assert!(
325                0.000_000_1 > (expected - actual).abs(),
326                "{expected} !~= {actual}"
327            );
328        }
329
330        let song = Decoder::new()
331            .unwrap()
332            .decode(Path::new("data/white_noise.mp3"))
333            .unwrap();
334        let mut spectral_desc = SpectralDesc::new(22050).unwrap();
335        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
336            spectral_desc.do_(chunk).unwrap();
337        }
338        println!("{:?}", spectral_desc.get_flatness());
339        // White noise - as close to 1 as possible
340        let expected_values = [0.578_530_3, -0.942_630_8];
341        for (expected, actual) in expected_values
342            .iter()
343            .zip(spectral_desc.get_flatness().iter())
344        {
345            // original test wanted absolute error < 0.001
346            // assert!(0.001 > (expected - actual).abs(), "{expected} !~= {actual}");
347            let relative_error = (expected - actual).abs() / expected.abs();
348            assert!(
349                relative_error < 0.078,
350                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
351            );
352        }
353    }
354
355    #[test]
356    fn test_spectral_flatness() {
357        let song = Decoder::new()
358            .unwrap()
359            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
360            .unwrap();
361        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
362        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
363            spectral_desc.do_(chunk).unwrap();
364        }
365        // Spectral flatness mean value computed here with phase vocoder before normalization: 0.111949615
366        // Essentia value with spectrum / hann window: 0.11197535695207445
367        let expected_values = [-0.776_100_75, -0.814_817_9];
368        for (expected, actual) in expected_values
369            .iter()
370            .zip(spectral_desc.get_flatness().iter())
371        {
372            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
373        }
374    }
375
376    #[test]
377    fn test_spectral_roll_off_boundaries() {
378        let mut spectral_desc = SpectralDesc::new(10).unwrap();
379        let chunk = vec![0.; 512];
380
381        let expected_values = [-1., -1.];
382        spectral_desc.do_(&chunk).unwrap();
383        for (expected, actual) in expected_values
384            .iter()
385            .zip(spectral_desc.get_rolloff().iter())
386        {
387            assert!(
388                0.000_000_1 > (expected - actual).abs(),
389                "{expected} !~= {actual}"
390            );
391        }
392
393        let song = Decoder::new()
394            .unwrap()
395            .decode(Path::new("data/tone_11080Hz.flac"))
396            .unwrap();
397        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
398        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
399            spectral_desc.do_(chunk).unwrap();
400        }
401        let expected_values = [0.996_768_1, -0.996_151_75];
402        for (expected, actual) in expected_values
403            .iter()
404            .zip(spectral_desc.get_rolloff().iter())
405        {
406            assert!(
407                0.0001 > (expected - actual).abs(),
408                "{expected} !~= {actual}"
409            );
410        }
411    }
412
413    #[test]
414    fn test_spectral_roll_off() {
415        let song = Decoder::new()
416            .unwrap()
417            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
418            .unwrap();
419        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
420        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
421            spectral_desc.do_(chunk).unwrap();
422        }
423        let expected_values = [-0.632_648_6, -0.726_093_3];
424        // Roll-off mean value computed here with phase vocoder before normalization: 2026.7644
425        // Essentia value with spectrum / hann window: 1979.632683520047
426        for (expected, actual) in expected_values
427            .iter()
428            .zip(spectral_desc.get_rolloff().iter())
429        {
430            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
431        }
432    }
433
434    #[test]
435    fn test_spectral_centroid() {
436        let song = Decoder::new()
437            .unwrap()
438            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
439            .unwrap();
440        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
441        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
442            spectral_desc.do_(chunk).unwrap();
443        }
444        // Spectral centroid mean value computed here with phase vocoder before normalization: 1354.2273
445        // Essential value with spectrum / hann window: 1351
446        let expected_values = [-0.75483, -0.879_168_87];
447        for (expected, actual) in expected_values
448            .iter()
449            .zip(spectral_desc.get_centroid().iter())
450        {
451            assert!(
452                0.0001 > (expected - actual).abs(),
453                "{expected} !~= {actual}"
454            );
455        }
456    }
457
458    #[test]
459    fn test_spectral_centroid_boundaries() {
460        let mut spectral_desc = SpectralDesc::new(10).unwrap();
461        let chunk = vec![0.; 512];
462
463        spectral_desc.do_(&chunk).unwrap();
464        let expected_values = [-1., -1.];
465        for (expected, actual) in expected_values
466            .iter()
467            .zip(spectral_desc.get_centroid().iter())
468        {
469            assert!(
470                0.000_000_1 > (expected - actual).abs(),
471                "{expected} !~= {actual}"
472            );
473        }
474        let song = Decoder::new()
475            .unwrap()
476            .decode(Path::new("data/tone_11080Hz.flac"))
477            .unwrap();
478        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
479        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
480            spectral_desc.do_(chunk).unwrap();
481        }
482        let expected_values = [0.97266, -0.960_992_6];
483        for (expected, actual) in expected_values
484            .iter()
485            .zip(spectral_desc.get_centroid().iter())
486        {
487            // original test wanted absolute error < 0.00001
488            // assert!(0.00001 > (expected - actual).abs(), "{expected} !~= {actual}");
489            let relative_error = (expected - actual).abs() / expected.abs();
490            assert!(
491                relative_error < 0.039,
492                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
493            );
494        }
495    }
496}