mecomp_analysis/
timbral.rs

1//! Timbral feature extraction module.
2//!
3//! Contains functions to extract & summarize the zero-crossing rate,
4//! spectral centroid, spectral flatness and spectral roll-off of
5//! a given Song.
6
7use bliss_audio_aubio_rs::vec::CVec;
8use bliss_audio_aubio_rs::{bin_to_freq, PVoc, SpecDesc, SpecShape};
9use ndarray::{arr1, Axis};
10
11use crate::Feature;
12
13use super::errors::{AnalysisError, AnalysisResult};
14use super::utils::{geometric_mean, mean, number_crossings, Normalize};
15use super::SAMPLE_RATE;
16
17/**
18 * General object holding all the spectral descriptor.
19 *
20 * Holds 3 spectral descriptors together. It would be better conceptually
21 * to have 3 different spectral descriptor objects, but this avoids re-computing
22 * the same FFT three times.
23 *
24 * Current spectral descriptors are spectral centroid, spectral rolloff and
25 * spectral flatness (see `values_object` for a further description of the
26 * object.
27 *
28 * All descriptors are currently summarized by their mean only.
29 */
30pub struct SpectralDesc {
31    phase_vocoder: PVoc,
32    sample_rate: u32,
33
34    centroid_aubio_desc: SpecDesc,
35    rolloff_aubio_desc: SpecDesc,
36    values_centroid: Vec<f32>,
37    values_rolloff: Vec<f32>,
38    values_flatness: Vec<f32>,
39}
40
41impl SpectralDesc {
42    pub const WINDOW_SIZE: usize = 512;
43    pub const HOP_SIZE: usize = Self::WINDOW_SIZE / 4;
44
45    /**
46     * Compute score related to the
47     * [spectral centroid](https://en.wikipedia.org/wiki/Spectral_centroid) values,
48     * obtained after repeatedly calling `do_` on all of the song's chunks.
49     *
50     * Spectral centroid is used to determine the "brightness" of a sound, i.e.
51     * how much high frequency there is in an audio signal.
52     *
53     * It of course depends of the instrument used: a piano-only track that makes
54     * use of high frequencies will still score less than a song using a lot of
55     * percussive sound, because the piano frequency range is lower.
56     *
57     * The value range is between 0 and `sample_rate / 2`.
58     */
59    pub fn get_centroid(&mut self) -> Vec<Feature> {
60        vec![
61            self.normalize(Feature::from(mean(&self.values_centroid))),
62            self.normalize(Feature::from(
63                arr1(&self.values_centroid)
64                    .std_axis(Axis(0), 0.)
65                    .into_scalar(),
66            )),
67        ]
68    }
69
70    /**
71     * Compute score related to the spectral roll-off values, obtained
72     * after repeatedly calling `do_` on all of the song's chunks.
73     *
74     * Spectral roll-off is the bin frequency number below which a certain
75     * percentage of the spectral energy is found, here, 95%.
76     *
77     * It can be used to distinguish voiced speech (low roll-off) and unvoiced
78     * speech (high roll-off). It is also a good indication of the energy
79     * repartition of a song.
80     *
81     * The value range is between 0 and `sample_rate / 2`
82     */
83    pub fn get_rolloff(&mut self) -> Vec<Feature> {
84        vec![
85            self.normalize(Feature::from(mean(&self.values_rolloff))),
86            self.normalize(Feature::from(
87                arr1(&self.values_rolloff)
88                    .std_axis(Axis(0), 0.)
89                    .into_scalar(),
90            )),
91        ]
92    }
93
94    /**
95     * Compute score related to the
96     * [spectral flatness](https://en.wikipedia.org/wiki/Spectral_flatness) values,
97     * obtained after repeatedly calling `do_` on all of the song's chunks.
98     *
99     * Spectral flatness is the ratio between the geometric mean of the spectrum
100     * and its arithmetic mean.
101     *
102     * It is used to distinguish between tone-like and noise-like signals.
103     * Tone-like audio is f.ex. a piano key, something that has one or more
104     * specific frequencies, while (white) noise has an equal distribution
105     * of intensity among all frequencies.
106     *
107     * The value range is between 0 and 1, since the geometric mean is always less
108     * than the arithmetic mean.
109     */
110    pub fn get_flatness(&mut self) -> Vec<Feature> {
111        let max_value = 1.;
112        let min_value = 0.;
113        // Range is different from the other spectral algorithms, so normalizing
114        // manually here.
115        vec![
116            2. * (Feature::from(mean(&self.values_flatness)) - min_value) / (max_value - min_value)
117                - 1.,
118            2. * (Feature::from(
119                arr1(&self.values_flatness)
120                    .std_axis(Axis(0), 0.)
121                    .into_scalar(),
122            ) - min_value)
123                / (max_value - min_value)
124                - 1.,
125        ]
126    }
127
128    /// # Errors
129    ///
130    /// This function will return an error if there is an error loading the aubio objects
131    pub fn new(sample_rate: u32) -> AnalysisResult<Self> {
132        Ok(Self {
133            centroid_aubio_desc: SpecDesc::new(SpecShape::Centroid, Self::WINDOW_SIZE).map_err(
134                |e| {
135                    AnalysisError::AnalysisError(format!(
136                        "error while loading aubio centroid object: {e}",
137                    ))
138                },
139            )?,
140            rolloff_aubio_desc: SpecDesc::new(SpecShape::Rolloff, Self::WINDOW_SIZE).map_err(
141                |e| {
142                    AnalysisError::AnalysisError(format!(
143                        "error while loading aubio rolloff object: {e}",
144                    ))
145                },
146            )?,
147            phase_vocoder: PVoc::new(Self::WINDOW_SIZE, Self::HOP_SIZE).map_err(|e| {
148                AnalysisError::AnalysisError(format!("error while loading aubio pvoc object: {e}",))
149            })?,
150            values_centroid: Vec::new(),
151            values_rolloff: Vec::new(),
152            values_flatness: Vec::new(),
153            sample_rate,
154        })
155    }
156
157    /**
158    Compute all the descriptors' value for the given chunk.
159
160    After using this on all the song's chunks, you can call
161    `get_centroid`, `get_flatness` and `get_rolloff` to get the respective
162    descriptors' values.
163    */
164    #[allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]
165    pub fn do_(&mut self, chunk: &[f32]) -> AnalysisResult<()> {
166        let mut fftgrain: Vec<f32> = vec![0.0; Self::WINDOW_SIZE];
167        self.phase_vocoder
168            .do_(chunk, fftgrain.as_mut_slice())
169            .map_err(|e| {
170                AnalysisError::AnalysisError(format!("error while processing aubio pv object: {e}"))
171            })?;
172
173        let bin = self
174            .centroid_aubio_desc
175            .do_result(fftgrain.as_slice())
176            .map_err(|e| {
177                AnalysisError::AnalysisError(format!(
178                    "error while processing aubio centroid object: {e}",
179                ))
180            })?;
181
182        #[allow(clippy::cast_precision_loss)]
183        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
184        self.values_centroid.push(freq);
185
186        let mut bin = self
187            .rolloff_aubio_desc
188            .do_result(fftgrain.as_slice())
189            .unwrap();
190
191        // Until https://github.com/aubio/aubio/pull/318 is in
192        #[allow(clippy::cast_precision_loss)]
193        if bin > Self::WINDOW_SIZE as f32 / 2. {
194            bin = Self::WINDOW_SIZE as f32 / 2.;
195        }
196
197        #[allow(clippy::cast_precision_loss)]
198        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
199        self.values_rolloff.push(freq);
200
201        let cvec: CVec = fftgrain.as_slice().into();
202        let geo_mean = geometric_mean(cvec.norm());
203        if geo_mean == 0.0 {
204            self.values_flatness.push(0.0);
205            return Ok(());
206        }
207        let flatness = geo_mean / mean(cvec.norm());
208        self.values_flatness.push(flatness);
209        Ok(())
210    }
211}
212
213impl Normalize for SpectralDesc {
214    #[allow(clippy::cast_precision_loss)]
215    const MAX_VALUE: Feature = SAMPLE_RATE as Feature / 2.;
216    const MIN_VALUE: Feature = 0.;
217}
218
219/**
220 * [Zero-crossing rate](https://en.wikipedia.org/wiki/Zero-crossing_rate)
221 * detection object.
222 *
223 * Zero-crossing rate is mostly used to detect percussive sounds in an audio
224 * signal, as well as whether an audio signal contains speech or not.
225 *
226 * It is a good metric to differentiate between songs with people speaking clearly,
227 * (e.g. slam) and instrumental songs.
228 *
229 * The value range is between 0 and 1.
230 */
231#[derive(Default, Clone)]
232pub struct ZeroCrossingRateDesc {
233    values: Vec<u32>,
234    number_samples: usize,
235}
236
237impl ZeroCrossingRateDesc {
238    #[allow(dead_code)]
239    #[must_use]
240    pub fn new(_sample_rate: u32) -> Self {
241        Self::default()
242    }
243
244    /// Count the number of zero-crossings for the current `chunk`.
245    pub fn do_(&mut self, chunk: &[f32]) {
246        self.values.push(number_crossings(chunk));
247        self.number_samples += chunk.len();
248    }
249
250    /// Sum the number of zero-crossings witnessed and divide by
251    /// the total number of samples.
252    #[allow(clippy::cast_precision_loss)]
253    pub fn get_value(&mut self) -> Feature {
254        self.normalize(
255            Feature::from(self.values.iter().sum::<u32>()) / self.number_samples as Feature,
256        )
257    }
258}
259
260impl Normalize for ZeroCrossingRateDesc {
261    const MAX_VALUE: Feature = 1.;
262    const MIN_VALUE: Feature = 0.;
263}
264
265#[cfg(test)]
266mod tests {
267    use super::*;
268    use crate::decoder::{Decoder as DecoderTrait, MecompDecoder as Decoder};
269    use std::path::Path;
270
271    #[test]
272    fn test_zcr_boundaries() {
273        let mut zcr_desc = ZeroCrossingRateDesc::default();
274        let chunk = vec![0.; 1024];
275        zcr_desc.do_(&chunk);
276        assert_eq!(-1., zcr_desc.get_value());
277
278        let one_chunk = [-1., 1.];
279        let chunks = std::iter::repeat(one_chunk.iter())
280            .take(512)
281            .flatten()
282            .copied()
283            .collect::<Vec<f32>>();
284        let mut zcr_desc = ZeroCrossingRateDesc::default();
285        zcr_desc.do_(&chunks);
286        assert!(
287            0.001 > (0.998_046_9 - zcr_desc.get_value()).abs(),
288            "{} !~= 0.9980469",
289            zcr_desc.get_value()
290        );
291    }
292
293    #[test]
294    fn test_zcr() {
295        let song = Decoder::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
296        let mut zcr_desc = ZeroCrossingRateDesc::default();
297        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
298            zcr_desc.do_(chunk);
299        }
300        assert!(
301            0.001 > (-0.85036 - zcr_desc.get_value()).abs(),
302            "{} !~= -0.85036",
303            zcr_desc.get_value()
304        );
305    }
306
307    #[test]
308    fn test_spectral_flatness_boundaries() {
309        let mut spectral_desc = SpectralDesc::new(10).unwrap();
310        let chunk = vec![0.; 1024];
311
312        let expected_values = [-1., -1.];
313        spectral_desc.do_(&chunk).unwrap();
314        for (expected, actual) in expected_values
315            .iter()
316            .zip(spectral_desc.get_flatness().iter())
317        {
318            assert!(
319                0.000_000_1 > (expected - actual).abs(),
320                "{expected} !~= {actual}"
321            );
322        }
323
324        let song = Decoder::decode(Path::new("data/white_noise.mp3")).unwrap();
325        let mut spectral_desc = SpectralDesc::new(22050).unwrap();
326        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
327            spectral_desc.do_(chunk).unwrap();
328        }
329        println!("{:?}", spectral_desc.get_flatness());
330        // White noise - as close to 1 as possible
331        let expected_values = [0.578_530_3, -0.942_630_8];
332        for (expected, actual) in expected_values
333            .iter()
334            .zip(spectral_desc.get_flatness().iter())
335        {
336            // original test wanted absolute error < 0.001
337            // assert!(0.001 > (expected - actual).abs(), "{expected} !~= {actual}");
338            let relative_error = (expected - actual).abs() / expected.abs();
339            assert!(
340                relative_error < 0.078,
341                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
342            );
343        }
344    }
345
346    #[test]
347    fn test_spectral_flatness() {
348        let song = Decoder::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
349        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
350        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
351            spectral_desc.do_(chunk).unwrap();
352        }
353        // Spectral flatness mean value computed here with phase vocoder before normalization: 0.111949615
354        // Essentia value with spectrum / hann window: 0.11197535695207445
355        let expected_values = [-0.776_100_75, -0.814_817_9];
356        for (expected, actual) in expected_values
357            .iter()
358            .zip(spectral_desc.get_flatness().iter())
359        {
360            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
361        }
362    }
363
364    #[test]
365    fn test_spectral_roll_off_boundaries() {
366        let mut spectral_desc = SpectralDesc::new(10).unwrap();
367        let chunk = vec![0.; 512];
368
369        let expected_values = [-1., -1.];
370        spectral_desc.do_(&chunk).unwrap();
371        for (expected, actual) in expected_values
372            .iter()
373            .zip(spectral_desc.get_rolloff().iter())
374        {
375            assert!(
376                0.000_000_1 > (expected - actual).abs(),
377                "{expected} !~= {actual}"
378            );
379        }
380
381        let song = Decoder::decode(Path::new("data/tone_11080Hz.flac")).unwrap();
382        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
383        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
384            spectral_desc.do_(chunk).unwrap();
385        }
386        let expected_values = [0.996_768_1, -0.996_151_75];
387        for (expected, actual) in expected_values
388            .iter()
389            .zip(spectral_desc.get_rolloff().iter())
390        {
391            assert!(
392                0.0001 > (expected - actual).abs(),
393                "{expected} !~= {actual}"
394            );
395        }
396    }
397
398    #[test]
399    fn test_spectral_roll_off() {
400        let song = Decoder::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
401        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
402        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
403            spectral_desc.do_(chunk).unwrap();
404        }
405        let expected_values = [-0.632_648_6, -0.726_093_3];
406        // Roll-off mean value computed here with phase vocoder before normalization: 2026.7644
407        // Essentia value with spectrum / hann window: 1979.632683520047
408        for (expected, actual) in expected_values
409            .iter()
410            .zip(spectral_desc.get_rolloff().iter())
411        {
412            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
413        }
414    }
415
416    #[test]
417    fn test_spectral_centroid() {
418        let song = Decoder::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
419        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
420        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
421            spectral_desc.do_(chunk).unwrap();
422        }
423        // Spectral centroid mean value computed here with phase vocoder before normalization: 1354.2273
424        // Essential value with spectrum / hann window: 1351
425        let expected_values = [-0.75483, -0.879_168_87];
426        for (expected, actual) in expected_values
427            .iter()
428            .zip(spectral_desc.get_centroid().iter())
429        {
430            assert!(
431                0.0001 > (expected - actual).abs(),
432                "{expected} !~= {actual}"
433            );
434        }
435    }
436
437    #[test]
438    fn test_spectral_centroid_boundaries() {
439        let mut spectral_desc = SpectralDesc::new(10).unwrap();
440        let chunk = vec![0.; 512];
441
442        spectral_desc.do_(&chunk).unwrap();
443        let expected_values = [-1., -1.];
444        for (expected, actual) in expected_values
445            .iter()
446            .zip(spectral_desc.get_centroid().iter())
447        {
448            assert!(
449                0.000_000_1 > (expected - actual).abs(),
450                "{expected} !~= {actual}"
451            );
452        }
453        let song = Decoder::decode(Path::new("data/tone_11080Hz.flac")).unwrap();
454        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
455        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
456            spectral_desc.do_(chunk).unwrap();
457        }
458        let expected_values = [0.97266, -0.960_992_6];
459        for (expected, actual) in expected_values
460            .iter()
461            .zip(spectral_desc.get_centroid().iter())
462        {
463            // original test wanted absolute error < 0.00001
464            // assert!(0.00001 > (expected - actual).abs(), "{expected} !~= {actual}");
465            let relative_error = (expected - actual).abs() / expected.abs();
466            assert!(
467                relative_error < 0.039,
468                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
469            );
470        }
471    }
472}