mecomp_analysis/
timbral.rs

1//! Timbral feature extraction module.
2//!
3//! Contains functions to extract & summarize the zero-crossing rate,
4//! spectral centroid, spectral flatness and spectral roll-off of
5//! a given Song.
6
7use bliss_audio_aubio_rs::vec::CVec;
8use bliss_audio_aubio_rs::{PVoc, SpecDesc, SpecShape, bin_to_freq};
9use ndarray::{Axis, arr1};
10
11use crate::Feature;
12
13use super::SAMPLE_RATE;
14use super::errors::{AnalysisError, AnalysisResult};
15use super::utils::{Normalize, geometric_mean, mean, number_crossings};
16
17/**
18 * General object holding all the spectral descriptor.
19 *
20 * Holds 3 spectral descriptors together. It would be better conceptually
21 * to have 3 different spectral descriptor objects, but this avoids re-computing
22 * the same FFT three times.
23 *
24 * Current spectral descriptors are spectral centroid, spectral rolloff and
25 * spectral flatness (see `values_object` for a further description of the
26 * object.
27 *
28 * All descriptors are currently summarized by their mean only.
29 */
30pub struct SpectralDesc {
31    phase_vocoder: PVoc,
32    sample_rate: u32,
33
34    centroid_aubio_desc: SpecDesc,
35    rolloff_aubio_desc: SpecDesc,
36    values_centroid: Vec<f32>,
37    values_rolloff: Vec<f32>,
38    values_flatness: Vec<f32>,
39}
40
41impl SpectralDesc {
42    pub const WINDOW_SIZE: usize = 512;
43    pub const HOP_SIZE: usize = Self::WINDOW_SIZE / 4;
44
45    /**
46     * Compute score related to the
47     * [spectral centroid](https://en.wikipedia.org/wiki/Spectral_centroid) values,
48     * obtained after repeatedly calling `do_` on all of the song's chunks.
49     *
50     * Spectral centroid is used to determine the "brightness" of a sound, i.e.
51     * how much high frequency there is in an audio signal.
52     *
53     * It of course depends of the instrument used: a piano-only track that makes
54     * use of high frequencies will still score less than a song using a lot of
55     * percussive sound, because the piano frequency range is lower.
56     *
57     * The value range is between 0 and `sample_rate / 2`.
58     */
59    #[inline]
60    pub fn get_centroid(&mut self) -> Vec<Feature> {
61        vec![
62            self.normalize(Feature::from(mean(&self.values_centroid))),
63            self.normalize(Feature::from(
64                arr1(&self.values_centroid)
65                    .std_axis(Axis(0), 0.)
66                    .into_scalar(),
67            )),
68        ]
69    }
70
71    /**
72     * Compute score related to the spectral roll-off values, obtained
73     * after repeatedly calling `do_` on all of the song's chunks.
74     *
75     * Spectral roll-off is the bin frequency number below which a certain
76     * percentage of the spectral energy is found, here, 95%.
77     *
78     * It can be used to distinguish voiced speech (low roll-off) and unvoiced
79     * speech (high roll-off). It is also a good indication of the energy
80     * repartition of a song.
81     *
82     * The value range is between 0 and `sample_rate / 2`
83     */
84    #[inline]
85    pub fn get_rolloff(&mut self) -> Vec<Feature> {
86        vec![
87            self.normalize(Feature::from(mean(&self.values_rolloff))),
88            self.normalize(Feature::from(
89                arr1(&self.values_rolloff)
90                    .std_axis(Axis(0), 0.)
91                    .into_scalar(),
92            )),
93        ]
94    }
95
96    /**
97     * Compute score related to the
98     * [spectral flatness](https://en.wikipedia.org/wiki/Spectral_flatness) values,
99     * obtained after repeatedly calling `do_` on all of the song's chunks.
100     *
101     * Spectral flatness is the ratio between the geometric mean of the spectrum
102     * and its arithmetic mean.
103     *
104     * It is used to distinguish between tone-like and noise-like signals.
105     * Tone-like audio is f.ex. a piano key, something that has one or more
106     * specific frequencies, while (white) noise has an equal distribution
107     * of intensity among all frequencies.
108     *
109     * The value range is between 0 and 1, since the geometric mean is always less
110     * than the arithmetic mean.
111     */
112    #[inline]
113    pub fn get_flatness(&mut self) -> Vec<Feature> {
114        let max_value = 1.;
115        let min_value = 0.;
116        // Range is different from the other spectral algorithms, so normalizing
117        // manually here.
118        vec![
119            2. * (Feature::from(mean(&self.values_flatness)) - min_value) / (max_value - min_value)
120                - 1.,
121            2. * (Feature::from(
122                arr1(&self.values_flatness)
123                    .std_axis(Axis(0), 0.)
124                    .into_scalar(),
125            ) - min_value)
126                / (max_value - min_value)
127                - 1.,
128        ]
129    }
130
131    /// # Errors
132    ///
133    /// This function will return an error if there is an error loading the aubio objects
134    #[inline]
135    pub fn new(sample_rate: u32) -> AnalysisResult<Self> {
136        Ok(Self {
137            centroid_aubio_desc: SpecDesc::new(SpecShape::Centroid, Self::WINDOW_SIZE).map_err(
138                |e| {
139                    AnalysisError::AnalysisError(format!(
140                        "error while loading aubio centroid object: {e}",
141                    ))
142                },
143            )?,
144            rolloff_aubio_desc: SpecDesc::new(SpecShape::Rolloff, Self::WINDOW_SIZE).map_err(
145                |e| {
146                    AnalysisError::AnalysisError(format!(
147                        "error while loading aubio rolloff object: {e}",
148                    ))
149                },
150            )?,
151            phase_vocoder: PVoc::new(Self::WINDOW_SIZE, Self::HOP_SIZE).map_err(|e| {
152                AnalysisError::AnalysisError(format!("error while loading aubio pvoc object: {e}",))
153            })?,
154            values_centroid: Vec::new(),
155            values_rolloff: Vec::new(),
156            values_flatness: Vec::new(),
157            sample_rate,
158        })
159    }
160
161    /**
162    Compute all the descriptors' value for the given chunk.
163
164    After using this on all the song's chunks, you can call
165    `get_centroid`, `get_flatness` and `get_rolloff` to get the respective
166    descriptors' values.
167    */
168    #[allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]
169    #[allow(clippy::missing_inline_in_public_items)]
170    pub fn do_(&mut self, chunk: &[f32]) -> AnalysisResult<()> {
171        let mut fftgrain: Vec<f32> = vec![0.0; Self::WINDOW_SIZE];
172        self.phase_vocoder
173            .do_(chunk, fftgrain.as_mut_slice())
174            .map_err(|e| {
175                AnalysisError::AnalysisError(format!("error while processing aubio pv object: {e}"))
176            })?;
177
178        let bin = self
179            .centroid_aubio_desc
180            .do_result(fftgrain.as_slice())
181            .map_err(|e| {
182                AnalysisError::AnalysisError(format!(
183                    "error while processing aubio centroid object: {e}",
184                ))
185            })?;
186
187        #[allow(clippy::cast_precision_loss)]
188        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
189        self.values_centroid.push(freq);
190
191        let mut bin = self
192            .rolloff_aubio_desc
193            .do_result(fftgrain.as_slice())
194            .unwrap();
195
196        // Until https://github.com/aubio/aubio/pull/318 is in
197        #[allow(clippy::cast_precision_loss)]
198        if bin > Self::WINDOW_SIZE as f32 / 2. {
199            bin = Self::WINDOW_SIZE as f32 / 2.;
200        }
201
202        #[allow(clippy::cast_precision_loss)]
203        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
204        self.values_rolloff.push(freq);
205
206        let cvec: CVec = fftgrain.as_slice().into();
207        let geo_mean = geometric_mean(cvec.norm());
208        if geo_mean == 0.0 {
209            self.values_flatness.push(0.0);
210            return Ok(());
211        }
212        let flatness = geo_mean / mean(cvec.norm());
213        self.values_flatness.push(flatness);
214        Ok(())
215    }
216}
217
218impl Normalize for SpectralDesc {
219    #[allow(clippy::cast_precision_loss)]
220    const MAX_VALUE: Feature = SAMPLE_RATE as Feature / 2.;
221    const MIN_VALUE: Feature = 0.;
222}
223
224/**
225 * [Zero-crossing rate](https://en.wikipedia.org/wiki/Zero-crossing_rate)
226 * detection object.
227 *
228 * Zero-crossing rate is mostly used to detect percussive sounds in an audio
229 * signal, as well as whether an audio signal contains speech or not.
230 *
231 * It is a good metric to differentiate between songs with people speaking clearly,
232 * (e.g. slam) and instrumental songs.
233 *
234 * The value range is between 0 and 1.
235 */
236#[derive(Default, Clone)]
237pub struct ZeroCrossingRateDesc {
238    crossings_sum: u32,
239    samples_checked: usize,
240}
241
242impl ZeroCrossingRateDesc {
243    #[must_use]
244    #[inline]
245    pub fn new(_sample_rate: u32) -> Self {
246        Self::default()
247    }
248
249    /// Count the number of zero-crossings for the current `chunk`.
250    #[inline]
251    pub fn do_(&mut self, chunk: &[f32]) {
252        self.crossings_sum += number_crossings(chunk);
253        self.samples_checked += chunk.len();
254    }
255
256    /// Sum the number of zero-crossings witnessed and divide by
257    /// the total number of samples.
258    #[allow(clippy::cast_precision_loss)]
259    #[inline]
260    pub fn get_value(&mut self) -> Feature {
261        self.normalize(Feature::from(self.crossings_sum) / self.samples_checked as Feature)
262    }
263}
264
265impl Normalize for ZeroCrossingRateDesc {
266    const MAX_VALUE: Feature = 1.;
267    const MIN_VALUE: Feature = 0.;
268}
269
270#[cfg(test)]
271mod tests {
272    use super::*;
273    use crate::decoder::{Decoder as DecoderTrait, MecompDecoder as Decoder};
274    use std::path::Path;
275
276    #[test]
277    fn test_zcr_boundaries() {
278        let mut zcr_desc = ZeroCrossingRateDesc::default();
279        let chunk = vec![0.; 1024];
280        zcr_desc.do_(&chunk);
281        let value = zcr_desc.get_value();
282        assert!(f64::EPSILON > (-1. - value).abs(), "{value} !~= -1");
283
284        let one_chunk = [-1., 1.];
285        let chunks = std::iter::repeat_n(one_chunk.iter(), 512)
286            .flatten()
287            .copied()
288            .collect::<Vec<f32>>();
289        let mut zcr_desc = ZeroCrossingRateDesc::default();
290        zcr_desc.do_(&chunks);
291        let value = zcr_desc.get_value();
292        assert!(0.001 > (0.998_046_9 - value).abs(), "{value} !~= 0.9980469");
293    }
294
295    #[test]
296    fn test_zcr() {
297        let song = Decoder::new()
298            .unwrap()
299            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
300            .unwrap();
301        let mut zcr_desc = ZeroCrossingRateDesc::default();
302        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
303            zcr_desc.do_(chunk);
304        }
305        let value = zcr_desc.get_value();
306        assert!(0.001 > (-0.85036 - value).abs(), "{value} !~= -0.85036");
307    }
308
309    #[test]
310    fn test_spectral_flatness_boundaries() {
311        let mut spectral_desc = SpectralDesc::new(10).unwrap();
312        let chunk = vec![0.; 1024];
313
314        let expected_values = [-1., -1.];
315        spectral_desc.do_(&chunk).unwrap();
316        for (expected, actual) in expected_values
317            .iter()
318            .zip(spectral_desc.get_flatness().iter())
319        {
320            assert!(
321                0.000_000_1 > (expected - actual).abs(),
322                "{expected} !~= {actual}"
323            );
324        }
325
326        let song = Decoder::new()
327            .unwrap()
328            .decode(Path::new("data/white_noise.mp3"))
329            .unwrap();
330        let mut spectral_desc = SpectralDesc::new(22050).unwrap();
331        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
332            spectral_desc.do_(chunk).unwrap();
333        }
334        println!("{:?}", spectral_desc.get_flatness());
335        // White noise - as close to 1 as possible
336        let expected_values = [0.578_530_3, -0.942_630_8];
337        for (expected, actual) in expected_values
338            .iter()
339            .zip(spectral_desc.get_flatness().iter())
340        {
341            // original test wanted absolute error < 0.001
342            // assert!(0.001 > (expected - actual).abs(), "{expected} !~= {actual}");
343            let relative_error = (expected - actual).abs() / expected.abs();
344            assert!(
345                relative_error < 0.078,
346                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
347            );
348        }
349    }
350
351    #[test]
352    fn test_spectral_flatness() {
353        let song = Decoder::new()
354            .unwrap()
355            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
356            .unwrap();
357        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
358        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
359            spectral_desc.do_(chunk).unwrap();
360        }
361        // Spectral flatness mean value computed here with phase vocoder before normalization: 0.111949615
362        // Essentia value with spectrum / hann window: 0.11197535695207445
363        let expected_values = [-0.776_100_75, -0.814_817_9];
364        for (expected, actual) in expected_values
365            .iter()
366            .zip(spectral_desc.get_flatness().iter())
367        {
368            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
369        }
370    }
371
372    #[test]
373    fn test_spectral_roll_off_boundaries() {
374        let mut spectral_desc = SpectralDesc::new(10).unwrap();
375        let chunk = vec![0.; 512];
376
377        let expected_values = [-1., -1.];
378        spectral_desc.do_(&chunk).unwrap();
379        for (expected, actual) in expected_values
380            .iter()
381            .zip(spectral_desc.get_rolloff().iter())
382        {
383            assert!(
384                0.000_000_1 > (expected - actual).abs(),
385                "{expected} !~= {actual}"
386            );
387        }
388
389        let song = Decoder::new()
390            .unwrap()
391            .decode(Path::new("data/tone_11080Hz.flac"))
392            .unwrap();
393        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
394        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
395            spectral_desc.do_(chunk).unwrap();
396        }
397        let expected_values = [0.996_768_1, -0.996_151_75];
398        for (expected, actual) in expected_values
399            .iter()
400            .zip(spectral_desc.get_rolloff().iter())
401        {
402            assert!(
403                0.0001 > (expected - actual).abs(),
404                "{expected} !~= {actual}"
405            );
406        }
407    }
408
409    #[test]
410    fn test_spectral_roll_off() {
411        let song = Decoder::new()
412            .unwrap()
413            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
414            .unwrap();
415        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
416        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
417            spectral_desc.do_(chunk).unwrap();
418        }
419        let expected_values = [-0.632_648_6, -0.726_093_3];
420        // Roll-off mean value computed here with phase vocoder before normalization: 2026.7644
421        // Essentia value with spectrum / hann window: 1979.632683520047
422        for (expected, actual) in expected_values
423            .iter()
424            .zip(spectral_desc.get_rolloff().iter())
425        {
426            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
427        }
428    }
429
430    #[test]
431    fn test_spectral_centroid() {
432        let song = Decoder::new()
433            .unwrap()
434            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
435            .unwrap();
436        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
437        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
438            spectral_desc.do_(chunk).unwrap();
439        }
440        // Spectral centroid mean value computed here with phase vocoder before normalization: 1354.2273
441        // Essential value with spectrum / hann window: 1351
442        let expected_values = [-0.75483, -0.879_168_87];
443        for (expected, actual) in expected_values
444            .iter()
445            .zip(spectral_desc.get_centroid().iter())
446        {
447            assert!(
448                0.0001 > (expected - actual).abs(),
449                "{expected} !~= {actual}"
450            );
451        }
452    }
453
454    #[test]
455    fn test_spectral_centroid_boundaries() {
456        let mut spectral_desc = SpectralDesc::new(10).unwrap();
457        let chunk = vec![0.; 512];
458
459        spectral_desc.do_(&chunk).unwrap();
460        let expected_values = [-1., -1.];
461        for (expected, actual) in expected_values
462            .iter()
463            .zip(spectral_desc.get_centroid().iter())
464        {
465            assert!(
466                0.000_000_1 > (expected - actual).abs(),
467                "{expected} !~= {actual}"
468            );
469        }
470        let song = Decoder::new()
471            .unwrap()
472            .decode(Path::new("data/tone_11080Hz.flac"))
473            .unwrap();
474        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
475        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
476            spectral_desc.do_(chunk).unwrap();
477        }
478        let expected_values = [0.97266, -0.960_992_6];
479        for (expected, actual) in expected_values
480            .iter()
481            .zip(spectral_desc.get_centroid().iter())
482        {
483            // original test wanted absolute error < 0.00001
484            // assert!(0.00001 > (expected - actual).abs(), "{expected} !~= {actual}");
485            let relative_error = (expected - actual).abs() / expected.abs();
486            assert!(
487                relative_error < 0.039,
488                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
489            );
490        }
491    }
492}