mecomp_analysis/
timbral.rs

1//! Timbral feature extraction module.
2//!
3//! Contains functions to extract & summarize the zero-crossing rate,
4//! spectral centroid, spectral flatness and spectral roll-off of
5//! a given Song.
6
7use bliss_audio_aubio_rs::vec::CVec;
8use bliss_audio_aubio_rs::{bin_to_freq, PVoc, SpecDesc, SpecShape};
9use ndarray::{arr1, Axis};
10
11use crate::Feature;
12
13use super::errors::{AnalysisError, AnalysisResult};
14use super::utils::{geometric_mean, mean, number_crossings, Normalize};
15use super::SAMPLE_RATE;
16
17/**
18 * General object holding all the spectral descriptor.
19 *
20 * Holds 3 spectral descriptors together. It would be better conceptually
21 * to have 3 different spectral descriptor objects, but this avoids re-computing
22 * the same FFT three times.
23 *
24 * Current spectral descriptors are spectral centroid, spectral rolloff and
25 * spectral flatness (see `values_object` for a further description of the
26 * object.
27 *
28 * All descriptors are currently summarized by their mean only.
29 */
30pub struct SpectralDesc {
31    phase_vocoder: PVoc,
32    sample_rate: u32,
33
34    centroid_aubio_desc: SpecDesc,
35    rolloff_aubio_desc: SpecDesc,
36    values_centroid: Vec<f32>,
37    values_rolloff: Vec<f32>,
38    values_flatness: Vec<f32>,
39}
40
41impl SpectralDesc {
42    pub const WINDOW_SIZE: usize = 512;
43    pub const HOP_SIZE: usize = Self::WINDOW_SIZE / 4;
44
45    /**
46     * Compute score related to the
47     * [spectral centroid](https://en.wikipedia.org/wiki/Spectral_centroid) values,
48     * obtained after repeatedly calling `do_` on all of the song's chunks.
49     *
50     * Spectral centroid is used to determine the "brightness" of a sound, i.e.
51     * how much high frequency there is in an audio signal.
52     *
53     * It of course depends of the instrument used: a piano-only track that makes
54     * use of high frequencies will still score less than a song using a lot of
55     * percussive sound, because the piano frequency range is lower.
56     *
57     * The value range is between 0 and `sample_rate / 2`.
58     */
59    #[inline]
60    pub fn get_centroid(&mut self) -> Vec<Feature> {
61        vec![
62            self.normalize(Feature::from(mean(&self.values_centroid))),
63            self.normalize(Feature::from(
64                arr1(&self.values_centroid)
65                    .std_axis(Axis(0), 0.)
66                    .into_scalar(),
67            )),
68        ]
69    }
70
71    /**
72     * Compute score related to the spectral roll-off values, obtained
73     * after repeatedly calling `do_` on all of the song's chunks.
74     *
75     * Spectral roll-off is the bin frequency number below which a certain
76     * percentage of the spectral energy is found, here, 95%.
77     *
78     * It can be used to distinguish voiced speech (low roll-off) and unvoiced
79     * speech (high roll-off). It is also a good indication of the energy
80     * repartition of a song.
81     *
82     * The value range is between 0 and `sample_rate / 2`
83     */
84    #[inline]
85    pub fn get_rolloff(&mut self) -> Vec<Feature> {
86        vec![
87            self.normalize(Feature::from(mean(&self.values_rolloff))),
88            self.normalize(Feature::from(
89                arr1(&self.values_rolloff)
90                    .std_axis(Axis(0), 0.)
91                    .into_scalar(),
92            )),
93        ]
94    }
95
96    /**
97     * Compute score related to the
98     * [spectral flatness](https://en.wikipedia.org/wiki/Spectral_flatness) values,
99     * obtained after repeatedly calling `do_` on all of the song's chunks.
100     *
101     * Spectral flatness is the ratio between the geometric mean of the spectrum
102     * and its arithmetic mean.
103     *
104     * It is used to distinguish between tone-like and noise-like signals.
105     * Tone-like audio is f.ex. a piano key, something that has one or more
106     * specific frequencies, while (white) noise has an equal distribution
107     * of intensity among all frequencies.
108     *
109     * The value range is between 0 and 1, since the geometric mean is always less
110     * than the arithmetic mean.
111     */
112    #[inline]
113    pub fn get_flatness(&mut self) -> Vec<Feature> {
114        let max_value = 1.;
115        let min_value = 0.;
116        // Range is different from the other spectral algorithms, so normalizing
117        // manually here.
118        vec![
119            2. * (Feature::from(mean(&self.values_flatness)) - min_value) / (max_value - min_value)
120                - 1.,
121            2. * (Feature::from(
122                arr1(&self.values_flatness)
123                    .std_axis(Axis(0), 0.)
124                    .into_scalar(),
125            ) - min_value)
126                / (max_value - min_value)
127                - 1.,
128        ]
129    }
130
131    /// # Errors
132    ///
133    /// This function will return an error if there is an error loading the aubio objects
134    #[inline]
135    pub fn new(sample_rate: u32) -> AnalysisResult<Self> {
136        Ok(Self {
137            centroid_aubio_desc: SpecDesc::new(SpecShape::Centroid, Self::WINDOW_SIZE).map_err(
138                |e| {
139                    AnalysisError::AnalysisError(format!(
140                        "error while loading aubio centroid object: {e}",
141                    ))
142                },
143            )?,
144            rolloff_aubio_desc: SpecDesc::new(SpecShape::Rolloff, Self::WINDOW_SIZE).map_err(
145                |e| {
146                    AnalysisError::AnalysisError(format!(
147                        "error while loading aubio rolloff object: {e}",
148                    ))
149                },
150            )?,
151            phase_vocoder: PVoc::new(Self::WINDOW_SIZE, Self::HOP_SIZE).map_err(|e| {
152                AnalysisError::AnalysisError(format!("error while loading aubio pvoc object: {e}",))
153            })?,
154            values_centroid: Vec::new(),
155            values_rolloff: Vec::new(),
156            values_flatness: Vec::new(),
157            sample_rate,
158        })
159    }
160
161    /**
162    Compute all the descriptors' value for the given chunk.
163
164    After using this on all the song's chunks, you can call
165    `get_centroid`, `get_flatness` and `get_rolloff` to get the respective
166    descriptors' values.
167    */
168    #[allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]
169    #[allow(clippy::missing_inline_in_public_items)]
170    pub fn do_(&mut self, chunk: &[f32]) -> AnalysisResult<()> {
171        let mut fftgrain: Vec<f32> = vec![0.0; Self::WINDOW_SIZE];
172        self.phase_vocoder
173            .do_(chunk, fftgrain.as_mut_slice())
174            .map_err(|e| {
175                AnalysisError::AnalysisError(format!("error while processing aubio pv object: {e}"))
176            })?;
177
178        let bin = self
179            .centroid_aubio_desc
180            .do_result(fftgrain.as_slice())
181            .map_err(|e| {
182                AnalysisError::AnalysisError(format!(
183                    "error while processing aubio centroid object: {e}",
184                ))
185            })?;
186
187        #[allow(clippy::cast_precision_loss)]
188        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
189        self.values_centroid.push(freq);
190
191        let mut bin = self
192            .rolloff_aubio_desc
193            .do_result(fftgrain.as_slice())
194            .unwrap();
195
196        // Until https://github.com/aubio/aubio/pull/318 is in
197        #[allow(clippy::cast_precision_loss)]
198        if bin > Self::WINDOW_SIZE as f32 / 2. {
199            bin = Self::WINDOW_SIZE as f32 / 2.;
200        }
201
202        #[allow(clippy::cast_precision_loss)]
203        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
204        self.values_rolloff.push(freq);
205
206        let cvec: CVec = fftgrain.as_slice().into();
207        let geo_mean = geometric_mean(cvec.norm());
208        if geo_mean == 0.0 {
209            self.values_flatness.push(0.0);
210            return Ok(());
211        }
212        let flatness = geo_mean / mean(cvec.norm());
213        self.values_flatness.push(flatness);
214        Ok(())
215    }
216}
217
218impl Normalize for SpectralDesc {
219    #[allow(clippy::cast_precision_loss)]
220    const MAX_VALUE: Feature = SAMPLE_RATE as Feature / 2.;
221    const MIN_VALUE: Feature = 0.;
222}
223
224/**
225 * [Zero-crossing rate](https://en.wikipedia.org/wiki/Zero-crossing_rate)
226 * detection object.
227 *
228 * Zero-crossing rate is mostly used to detect percussive sounds in an audio
229 * signal, as well as whether an audio signal contains speech or not.
230 *
231 * It is a good metric to differentiate between songs with people speaking clearly,
232 * (e.g. slam) and instrumental songs.
233 *
234 * The value range is between 0 and 1.
235 */
236#[derive(Default, Clone)]
237pub struct ZeroCrossingRateDesc {
238    values: Vec<u32>,
239    number_samples: usize,
240}
241
242impl ZeroCrossingRateDesc {
243    #[must_use]
244    #[inline]
245    pub fn new(_sample_rate: u32) -> Self {
246        Self::default()
247    }
248
249    /// Count the number of zero-crossings for the current `chunk`.
250    #[inline]
251    pub fn do_(&mut self, chunk: &[f32]) {
252        self.values.push(number_crossings(chunk));
253        self.number_samples += chunk.len();
254    }
255
256    /// Sum the number of zero-crossings witnessed and divide by
257    /// the total number of samples.
258    #[allow(clippy::cast_precision_loss)]
259    #[inline]
260    pub fn get_value(&mut self) -> Feature {
261        self.normalize(
262            Feature::from(self.values.iter().sum::<u32>()) / self.number_samples as Feature,
263        )
264    }
265}
266
267impl Normalize for ZeroCrossingRateDesc {
268    const MAX_VALUE: Feature = 1.;
269    const MIN_VALUE: Feature = 0.;
270}
271
272#[cfg(test)]
273mod tests {
274    use super::*;
275    use crate::decoder::{Decoder as DecoderTrait, MecompDecoder as Decoder};
276    use std::path::Path;
277
278    #[test]
279    fn test_zcr_boundaries() {
280        let mut zcr_desc = ZeroCrossingRateDesc::default();
281        let chunk = vec![0.; 1024];
282        zcr_desc.do_(&chunk);
283        assert_eq!(-1., zcr_desc.get_value());
284
285        let one_chunk = [-1., 1.];
286        let chunks = std::iter::repeat(one_chunk.iter())
287            .take(512)
288            .flatten()
289            .copied()
290            .collect::<Vec<f32>>();
291        let mut zcr_desc = ZeroCrossingRateDesc::default();
292        zcr_desc.do_(&chunks);
293        assert!(
294            0.001 > (0.998_046_9 - zcr_desc.get_value()).abs(),
295            "{} !~= 0.9980469",
296            zcr_desc.get_value()
297        );
298    }
299
300    #[test]
301    fn test_zcr() {
302        let song = Decoder::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
303        let mut zcr_desc = ZeroCrossingRateDesc::default();
304        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
305            zcr_desc.do_(chunk);
306        }
307        assert!(
308            0.001 > (-0.85036 - zcr_desc.get_value()).abs(),
309            "{} !~= -0.85036",
310            zcr_desc.get_value()
311        );
312    }
313
314    #[test]
315    fn test_spectral_flatness_boundaries() {
316        let mut spectral_desc = SpectralDesc::new(10).unwrap();
317        let chunk = vec![0.; 1024];
318
319        let expected_values = [-1., -1.];
320        spectral_desc.do_(&chunk).unwrap();
321        for (expected, actual) in expected_values
322            .iter()
323            .zip(spectral_desc.get_flatness().iter())
324        {
325            assert!(
326                0.000_000_1 > (expected - actual).abs(),
327                "{expected} !~= {actual}"
328            );
329        }
330
331        let song = Decoder::decode(Path::new("data/white_noise.mp3")).unwrap();
332        let mut spectral_desc = SpectralDesc::new(22050).unwrap();
333        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
334            spectral_desc.do_(chunk).unwrap();
335        }
336        println!("{:?}", spectral_desc.get_flatness());
337        // White noise - as close to 1 as possible
338        let expected_values = [0.578_530_3, -0.942_630_8];
339        for (expected, actual) in expected_values
340            .iter()
341            .zip(spectral_desc.get_flatness().iter())
342        {
343            // original test wanted absolute error < 0.001
344            // assert!(0.001 > (expected - actual).abs(), "{expected} !~= {actual}");
345            let relative_error = (expected - actual).abs() / expected.abs();
346            assert!(
347                relative_error < 0.078,
348                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
349            );
350        }
351    }
352
353    #[test]
354    fn test_spectral_flatness() {
355        let song = Decoder::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
356        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
357        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
358            spectral_desc.do_(chunk).unwrap();
359        }
360        // Spectral flatness mean value computed here with phase vocoder before normalization: 0.111949615
361        // Essentia value with spectrum / hann window: 0.11197535695207445
362        let expected_values = [-0.776_100_75, -0.814_817_9];
363        for (expected, actual) in expected_values
364            .iter()
365            .zip(spectral_desc.get_flatness().iter())
366        {
367            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
368        }
369    }
370
371    #[test]
372    fn test_spectral_roll_off_boundaries() {
373        let mut spectral_desc = SpectralDesc::new(10).unwrap();
374        let chunk = vec![0.; 512];
375
376        let expected_values = [-1., -1.];
377        spectral_desc.do_(&chunk).unwrap();
378        for (expected, actual) in expected_values
379            .iter()
380            .zip(spectral_desc.get_rolloff().iter())
381        {
382            assert!(
383                0.000_000_1 > (expected - actual).abs(),
384                "{expected} !~= {actual}"
385            );
386        }
387
388        let song = Decoder::decode(Path::new("data/tone_11080Hz.flac")).unwrap();
389        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
390        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
391            spectral_desc.do_(chunk).unwrap();
392        }
393        let expected_values = [0.996_768_1, -0.996_151_75];
394        for (expected, actual) in expected_values
395            .iter()
396            .zip(spectral_desc.get_rolloff().iter())
397        {
398            assert!(
399                0.0001 > (expected - actual).abs(),
400                "{expected} !~= {actual}"
401            );
402        }
403    }
404
405    #[test]
406    fn test_spectral_roll_off() {
407        let song = Decoder::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
408        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
409        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
410            spectral_desc.do_(chunk).unwrap();
411        }
412        let expected_values = [-0.632_648_6, -0.726_093_3];
413        // Roll-off mean value computed here with phase vocoder before normalization: 2026.7644
414        // Essentia value with spectrum / hann window: 1979.632683520047
415        for (expected, actual) in expected_values
416            .iter()
417            .zip(spectral_desc.get_rolloff().iter())
418        {
419            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
420        }
421    }
422
423    #[test]
424    fn test_spectral_centroid() {
425        let song = Decoder::decode(Path::new("data/s16_mono_22_5kHz.flac")).unwrap();
426        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
427        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
428            spectral_desc.do_(chunk).unwrap();
429        }
430        // Spectral centroid mean value computed here with phase vocoder before normalization: 1354.2273
431        // Essential value with spectrum / hann window: 1351
432        let expected_values = [-0.75483, -0.879_168_87];
433        for (expected, actual) in expected_values
434            .iter()
435            .zip(spectral_desc.get_centroid().iter())
436        {
437            assert!(
438                0.0001 > (expected - actual).abs(),
439                "{expected} !~= {actual}"
440            );
441        }
442    }
443
444    #[test]
445    fn test_spectral_centroid_boundaries() {
446        let mut spectral_desc = SpectralDesc::new(10).unwrap();
447        let chunk = vec![0.; 512];
448
449        spectral_desc.do_(&chunk).unwrap();
450        let expected_values = [-1., -1.];
451        for (expected, actual) in expected_values
452            .iter()
453            .zip(spectral_desc.get_centroid().iter())
454        {
455            assert!(
456                0.000_000_1 > (expected - actual).abs(),
457                "{expected} !~= {actual}"
458            );
459        }
460        let song = Decoder::decode(Path::new("data/tone_11080Hz.flac")).unwrap();
461        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
462        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
463            spectral_desc.do_(chunk).unwrap();
464        }
465        let expected_values = [0.97266, -0.960_992_6];
466        for (expected, actual) in expected_values
467            .iter()
468            .zip(spectral_desc.get_centroid().iter())
469        {
470            // original test wanted absolute error < 0.00001
471            // assert!(0.00001 > (expected - actual).abs(), "{expected} !~= {actual}");
472            let relative_error = (expected - actual).abs() / expected.abs();
473            assert!(
474                relative_error < 0.039,
475                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
476            );
477        }
478    }
479}