mecomp_analysis/
timbral.rs

1//! Timbral feature extraction module.
2//!
3//! Contains functions to extract & summarize the zero-crossing rate,
4//! spectral centroid, spectral flatness and spectral roll-off of
5//! a given Song.
6
7use bliss_audio_aubio_rs::vec::CVec;
8use bliss_audio_aubio_rs::{PVoc, SpecDesc, SpecShape, bin_to_freq};
9use ndarray::{Axis, arr1};
10
11use crate::Feature;
12
13use super::SAMPLE_RATE;
14use super::errors::{AnalysisError, AnalysisResult};
15use super::utils::{Normalize, geometric_mean, mean, number_crossings};
16
17/**
18 * General object holding all the spectral descriptor.
19 *
20 * Holds 3 spectral descriptors together. It would be better conceptually
21 * to have 3 different spectral descriptor objects, but this avoids re-computing
22 * the same FFT three times.
23 *
24 * Current spectral descriptors are spectral centroid, spectral rolloff and
25 * spectral flatness (see `values_object` for a further description of the
26 * object.
27 *
28 * All descriptors are currently summarized by their mean only.
29 */
30pub struct SpectralDesc {
31    phase_vocoder: PVoc,
32    sample_rate: u32,
33
34    centroid_aubio_desc: SpecDesc,
35    rolloff_aubio_desc: SpecDesc,
36    values_centroid: Vec<f32>,
37    values_rolloff: Vec<f32>,
38    values_flatness: Vec<f32>,
39}
40
41impl SpectralDesc {
42    pub const WINDOW_SIZE: usize = 512;
43    pub const HOP_SIZE: usize = Self::WINDOW_SIZE / 4;
44
45    /**
46     * Compute score related to the
47     * [spectral centroid](https://en.wikipedia.org/wiki/Spectral_centroid) values,
48     * obtained after repeatedly calling `do_` on all of the song's chunks.
49     *
50     * Spectral centroid is used to determine the "brightness" of a sound, i.e.
51     * how much high frequency there is in an audio signal.
52     *
53     * It of course depends of the instrument used: a piano-only track that makes
54     * use of high frequencies will still score less than a song using a lot of
55     * percussive sound, because the piano frequency range is lower.
56     *
57     * The value range is between 0 and `sample_rate / 2`.
58     */
59    #[inline]
60    pub fn get_centroid(&mut self) -> Vec<Feature> {
61        vec![
62            self.normalize(Feature::from(mean(&self.values_centroid))),
63            self.normalize(Feature::from(
64                arr1(&self.values_centroid)
65                    .std_axis(Axis(0), 0.)
66                    .into_scalar(),
67            )),
68        ]
69    }
70
71    /**
72     * Compute score related to the spectral roll-off values, obtained
73     * after repeatedly calling `do_` on all of the song's chunks.
74     *
75     * Spectral roll-off is the bin frequency number below which a certain
76     * percentage of the spectral energy is found, here, 95%.
77     *
78     * It can be used to distinguish voiced speech (low roll-off) and unvoiced
79     * speech (high roll-off). It is also a good indication of the energy
80     * repartition of a song.
81     *
82     * The value range is between 0 and `sample_rate / 2`
83     */
84    #[inline]
85    pub fn get_rolloff(&mut self) -> Vec<Feature> {
86        vec![
87            self.normalize(Feature::from(mean(&self.values_rolloff))),
88            self.normalize(Feature::from(
89                arr1(&self.values_rolloff)
90                    .std_axis(Axis(0), 0.)
91                    .into_scalar(),
92            )),
93        ]
94    }
95
96    /**
97     * Compute score related to the
98     * [spectral flatness](https://en.wikipedia.org/wiki/Spectral_flatness) values,
99     * obtained after repeatedly calling `do_` on all of the song's chunks.
100     *
101     * Spectral flatness is the ratio between the geometric mean of the spectrum
102     * and its arithmetic mean.
103     *
104     * It is used to distinguish between tone-like and noise-like signals.
105     * Tone-like audio is f.ex. a piano key, something that has one or more
106     * specific frequencies, while (white) noise has an equal distribution
107     * of intensity among all frequencies.
108     *
109     * The value range is between 0 and 1, since the geometric mean is always less
110     * than the arithmetic mean.
111     */
112    #[inline]
113    pub fn get_flatness(&mut self) -> Vec<Feature> {
114        let max_value = 1.;
115        let min_value = 0.;
116        // Range is different from the other spectral algorithms, so normalizing
117        // manually here.
118        vec![
119            2. * (Feature::from(mean(&self.values_flatness)) - min_value) / (max_value - min_value)
120                - 1.,
121            2. * (Feature::from(
122                arr1(&self.values_flatness)
123                    .std_axis(Axis(0), 0.)
124                    .into_scalar(),
125            ) - min_value)
126                / (max_value - min_value)
127                - 1.,
128        ]
129    }
130
131    /// # Errors
132    ///
133    /// This function will return an error if there is an error loading the aubio objects
134    #[inline]
135    pub fn new(sample_rate: u32) -> AnalysisResult<Self> {
136        Ok(Self {
137            centroid_aubio_desc: SpecDesc::new(SpecShape::Centroid, Self::WINDOW_SIZE).map_err(
138                |e| {
139                    AnalysisError::AnalysisError(format!(
140                        "error while loading aubio centroid object: {e}",
141                    ))
142                },
143            )?,
144            rolloff_aubio_desc: SpecDesc::new(SpecShape::Rolloff, Self::WINDOW_SIZE).map_err(
145                |e| {
146                    AnalysisError::AnalysisError(format!(
147                        "error while loading aubio rolloff object: {e}",
148                    ))
149                },
150            )?,
151            phase_vocoder: PVoc::new(Self::WINDOW_SIZE, Self::HOP_SIZE).map_err(|e| {
152                AnalysisError::AnalysisError(format!("error while loading aubio pvoc object: {e}",))
153            })?,
154            values_centroid: Vec::new(),
155            values_rolloff: Vec::new(),
156            values_flatness: Vec::new(),
157            sample_rate,
158        })
159    }
160
161    /**
162    Compute all the descriptors' value for the given chunk.
163
164    After using this on all the song's chunks, you can call
165    `get_centroid`, `get_flatness` and `get_rolloff` to get the respective
166    descriptors' values.
167    */
168    #[allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]
169    #[allow(clippy::missing_inline_in_public_items)]
170    pub fn do_(&mut self, chunk: &[f32]) -> AnalysisResult<()> {
171        let mut fftgrain: Vec<f32> = vec![0.0; Self::WINDOW_SIZE];
172        self.phase_vocoder
173            .do_(chunk, fftgrain.as_mut_slice())
174            .map_err(|e| {
175                AnalysisError::AnalysisError(format!("error while processing aubio pv object: {e}"))
176            })?;
177
178        let bin = self
179            .centroid_aubio_desc
180            .do_result(fftgrain.as_slice())
181            .map_err(|e| {
182                AnalysisError::AnalysisError(format!(
183                    "error while processing aubio centroid object: {e}",
184                ))
185            })?;
186
187        #[allow(clippy::cast_precision_loss)]
188        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
189        self.values_centroid.push(freq);
190
191        let mut bin = self
192            .rolloff_aubio_desc
193            .do_result(fftgrain.as_slice())
194            .unwrap();
195
196        // Until https://github.com/aubio/aubio/pull/318 is in
197        #[allow(clippy::cast_precision_loss)]
198        if bin > Self::WINDOW_SIZE as f32 / 2. {
199            bin = Self::WINDOW_SIZE as f32 / 2.;
200        }
201
202        #[allow(clippy::cast_precision_loss)]
203        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
204        self.values_rolloff.push(freq);
205
206        let cvec: CVec = fftgrain.as_slice().into();
207        let geo_mean = geometric_mean(cvec.norm());
208        if geo_mean == 0.0 {
209            self.values_flatness.push(0.0);
210            return Ok(());
211        }
212        let flatness = geo_mean / mean(cvec.norm());
213        self.values_flatness.push(flatness);
214        Ok(())
215    }
216}
217
218impl Normalize for SpectralDesc {
219    #[allow(clippy::cast_precision_loss)]
220    const MAX_VALUE: Feature = SAMPLE_RATE as Feature / 2.;
221    const MIN_VALUE: Feature = 0.;
222}
223
224/**
225 * [Zero-crossing rate](https://en.wikipedia.org/wiki/Zero-crossing_rate)
226 * detection object.
227 *
228 * Zero-crossing rate is mostly used to detect percussive sounds in an audio
229 * signal, as well as whether an audio signal contains speech or not.
230 *
231 * It is a good metric to differentiate between songs with people speaking clearly,
232 * (e.g. slam) and instrumental songs.
233 *
234 * The value range is between 0 and 1.
235 */
236#[derive(Default, Clone)]
237pub struct ZeroCrossingRateDesc {
238    crossings_sum: u32,
239    samples_checked: usize,
240}
241
242impl ZeroCrossingRateDesc {
243    #[must_use]
244    #[inline]
245    pub fn new(_sample_rate: u32) -> Self {
246        Self::default()
247    }
248
249    /// Count the number of zero-crossings for the current `chunk`.
250    #[inline]
251    pub fn do_(&mut self, chunk: &[f32]) {
252        self.crossings_sum += number_crossings(chunk);
253        self.samples_checked += chunk.len();
254    }
255
256    /// Sum the number of zero-crossings witnessed and divide by
257    /// the total number of samples.
258    #[allow(clippy::cast_precision_loss)]
259    #[inline]
260    pub fn get_value(&mut self) -> Feature {
261        self.normalize(Feature::from(self.crossings_sum) / self.samples_checked as Feature)
262    }
263}
264
265impl Normalize for ZeroCrossingRateDesc {
266    const MAX_VALUE: Feature = 1.;
267    const MIN_VALUE: Feature = 0.;
268}
269
270#[cfg(test)]
271mod tests {
272    use super::*;
273    use crate::decoder::{Decoder as DecoderTrait, MecompDecoder as Decoder};
274    use std::path::Path;
275
276    #[test]
277    fn test_zcr_boundaries() {
278        let mut zcr_desc = ZeroCrossingRateDesc::default();
279        let chunk = vec![0.; 1024];
280        zcr_desc.do_(&chunk);
281        let value = zcr_desc.get_value();
282        assert!(f64::EPSILON > (-1. - value).abs(), "{value} !~= -1");
283
284        let one_chunk = [-1., 1.];
285        let chunks = std::iter::repeat(one_chunk.iter())
286            .take(512)
287            .flatten()
288            .copied()
289            .collect::<Vec<f32>>();
290        let mut zcr_desc = ZeroCrossingRateDesc::default();
291        zcr_desc.do_(&chunks);
292        let value = zcr_desc.get_value();
293        assert!(0.001 > (0.998_046_9 - value).abs(), "{value} !~= 0.9980469");
294    }
295
296    #[test]
297    fn test_zcr() {
298        let song = Decoder::new()
299            .unwrap()
300            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
301            .unwrap();
302        let mut zcr_desc = ZeroCrossingRateDesc::default();
303        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
304            zcr_desc.do_(chunk);
305        }
306        let value = zcr_desc.get_value();
307        assert!(0.001 > (-0.85036 - value).abs(), "{value} !~= -0.85036");
308    }
309
310    #[test]
311    fn test_spectral_flatness_boundaries() {
312        let mut spectral_desc = SpectralDesc::new(10).unwrap();
313        let chunk = vec![0.; 1024];
314
315        let expected_values = [-1., -1.];
316        spectral_desc.do_(&chunk).unwrap();
317        for (expected, actual) in expected_values
318            .iter()
319            .zip(spectral_desc.get_flatness().iter())
320        {
321            assert!(
322                0.000_000_1 > (expected - actual).abs(),
323                "{expected} !~= {actual}"
324            );
325        }
326
327        let song = Decoder::new()
328            .unwrap()
329            .decode(Path::new("data/white_noise.mp3"))
330            .unwrap();
331        let mut spectral_desc = SpectralDesc::new(22050).unwrap();
332        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
333            spectral_desc.do_(chunk).unwrap();
334        }
335        println!("{:?}", spectral_desc.get_flatness());
336        // White noise - as close to 1 as possible
337        let expected_values = [0.578_530_3, -0.942_630_8];
338        for (expected, actual) in expected_values
339            .iter()
340            .zip(spectral_desc.get_flatness().iter())
341        {
342            // original test wanted absolute error < 0.001
343            // assert!(0.001 > (expected - actual).abs(), "{expected} !~= {actual}");
344            let relative_error = (expected - actual).abs() / expected.abs();
345            assert!(
346                relative_error < 0.078,
347                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
348            );
349        }
350    }
351
352    #[test]
353    fn test_spectral_flatness() {
354        let song = Decoder::new()
355            .unwrap()
356            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
357            .unwrap();
358        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
359        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
360            spectral_desc.do_(chunk).unwrap();
361        }
362        // Spectral flatness mean value computed here with phase vocoder before normalization: 0.111949615
363        // Essentia value with spectrum / hann window: 0.11197535695207445
364        let expected_values = [-0.776_100_75, -0.814_817_9];
365        for (expected, actual) in expected_values
366            .iter()
367            .zip(spectral_desc.get_flatness().iter())
368        {
369            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
370        }
371    }
372
373    #[test]
374    fn test_spectral_roll_off_boundaries() {
375        let mut spectral_desc = SpectralDesc::new(10).unwrap();
376        let chunk = vec![0.; 512];
377
378        let expected_values = [-1., -1.];
379        spectral_desc.do_(&chunk).unwrap();
380        for (expected, actual) in expected_values
381            .iter()
382            .zip(spectral_desc.get_rolloff().iter())
383        {
384            assert!(
385                0.000_000_1 > (expected - actual).abs(),
386                "{expected} !~= {actual}"
387            );
388        }
389
390        let song = Decoder::new()
391            .unwrap()
392            .decode(Path::new("data/tone_11080Hz.flac"))
393            .unwrap();
394        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
395        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
396            spectral_desc.do_(chunk).unwrap();
397        }
398        let expected_values = [0.996_768_1, -0.996_151_75];
399        for (expected, actual) in expected_values
400            .iter()
401            .zip(spectral_desc.get_rolloff().iter())
402        {
403            assert!(
404                0.0001 > (expected - actual).abs(),
405                "{expected} !~= {actual}"
406            );
407        }
408    }
409
410    #[test]
411    fn test_spectral_roll_off() {
412        let song = Decoder::new()
413            .unwrap()
414            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
415            .unwrap();
416        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
417        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
418            spectral_desc.do_(chunk).unwrap();
419        }
420        let expected_values = [-0.632_648_6, -0.726_093_3];
421        // Roll-off mean value computed here with phase vocoder before normalization: 2026.7644
422        // Essentia value with spectrum / hann window: 1979.632683520047
423        for (expected, actual) in expected_values
424            .iter()
425            .zip(spectral_desc.get_rolloff().iter())
426        {
427            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
428        }
429    }
430
431    #[test]
432    fn test_spectral_centroid() {
433        let song = Decoder::new()
434            .unwrap()
435            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
436            .unwrap();
437        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
438        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
439            spectral_desc.do_(chunk).unwrap();
440        }
441        // Spectral centroid mean value computed here with phase vocoder before normalization: 1354.2273
442        // Essential value with spectrum / hann window: 1351
443        let expected_values = [-0.75483, -0.879_168_87];
444        for (expected, actual) in expected_values
445            .iter()
446            .zip(spectral_desc.get_centroid().iter())
447        {
448            assert!(
449                0.0001 > (expected - actual).abs(),
450                "{expected} !~= {actual}"
451            );
452        }
453    }
454
455    #[test]
456    fn test_spectral_centroid_boundaries() {
457        let mut spectral_desc = SpectralDesc::new(10).unwrap();
458        let chunk = vec![0.; 512];
459
460        spectral_desc.do_(&chunk).unwrap();
461        let expected_values = [-1., -1.];
462        for (expected, actual) in expected_values
463            .iter()
464            .zip(spectral_desc.get_centroid().iter())
465        {
466            assert!(
467                0.000_000_1 > (expected - actual).abs(),
468                "{expected} !~= {actual}"
469            );
470        }
471        let song = Decoder::new()
472            .unwrap()
473            .decode(Path::new("data/tone_11080Hz.flac"))
474            .unwrap();
475        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
476        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
477            spectral_desc.do_(chunk).unwrap();
478        }
479        let expected_values = [0.97266, -0.960_992_6];
480        for (expected, actual) in expected_values
481            .iter()
482            .zip(spectral_desc.get_centroid().iter())
483        {
484            // original test wanted absolute error < 0.00001
485            // assert!(0.00001 > (expected - actual).abs(), "{expected} !~= {actual}");
486            let relative_error = (expected - actual).abs() / expected.abs();
487            assert!(
488                relative_error < 0.039,
489                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
490            );
491        }
492    }
493}
mecomp_analysis/timbral.rs

mecomp_analysis/
timbral.rs