mecomp_analysis/
timbral.rs

1//! Timbral feature extraction module.
2//!
3//! Contains functions to extract & summarize the zero-crossing rate,
4//! spectral centroid, spectral flatness and spectral roll-off of
5//! a given Song.
6
7use bliss_audio_aubio_rs::vec::CVec;
8use bliss_audio_aubio_rs::{PVoc, SpecDesc, SpecShape, bin_to_freq};
9use likely_stable::{LikelyResult, unlikely};
10use ndarray::{Axis, arr1};
11
12use crate::Feature;
13
14use super::SAMPLE_RATE;
15use super::errors::{AnalysisError, AnalysisResult};
16use super::utils::{Normalize, geometric_mean, mean, number_crossings};
17
18/**
19 * General object holding all the spectral descriptor.
20 *
21 * Holds 3 spectral descriptors together. It would be better conceptually
22 * to have 3 different spectral descriptor objects, but this avoids re-computing
23 * the same FFT three times.
24 *
25 * Current spectral descriptors are spectral centroid, spectral rolloff and
26 * spectral flatness (see `values_object` for a further description of the
27 * object.
28 *
29 * All descriptors are currently summarized by their mean only.
30 */
31pub struct SpectralDesc {
32    // reusable fft buffer
33    fftgrain: Vec<f32>,
34
35    phase_vocoder: PVoc,
36    sample_rate: u32,
37
38    centroid_aubio_desc: SpecDesc,
39    rolloff_aubio_desc: SpecDesc,
40    values_centroid: Vec<f32>,
41    values_rolloff: Vec<f32>,
42    values_flatness: Vec<f32>,
43}
44
45impl SpectralDesc {
46    pub const WINDOW_SIZE: usize = 512;
47    pub const HOP_SIZE: usize = Self::WINDOW_SIZE / 4;
48
49    /**
50     * Compute score related to the
51     * [spectral centroid](https://en.wikipedia.org/wiki/Spectral_centroid) values,
52     * obtained after repeatedly calling `do_` on all of the song's chunks.
53     *
54     * Spectral centroid is used to determine the "brightness" of a sound, i.e.
55     * how much high frequency there is in an audio signal.
56     *
57     * It of course depends of the instrument used: a piano-only track that makes
58     * use of high frequencies will still score less than a song using a lot of
59     * percussive sound, because the piano frequency range is lower.
60     *
61     * The value range is between 0 and `sample_rate / 2`.
62     */
63    #[inline]
64    pub fn get_centroid(&mut self) -> [Feature; 2] {
65        [
66            self.normalize(mean(&self.values_centroid)),
67            self.normalize(
68                arr1(&self.values_centroid)
69                    .std_axis(Axis(0), 0.)
70                    .into_scalar(),
71            ),
72        ]
73    }
74
75    /**
76     * Compute score related to the spectral roll-off values, obtained
77     * after repeatedly calling `do_` on all of the song's chunks.
78     *
79     * Spectral roll-off is the bin frequency number below which a certain
80     * percentage of the spectral energy is found, here, 95%.
81     *
82     * It can be used to distinguish voiced speech (low roll-off) and unvoiced
83     * speech (high roll-off). It is also a good indication of the energy
84     * repartition of a song.
85     *
86     * The value range is between 0 and `sample_rate / 2`
87     */
88    #[inline]
89    pub fn get_rolloff(&mut self) -> [Feature; 2] {
90        [
91            self.normalize(mean(&self.values_rolloff)),
92            self.normalize(
93                arr1(&self.values_rolloff)
94                    .std_axis(Axis(0), 0.)
95                    .into_scalar(),
96            ),
97        ]
98    }
99
100    /**
101     * Compute score related to the
102     * [spectral flatness](https://en.wikipedia.org/wiki/Spectral_flatness) values,
103     * obtained after repeatedly calling `do_` on all of the song's chunks.
104     *
105     * Spectral flatness is the ratio between the geometric mean of the spectrum
106     * and its arithmetic mean.
107     *
108     * It is used to distinguish between tone-like and noise-like signals.
109     * Tone-like audio is f.ex. a piano key, something that has one or more
110     * specific frequencies, while (white) noise has an equal distribution
111     * of intensity among all frequencies.
112     *
113     * The value range is between 0 and 1, since the geometric mean is always less
114     * than the arithmetic mean.
115     */
116    #[inline]
117    pub fn get_flatness(&mut self) -> [Feature; 2] {
118        let max_value = 1.;
119        let min_value = 0.;
120        // Range is different from the other spectral algorithms, so normalizing
121        // manually here.
122        [
123            2. * (mean(&self.values_flatness) - min_value) / (max_value - min_value) - 1.,
124            2. * (arr1(&self.values_flatness)
125                .std_axis(Axis(0), 0.)
126                .into_scalar()
127                - min_value)
128                / (max_value - min_value)
129                - 1.,
130        ]
131    }
132
133    /// # Errors
134    ///
135    /// This function will return an error if there is an error loading the aubio objects
136    #[inline]
137    pub fn new(sample_rate: u32) -> AnalysisResult<Self> {
138        Ok(Self {
139            fftgrain: vec![0.0; Self::WINDOW_SIZE],
140
141            centroid_aubio_desc: SpecDesc::new(SpecShape::Centroid, Self::WINDOW_SIZE)
142                .map_err_unlikely(|e| {
143                    AnalysisError::AnalysisError(format!(
144                        "error while loading aubio centroid object: {e}",
145                    ))
146                })?,
147            rolloff_aubio_desc: SpecDesc::new(SpecShape::Rolloff, Self::WINDOW_SIZE)
148                .map_err_unlikely(|e| {
149                    AnalysisError::AnalysisError(format!(
150                        "error while loading aubio rolloff object: {e}",
151                    ))
152                })?,
153            phase_vocoder: PVoc::new(Self::WINDOW_SIZE, Self::HOP_SIZE).map_err_unlikely(|e| {
154                AnalysisError::AnalysisError(format!("error while loading aubio pvoc object: {e}",))
155            })?,
156            values_centroid: Vec::new(),
157            values_rolloff: Vec::new(),
158            values_flatness: Vec::new(),
159            sample_rate,
160        })
161    }
162
163    /**
164    Compute all the descriptors' value for the given chunk.
165
166    After using this on all the song's chunks, you can call
167    `get_centroid`, `get_flatness` and `get_rolloff` to get the respective
168    descriptors' values.
169    */
170    #[allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]
171    #[allow(clippy::missing_inline_in_public_items)]
172    pub fn do_(&mut self, chunk: &[f32]) -> AnalysisResult<()> {
173        self.phase_vocoder
174            .do_(chunk, self.fftgrain.as_mut_slice())
175            .map_err_unlikely(|e| {
176                AnalysisError::AnalysisError(format!("error while processing aubio pv object: {e}"))
177            })?;
178
179        let bin = self
180            .centroid_aubio_desc
181            .do_result(self.fftgrain.as_slice())
182            .map_err_unlikely(|e| {
183                AnalysisError::AnalysisError(format!(
184                    "error while processing aubio centroid object: {e}",
185                ))
186            })?;
187
188        #[allow(clippy::cast_precision_loss)]
189        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
190        self.values_centroid.push(freq);
191
192        let mut bin = self
193            .rolloff_aubio_desc
194            .do_result(self.fftgrain.as_slice())
195            .map_err_unlikely(|e| {
196                AnalysisError::AnalysisError(format!(
197                    "error while processing aubio rolloff object: {e}",
198                ))
199            })?;
200
201        // Until https://github.com/aubio/aubio/pull/318 is in
202        #[allow(clippy::cast_precision_loss)]
203        if unlikely(bin > Self::WINDOW_SIZE as f32 / 2.) {
204            // only happens on the last chunk
205            bin = Self::WINDOW_SIZE as f32 / 2.;
206        }
207
208        #[allow(clippy::cast_precision_loss)]
209        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
210        self.values_rolloff.push(freq);
211
212        let cvec: CVec<'_> = self.fftgrain.as_slice().into();
213        let norm = cvec.norm();
214        let geo_mean = geometric_mean(norm);
215        if unlikely(geo_mean == 0.0) {
216            self.values_flatness.push(0.0);
217            return Ok(());
218        }
219        let flatness = geo_mean / mean(norm);
220        self.values_flatness.push(flatness);
221        Ok(())
222    }
223}
224
225impl Normalize for SpectralDesc {
226    #[allow(clippy::cast_precision_loss)]
227    const MAX_VALUE: Feature = SAMPLE_RATE as Feature / 2.;
228    const MIN_VALUE: Feature = 0.;
229}
230
231/**
232 * [Zero-crossing rate](https://en.wikipedia.org/wiki/Zero-crossing_rate)
233 * detection object.
234 *
235 * Zero-crossing rate is mostly used to detect percussive sounds in an audio
236 * signal, as well as whether an audio signal contains speech or not.
237 *
238 * It is a good metric to differentiate between songs with people speaking clearly,
239 * (e.g. slam) and instrumental songs.
240 *
241 * The value range is between 0 and 1.
242 */
243#[derive(Default, Clone)]
244pub struct ZeroCrossingRateDesc {
245    crossings_sum: usize,
246    samples_checked: usize,
247}
248
249impl ZeroCrossingRateDesc {
250    #[must_use]
251    #[inline]
252    pub fn new(_sample_rate: u32) -> Self {
253        Self::default()
254    }
255
256    /// Count the number of zero-crossings for the current `chunk`.
257    #[inline]
258    pub fn do_(&mut self, chunk: &[f32]) {
259        self.crossings_sum += number_crossings(chunk);
260        self.samples_checked += chunk.len();
261    }
262
263    /// Sum the number of zero-crossings witnessed and divide by
264    /// the total number of samples.
265    #[allow(clippy::cast_precision_loss)]
266    #[inline]
267    pub fn get_value(&mut self) -> Feature {
268        self.normalize(self.crossings_sum as Feature / self.samples_checked as Feature)
269    }
270}
271
272impl Normalize for ZeroCrossingRateDesc {
273    const MAX_VALUE: Feature = 1.;
274    const MIN_VALUE: Feature = 0.;
275}
276
277#[cfg(test)]
278mod tests {
279    use super::*;
280    use crate::decoder::{Decoder as DecoderTrait, MecompDecoder as Decoder};
281    use std::path::Path;
282
283    #[test]
284    fn test_zcr_boundaries() {
285        let mut zcr_desc = ZeroCrossingRateDesc::default();
286        let chunk = vec![0.; 1024];
287        zcr_desc.do_(&chunk);
288        let value = zcr_desc.get_value();
289        assert!(f32::EPSILON > (-1. - value).abs(), "{value} !~= -1");
290
291        let one_chunk = [-1., 1.];
292        let chunks = std::iter::repeat_n(one_chunk.iter(), 512)
293            .flatten()
294            .copied()
295            .collect::<Vec<f32>>();
296        let mut zcr_desc = ZeroCrossingRateDesc::default();
297        zcr_desc.do_(&chunks);
298        let value = zcr_desc.get_value();
299        assert!(0.001 > (0.998_046_9 - value).abs(), "{value} !~= 0.9980469");
300    }
301
302    #[test]
303    fn test_zcr() {
304        let song = Decoder::new()
305            .unwrap()
306            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
307            .unwrap();
308        let mut zcr_desc = ZeroCrossingRateDesc::default();
309        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
310            zcr_desc.do_(chunk);
311        }
312        let value = zcr_desc.get_value();
313        assert!(0.001 > (-0.85036 - value).abs(), "{value} !~= -0.85036");
314    }
315
316    #[test]
317    fn test_spectral_flatness_boundaries() {
318        let mut spectral_desc = SpectralDesc::new(10).unwrap();
319        let chunk = vec![0.; 1024];
320
321        let expected_values = [-1., -1.];
322        spectral_desc.do_(&chunk).unwrap();
323        for (expected, actual) in expected_values
324            .iter()
325            .zip(spectral_desc.get_flatness().iter())
326        {
327            assert!(
328                0.000_000_1 > (expected - actual).abs(),
329                "{expected} !~= {actual}"
330            );
331        }
332
333        let song = Decoder::new()
334            .unwrap()
335            .decode(Path::new("data/white_noise.mp3"))
336            .unwrap();
337        let mut spectral_desc = SpectralDesc::new(22050).unwrap();
338        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
339            spectral_desc.do_(chunk).unwrap();
340        }
341        println!("{:?}", spectral_desc.get_flatness());
342        // White noise - as close to 1 as possible
343        let expected_values = [0.578_530_3, -0.942_630_8];
344        for (expected, actual) in expected_values
345            .iter()
346            .zip(spectral_desc.get_flatness().iter())
347        {
348            // original test wanted absolute error < 0.001
349            // assert!(0.001 > (expected - actual).abs(), "{expected} !~= {actual}");
350            let relative_error = (expected - actual).abs() / expected.abs();
351            assert!(
352                relative_error < 0.078,
353                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
354            );
355        }
356    }
357
358    #[test]
359    fn test_spectral_flatness() {
360        let song = Decoder::new()
361            .unwrap()
362            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
363            .unwrap();
364        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
365        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
366            spectral_desc.do_(chunk).unwrap();
367        }
368        // Spectral flatness mean value computed here with phase vocoder before normalization: 0.111949615
369        // Essentia value with spectrum / hann window: 0.11197535695207445
370        let expected_values = [-0.776_100_75, -0.814_817_9];
371        for (expected, actual) in expected_values
372            .iter()
373            .zip(spectral_desc.get_flatness().iter())
374        {
375            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
376        }
377    }
378
379    #[test]
380    fn test_spectral_roll_off_boundaries() {
381        let mut spectral_desc = SpectralDesc::new(10).unwrap();
382        let chunk = vec![0.; 512];
383
384        let expected_values = [-1., -1.];
385        spectral_desc.do_(&chunk).unwrap();
386        for (expected, actual) in expected_values
387            .iter()
388            .zip(spectral_desc.get_rolloff().iter())
389        {
390            assert!(
391                0.000_000_1 > (expected - actual).abs(),
392                "{expected} !~= {actual}"
393            );
394        }
395
396        let song = Decoder::new()
397            .unwrap()
398            .decode(Path::new("data/tone_11080Hz.flac"))
399            .unwrap();
400        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
401        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
402            spectral_desc.do_(chunk).unwrap();
403        }
404        let expected_values = [0.996_768_1, -0.996_151_75];
405        for (expected, actual) in expected_values
406            .iter()
407            .zip(spectral_desc.get_rolloff().iter())
408        {
409            assert!(
410                0.0001 > (expected - actual).abs(),
411                "{expected} !~= {actual}"
412            );
413        }
414    }
415
416    #[test]
417    fn test_spectral_roll_off() {
418        let song = Decoder::new()
419            .unwrap()
420            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
421            .unwrap();
422        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
423        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
424            spectral_desc.do_(chunk).unwrap();
425        }
426        let expected_values = [-0.632_648_6, -0.726_093_3];
427        // Roll-off mean value computed here with phase vocoder before normalization: 2026.7644
428        // Essentia value with spectrum / hann window: 1979.632683520047
429        for (expected, actual) in expected_values
430            .iter()
431            .zip(spectral_desc.get_rolloff().iter())
432        {
433            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
434        }
435    }
436
437    #[test]
438    fn test_spectral_centroid() {
439        let song = Decoder::new()
440            .unwrap()
441            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
442            .unwrap();
443        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
444        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
445            spectral_desc.do_(chunk).unwrap();
446        }
447        // Spectral centroid mean value computed here with phase vocoder before normalization: 1354.2273
448        // Essential value with spectrum / hann window: 1351
449        let expected_values = [-0.75483, -0.879_168_87];
450        for (expected, actual) in expected_values
451            .iter()
452            .zip(spectral_desc.get_centroid().iter())
453        {
454            assert!(
455                0.0001 > (expected - actual).abs(),
456                "{expected} !~= {actual}"
457            );
458        }
459    }
460
461    #[test]
462    fn test_spectral_centroid_boundaries() {
463        let mut spectral_desc = SpectralDesc::new(10).unwrap();
464        let chunk = vec![0.; 512];
465
466        spectral_desc.do_(&chunk).unwrap();
467        let expected_values = [-1., -1.];
468        for (expected, actual) in expected_values
469            .iter()
470            .zip(spectral_desc.get_centroid().iter())
471        {
472            assert!(
473                0.000_000_1 > (expected - actual).abs(),
474                "{expected} !~= {actual}"
475            );
476        }
477        let song = Decoder::new()
478            .unwrap()
479            .decode(Path::new("data/tone_11080Hz.flac"))
480            .unwrap();
481        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
482        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
483            spectral_desc.do_(chunk).unwrap();
484        }
485        let expected_values = [0.97266, -0.960_992_6];
486        for (expected, actual) in expected_values
487            .iter()
488            .zip(spectral_desc.get_centroid().iter())
489        {
490            // original test wanted absolute error < 0.00001
491            // assert!(0.00001 > (expected - actual).abs(), "{expected} !~= {actual}");
492            let relative_error = (expected - actual).abs() / expected.abs();
493            assert!(
494                relative_error < 0.039,
495                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
496            );
497        }
498    }
499}
mecomp_analysis/timbral.rs

mecomp_analysis/
timbral.rs