Skip to main content

mecomp_analysis/
timbral.rs

1//! Timbral feature extraction module.
2//!
3//! Contains functions to extract & summarize the zero-crossing rate,
4//! spectral centroid, spectral flatness and spectral roll-off of
5//! a given Song.
6
7use bliss_audio_aubio_rs::vec::CVec;
8use bliss_audio_aubio_rs::{PVoc, SpecDesc, SpecShape, bin_to_freq};
9use likely_stable::{LikelyResult, unlikely};
10use ndarray::{Axis, arr1};
11
12use crate::Feature;
13
14use super::SAMPLE_RATE;
15use super::errors::{AnalysisError, AnalysisResult};
16use super::utils::{Normalize, geometric_mean, mean, number_crossings};
17
18/**
19 * General object holding all the spectral descriptor.
20 *
21 * Holds 3 spectral descriptors together. It would be better conceptually
22 * to have 3 different spectral descriptor objects, but this avoids re-computing
23 * the same FFT three times.
24 *
25 * Current spectral descriptors are spectral centroid, spectral rolloff and
26 * spectral flatness (see `values_object` for a further description of the
27 * object.
28 *
29 * All descriptors are currently summarized by their mean only.
30 */
31pub struct SpectralDesc {
32    // reusable fft buffer
33    fftgrain: Vec<f32>,
34
35    phase_vocoder: PVoc,
36    sample_rate: u32,
37
38    centroid_aubio_desc: SpecDesc,
39    rolloff_aubio_desc: SpecDesc,
40    values_centroid: Vec<f32>,
41    values_rolloff: Vec<f32>,
42    values_flatness: Vec<f32>,
43}
44
45impl SpectralDesc {
46    pub const WINDOW_SIZE: usize = 512;
47    pub const HOP_SIZE: usize = Self::WINDOW_SIZE / 4;
48
49    /**
50     * Compute score related to the
51     * [spectral centroid](https://en.wikipedia.org/wiki/Spectral_centroid) values,
52     * obtained after repeatedly calling `do_` on all of the song's chunks.
53     *
54     * Spectral centroid is used to determine the "brightness" of a sound, i.e.
55     * how much high frequency there is in an audio signal.
56     *
57     * It of course depends of the instrument used: a piano-only track that makes
58     * use of high frequencies will still score less than a song using a lot of
59     * percussive sound, because the piano frequency range is lower.
60     *
61     * The value range is between 0 and `sample_rate / 2`.
62     */
63    #[inline]
64    pub fn get_centroid(&mut self) -> [Feature; 2] {
65        [
66            self.normalize(mean(&self.values_centroid)),
67            self.normalize(
68                arr1(&self.values_centroid)
69                    .std_axis(Axis(0), 0.)
70                    .into_scalar(),
71            ),
72        ]
73    }
74
75    /**
76     * Compute score related to the spectral roll-off values, obtained
77     * after repeatedly calling `do_` on all of the song's chunks.
78     *
79     * Spectral roll-off is the bin frequency number below which a certain
80     * percentage of the spectral energy is found, here, 95%.
81     *
82     * It can be used to distinguish voiced speech (low roll-off) and unvoiced
83     * speech (high roll-off). It is also a good indication of the energy
84     * repartition of a song.
85     *
86     * The value range is between 0 and `sample_rate / 2`
87     */
88    #[inline]
89    pub fn get_rolloff(&mut self) -> [Feature; 2] {
90        [
91            self.normalize(mean(&self.values_rolloff)),
92            self.normalize(
93                arr1(&self.values_rolloff)
94                    .std_axis(Axis(0), 0.)
95                    .into_scalar(),
96            ),
97        ]
98    }
99
100    /**
101     * Compute score related to the
102     * [spectral flatness](https://en.wikipedia.org/wiki/Spectral_flatness) values,
103     * obtained after repeatedly calling `do_` on all of the song's chunks.
104     *
105     * Spectral flatness is the ratio between the geometric mean of the spectrum
106     * and its arithmetic mean.
107     *
108     * It is used to distinguish between tone-like and noise-like signals.
109     * Tone-like audio is f.ex. a piano key, something that has one or more
110     * specific frequencies, while (white) noise has an equal distribution
111     * of intensity among all frequencies.
112     *
113     * The value range is between 0 and 1, since the geometric mean is always less
114     * than the arithmetic mean.
115     */
116    #[inline]
117    pub fn get_flatness(&mut self) -> [Feature; 2] {
118        let max_value = 1.;
119        let min_value = 0.;
120        // Range is different from the other spectral algorithms, so normalizing
121        // manually here.
122        [
123            2. * (mean(&self.values_flatness) - min_value) / (max_value - min_value) - 1.,
124            2. * (arr1(&self.values_flatness)
125                .std_axis(Axis(0), 0.)
126                .into_scalar()
127                - min_value)
128                / (max_value - min_value)
129                - 1.,
130        ]
131    }
132
133    /// # Errors
134    ///
135    /// This function will return an error if there is an error loading the aubio objects
136    #[inline]
137    pub fn new(sample_rate: u32) -> AnalysisResult<Self> {
138        Ok(Self {
139            fftgrain: vec![0.0; Self::WINDOW_SIZE],
140
141            centroid_aubio_desc: SpecDesc::new(SpecShape::Centroid, Self::WINDOW_SIZE)
142                .map_err_unlikely(|e| {
143                    AnalysisError::AnalysisError(format!(
144                        "error while loading aubio centroid object: {e}",
145                    ))
146                })?,
147            rolloff_aubio_desc: SpecDesc::new(SpecShape::Rolloff, Self::WINDOW_SIZE)
148                .map_err_unlikely(|e| {
149                    AnalysisError::AnalysisError(format!(
150                        "error while loading aubio rolloff object: {e}",
151                    ))
152                })?,
153            phase_vocoder: PVoc::new(Self::WINDOW_SIZE, Self::HOP_SIZE).map_err_unlikely(|e| {
154                AnalysisError::AnalysisError(format!("error while loading aubio pvoc object: {e}",))
155            })?,
156            values_centroid: Vec::new(),
157            values_rolloff: Vec::new(),
158            values_flatness: Vec::new(),
159            sample_rate,
160        })
161    }
162
163    /**
164    Compute all the descriptors' value for the given chunk.
165
166    After using this on all the song's chunks, you can call
167    `get_centroid`, `get_flatness` and `get_rolloff` to get the respective
168    descriptors' values.
169    */
170    #[allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]
171    #[allow(clippy::missing_inline_in_public_items)]
172    pub fn do_(&mut self, chunk: &[f32]) -> AnalysisResult<()> {
173        self.phase_vocoder
174            .do_(chunk, self.fftgrain.as_mut_slice())
175            .map_err_unlikely(|e| {
176                AnalysisError::AnalysisError(format!("error while processing aubio pv object: {e}"))
177            })?;
178
179        let bin = self
180            .centroid_aubio_desc
181            .do_result(self.fftgrain.as_slice())
182            .map_err_unlikely(|e| {
183                AnalysisError::AnalysisError(format!(
184                    "error while processing aubio centroid object: {e}",
185                ))
186            })?;
187
188        #[allow(clippy::cast_precision_loss)]
189        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
190        self.values_centroid.push(freq);
191
192        let mut bin = self
193            .rolloff_aubio_desc
194            .do_result(self.fftgrain.as_slice())
195            .map_err_unlikely(|e| {
196                AnalysisError::AnalysisError(format!(
197                    "error while processing aubio rolloff object: {e}",
198                ))
199            })?;
200
201        // Until https://github.com/aubio/aubio/pull/318 is in
202        #[allow(clippy::cast_precision_loss)]
203        if unlikely(bin > Self::WINDOW_SIZE as f32 / 2.) {
204            // only happens on the last chunk
205            bin = Self::WINDOW_SIZE as f32 / 2.;
206        }
207
208        #[allow(clippy::cast_precision_loss)]
209        let freq = bin_to_freq(bin, self.sample_rate as f32, Self::WINDOW_SIZE as f32);
210        self.values_rolloff.push(freq);
211
212        let cvec: CVec<'_> = self.fftgrain.as_slice().into();
213        let norm = cvec.norm();
214        let geo_mean = geometric_mean(norm);
215        let arithmetic_mean = mean(norm);
216
217        // Avoid NaN from division by zero (silent audio or both means are zero)
218        if unlikely(geo_mean == 0.0 || arithmetic_mean == 0.0) {
219            self.values_flatness.push(0.0);
220            return Ok(());
221        }
222        let flatness = geo_mean / arithmetic_mean;
223        self.values_flatness.push(flatness);
224        Ok(())
225    }
226}
227
228impl Normalize for SpectralDesc {
229    #[allow(clippy::cast_precision_loss)]
230    const MAX_VALUE: Feature = SAMPLE_RATE as Feature / 2.;
231    const MIN_VALUE: Feature = 0.;
232}
233
234/**
235 * [Zero-crossing rate](https://en.wikipedia.org/wiki/Zero-crossing_rate)
236 * detection object.
237 *
238 * Zero-crossing rate is mostly used to detect percussive sounds in an audio
239 * signal, as well as whether an audio signal contains speech or not.
240 *
241 * It is a good metric to differentiate between songs with people speaking clearly,
242 * (e.g. slam) and instrumental songs.
243 *
244 * The value range is between 0 and 1.
245 */
246#[derive(Default, Clone)]
247pub struct ZeroCrossingRateDesc {
248    crossings_sum: usize,
249    samples_checked: usize,
250}
251
252impl ZeroCrossingRateDesc {
253    #[must_use]
254    #[inline]
255    pub fn new(_sample_rate: u32) -> Self {
256        Self::default()
257    }
258
259    /// Count the number of zero-crossings for the current `chunk`.
260    #[inline]
261    pub fn do_(&mut self, chunk: &[f32]) {
262        self.crossings_sum += number_crossings(chunk);
263        self.samples_checked += chunk.len();
264    }
265
266    /// Sum the number of zero-crossings witnessed and divide by
267    /// the total number of samples.
268    #[allow(clippy::cast_precision_loss)]
269    #[inline]
270    pub fn get_value(&mut self) -> Feature {
271        self.normalize(self.crossings_sum as Feature / self.samples_checked as Feature)
272    }
273}
274
275impl Normalize for ZeroCrossingRateDesc {
276    const MAX_VALUE: Feature = 1.;
277    const MIN_VALUE: Feature = 0.;
278}
279
280#[cfg(test)]
281mod tests {
282    use super::*;
283    use crate::decoder::{Decoder as DecoderTrait, MecompDecoder as Decoder};
284    use std::path::Path;
285
286    #[test]
287    fn test_zcr_boundaries() {
288        let mut zcr_desc = ZeroCrossingRateDesc::default();
289        let chunk = vec![0.; 1024];
290        zcr_desc.do_(&chunk);
291        let value = zcr_desc.get_value();
292        assert!(f32::EPSILON > (-1. - value).abs(), "{value} !~= -1");
293
294        let one_chunk = [-1., 1.];
295        let chunks = std::iter::repeat_n(one_chunk.iter(), 512)
296            .flatten()
297            .copied()
298            .collect::<Vec<f32>>();
299        let mut zcr_desc = ZeroCrossingRateDesc::default();
300        zcr_desc.do_(&chunks);
301        let value = zcr_desc.get_value();
302        assert!(0.001 > (0.998_046_9 - value).abs(), "{value} !~= 0.9980469");
303    }
304
305    #[test]
306    fn test_zcr() {
307        let song = Decoder::new()
308            .unwrap()
309            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
310            .unwrap();
311        let mut zcr_desc = ZeroCrossingRateDesc::default();
312        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
313            zcr_desc.do_(chunk);
314        }
315        let value = zcr_desc.get_value();
316        assert!(0.001 > (-0.85036 - value).abs(), "{value} !~= -0.85036");
317    }
318
319    #[test]
320    fn test_spectral_flatness_boundaries() {
321        let mut spectral_desc = SpectralDesc::new(10).unwrap();
322        let chunk = vec![0.; 1024];
323
324        let expected_values = [-1., -1.];
325        spectral_desc.do_(&chunk).unwrap();
326        for (expected, actual) in expected_values
327            .iter()
328            .zip(spectral_desc.get_flatness().iter())
329        {
330            assert!(
331                0.000_000_1 > (expected - actual).abs(),
332                "{expected} !~= {actual}"
333            );
334        }
335
336        let song = Decoder::new()
337            .unwrap()
338            .decode(Path::new("data/white_noise.mp3"))
339            .unwrap();
340        let mut spectral_desc = SpectralDesc::new(22050).unwrap();
341        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
342            spectral_desc.do_(chunk).unwrap();
343        }
344        println!("{:?}", spectral_desc.get_flatness());
345        // White noise - as close to 1 as possible
346        let expected_values = [0.578_530_3, -0.942_630_8];
347        for (expected, actual) in expected_values
348            .iter()
349            .zip(spectral_desc.get_flatness().iter())
350        {
351            // original test wanted absolute error < 0.001
352            // assert!(0.001 > (expected - actual).abs(), "{expected} !~= {actual}");
353            let relative_error = (expected - actual).abs() / expected.abs();
354            assert!(
355                relative_error < 0.078,
356                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
357            );
358        }
359    }
360
361    #[test]
362    fn test_spectral_flatness() {
363        let song = Decoder::new()
364            .unwrap()
365            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
366            .unwrap();
367        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
368        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
369            spectral_desc.do_(chunk).unwrap();
370        }
371        // Spectral flatness mean value computed here with phase vocoder before normalization: 0.111949615
372        // Essentia value with spectrum / hann window: 0.11197535695207445
373        let expected_values = [-0.776_100_75, -0.814_817_9];
374        for (expected, actual) in expected_values
375            .iter()
376            .zip(spectral_desc.get_flatness().iter())
377        {
378            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
379        }
380    }
381
382    #[test]
383    fn test_spectral_roll_off_boundaries() {
384        let mut spectral_desc = SpectralDesc::new(10).unwrap();
385        let chunk = vec![0.; 512];
386
387        let expected_values = [-1., -1.];
388        spectral_desc.do_(&chunk).unwrap();
389        for (expected, actual) in expected_values
390            .iter()
391            .zip(spectral_desc.get_rolloff().iter())
392        {
393            assert!(
394                0.000_000_1 > (expected - actual).abs(),
395                "{expected} !~= {actual}"
396            );
397        }
398
399        let song = Decoder::new()
400            .unwrap()
401            .decode(Path::new("data/tone_11080Hz.flac"))
402            .unwrap();
403        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
404        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
405            spectral_desc.do_(chunk).unwrap();
406        }
407        let expected_values = [0.996_768_1, -0.996_151_75];
408        for (expected, actual) in expected_values
409            .iter()
410            .zip(spectral_desc.get_rolloff().iter())
411        {
412            assert!(
413                0.0001 > (expected - actual).abs(),
414                "{expected} !~= {actual}"
415            );
416        }
417    }
418
419    #[test]
420    fn test_spectral_roll_off() {
421        let song = Decoder::new()
422            .unwrap()
423            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
424            .unwrap();
425        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
426        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
427            spectral_desc.do_(chunk).unwrap();
428        }
429        let expected_values = [-0.632_648_6, -0.726_093_3];
430        // Roll-off mean value computed here with phase vocoder before normalization: 2026.7644
431        // Essentia value with spectrum / hann window: 1979.632683520047
432        for (expected, actual) in expected_values
433            .iter()
434            .zip(spectral_desc.get_rolloff().iter())
435        {
436            assert!(0.01 > (expected - actual).abs(), "{expected} !~= {actual}");
437        }
438    }
439
440    #[test]
441    fn test_spectral_centroid() {
442        let song = Decoder::new()
443            .unwrap()
444            .decode(Path::new("data/s16_mono_22_5kHz.flac"))
445            .unwrap();
446        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
447        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
448            spectral_desc.do_(chunk).unwrap();
449        }
450        // Spectral centroid mean value computed here with phase vocoder before normalization: 1354.2273
451        // Essential value with spectrum / hann window: 1351
452        let expected_values = [-0.75483, -0.879_168_87];
453        for (expected, actual) in expected_values
454            .iter()
455            .zip(spectral_desc.get_centroid().iter())
456        {
457            assert!(
458                0.0001 > (expected - actual).abs(),
459                "{expected} !~= {actual}"
460            );
461        }
462    }
463
464    #[test]
465    fn test_spectral_centroid_boundaries() {
466        let mut spectral_desc = SpectralDesc::new(10).unwrap();
467        let chunk = vec![0.; 512];
468
469        spectral_desc.do_(&chunk).unwrap();
470        let expected_values = [-1., -1.];
471        for (expected, actual) in expected_values
472            .iter()
473            .zip(spectral_desc.get_centroid().iter())
474        {
475            assert!(
476                0.000_000_1 > (expected - actual).abs(),
477                "{expected} !~= {actual}"
478            );
479        }
480        let song = Decoder::new()
481            .unwrap()
482            .decode(Path::new("data/tone_11080Hz.flac"))
483            .unwrap();
484        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
485        for chunk in song.samples.chunks_exact(SpectralDesc::HOP_SIZE) {
486            spectral_desc.do_(chunk).unwrap();
487        }
488        let expected_values = [0.97266, -0.960_992_6];
489        for (expected, actual) in expected_values
490            .iter()
491            .zip(spectral_desc.get_centroid().iter())
492        {
493            // original test wanted absolute error < 0.00001
494            // assert!(0.00001 > (expected - actual).abs(), "{expected} !~= {actual}");
495            let relative_error = (expected - actual).abs() / expected.abs();
496            assert!(
497                relative_error < 0.039,
498                "relative error: {relative_error}, expected: {expected}, actual: {actual}"
499            );
500        }
501    }
502}