bliss_rs/
timbral.rs

1//! Timbral feature extraction module.
2//!
3//! Contains functions to extract & summarize the zero-crossing rate,
4//! spectral centroid, spectral flatness and spectral roll-off of
5//! a given Song.
6
7use bliss_audio_aubio_rs::vec::CVec;
8use bliss_audio_aubio_rs::{bin_to_freq, PVoc, SpecDesc, SpecShape};
9use ndarray::{arr1, Axis};
10
11use super::utils::{geometric_mean, mean, number_crossings, Normalize};
12use crate::{BlissError, SAMPLE_RATE};
13
14/**
15 * General object holding all the spectral descriptor.
16 *
17 * Holds 3 spectral descriptors together. It would be better conceptually
18 * to have 3 different spectral descriptor objects, but this avoids re-computing
19 * the same FFT three times.
20 *
21 * Current spectral descriptors are spectral centroid, spectral rolloff and
22 * spectral flatness (see `values_object` for a further description of the
23 * object.
24 *
25 * All descriptors are currently summarized by their mean only.
26 */
27pub(crate) struct SpectralDesc {
28    phase_vocoder: PVoc,
29    sample_rate: u32,
30
31    centroid_aubio_desc: SpecDesc,
32    rolloff_aubio_desc: SpecDesc,
33    values_centroid: Vec<f32>,
34    values_rolloff: Vec<f32>,
35    values_flatness: Vec<f32>,
36}
37
38impl SpectralDesc {
39    pub const WINDOW_SIZE: usize = 512;
40    pub const HOP_SIZE: usize = SpectralDesc::WINDOW_SIZE / 4;
41
42    /**
43     * Compute score related to the
44     * [spectral centroid](https://en.wikipedia.org/wiki/Spectral_centroid) values,
45     * obtained after repeatedly calling `do_` on all of the song's chunks.
46     *
47     * Spectral centroid is used to determine the "brightness" of a sound, i.e.
48     * how much high frequency there is in an audio signal.
49     *
50     * It of course depends of the instrument used: a piano-only track that makes
51     * use of high frequencies will still score less than a song using a lot of
52     * percussive sound, because the piano frequency range is lower.
53     *
54     * The value range is between 0 and `sample_rate / 2`.
55     */
56    pub fn get_centroid(&mut self) -> Vec<f32> {
57        vec![
58            self.normalize(mean(&self.values_centroid)),
59            self.normalize(
60                arr1(&self.values_centroid)
61                    .std_axis(Axis(0), 0.)
62                    .into_scalar(),
63            ),
64        ]
65    }
66
67    /**
68     * Compute score related to the spectral roll-off values, obtained
69     * after repeatedly calling `do_` on all of the song's chunks.
70     *
71     * Spectral roll-off is the bin frequency number below which a certain
72     * percentage of the spectral energy is found, here, 95%.
73     *
74     * It can be used to distinguish voiced speech (low roll-off) and unvoiced
75     * speech (high roll-off). It is also a good indication of the energy
76     * repartition of a song.
77     *
78     * The value range is between 0 and `sample_rate / 2`
79     */
80    pub fn get_rolloff(&mut self) -> Vec<f32> {
81        vec![
82            self.normalize(mean(&self.values_rolloff)),
83            self.normalize(
84                arr1(&self.values_rolloff)
85                    .std_axis(Axis(0), 0.)
86                    .into_scalar(),
87            ),
88        ]
89    }
90
91    /**
92     * Compute score related to the
93     * [spectral flatness](https://en.wikipedia.org/wiki/Spectral_flatness) values,
94     * obtained after repeatedly calling `do_` on all of the song's chunks.
95     *
96     * Spectral flatness is the ratio between the geometric mean of the spectrum
97     * and its arithmetic mean.
98     *
99     * It is used to distinguish between tone-like and noise-like signals.
100     * Tone-like audio is f.ex. a piano key, something that has one or more
101     * specific frequencies, while (white) noise has an equal distribution
102     * of intensity among all frequencies.
103     *
104     * The value range is between 0 and 1, since the geometric mean is always less
105     * than the arithmetic mean.
106     */
107    pub fn get_flatness(&mut self) -> Vec<f32> {
108        let max_value = 1.;
109        let min_value = 0.;
110        // Range is different from the other spectral algorithms, so normalizing
111        // manually here.
112        vec![
113            2. * (mean(&self.values_flatness) - min_value) / (max_value - min_value) - 1.,
114            2. * (arr1(&self.values_flatness)
115                .std_axis(Axis(0), 0.)
116                .into_scalar()
117                - min_value)
118                / (max_value - min_value)
119                - 1.,
120        ]
121    }
122
123    pub fn new(sample_rate: u32) -> Result<Self, BlissError> {
124        Ok(SpectralDesc {
125            centroid_aubio_desc: SpecDesc::new(SpecShape::Centroid, SpectralDesc::WINDOW_SIZE)
126                .map_err(|e| {
127                    BlissError::AnalysisError(format!(
128                        "error while loading aubio centroid object: {}",
129                        e.to_string()
130                    ))
131                })?,
132            rolloff_aubio_desc: SpecDesc::new(SpecShape::Rolloff, SpectralDesc::WINDOW_SIZE)
133                .map_err(|e| {
134                    BlissError::AnalysisError(format!(
135                        "error while loading aubio rolloff object: {}",
136                        e.to_string()
137                    ))
138                })?,
139            phase_vocoder: PVoc::new(SpectralDesc::WINDOW_SIZE, SpectralDesc::HOP_SIZE).map_err(
140                |e| {
141                    BlissError::AnalysisError(format!(
142                        "error while loading aubio pvoc object: {}",
143                        e.to_string()
144                    ))
145                },
146            )?,
147            values_centroid: Vec::new(),
148            values_rolloff: Vec::new(),
149            values_flatness: Vec::new(),
150            sample_rate,
151        })
152    }
153
154    /**
155     * Compute all the descriptors' value for the given chunk.
156     *
157     * After using this on all the song's chunks, you can call
158     * `get_centroid`, `get_flatness` and `get_rolloff` to get the respective
159     * descriptors' values.
160     */
161    pub fn do_(&mut self, chunk: &[f32]) -> Result<(), BlissError> {
162        let mut fftgrain: Vec<f32> = vec![0.0; SpectralDesc::WINDOW_SIZE];
163        self.phase_vocoder
164            .do_(chunk, fftgrain.as_mut_slice())
165            .map_err(|e| {
166                BlissError::AnalysisError(format!(
167                    "error while processing aubio pv object: {}",
168                    e.to_string()
169                ))
170            })?;
171
172        let bin = self
173            .centroid_aubio_desc
174            .do_result(fftgrain.as_slice())
175            .map_err(|e| {
176                BlissError::AnalysisError(format!(
177                    "error while processing aubio centroid object: {}",
178                    e.to_string()
179                ))
180            })?;
181
182        let freq = bin_to_freq(
183            bin,
184            self.sample_rate as f32,
185            SpectralDesc::WINDOW_SIZE as f32,
186        );
187        self.values_centroid.push(freq);
188
189        let mut bin = self
190            .rolloff_aubio_desc
191            .do_result(fftgrain.as_slice())
192            .unwrap();
193
194        // Until https://github.com/aubio/aubio/pull/318 is in
195        if bin > SpectralDesc::WINDOW_SIZE as f32 / 2. {
196            bin = SpectralDesc::WINDOW_SIZE as f32 / 2.;
197        }
198
199        let freq = bin_to_freq(
200            bin,
201            self.sample_rate as f32,
202            SpectralDesc::WINDOW_SIZE as f32,
203        );
204        self.values_rolloff.push(freq);
205
206        let cvec: CVec = fftgrain.as_slice().into();
207        let geo_mean = geometric_mean(&cvec.norm());
208        if geo_mean == 0.0 {
209            self.values_flatness.push(0.0);
210            return Ok(());
211        }
212        let flatness = geo_mean / mean(&cvec.norm());
213        self.values_flatness.push(flatness);
214        Ok(())
215    }
216}
217
218impl Normalize for SpectralDesc {
219    const MAX_VALUE: f32 = SAMPLE_RATE as f32 / 2.;
220    const MIN_VALUE: f32 = 0.;
221}
222
223/**
224 * [Zero-crossing rate](https://en.wikipedia.org/wiki/Zero-crossing_rate)
225 * detection object.
226 *
227 * Zero-crossing rate is mostly used to detect percussive sounds in an audio
228 * signal, as well as whether an audio signal contains speech or not.
229 *
230 * It is a good metric to differentiate between songs with people speaking clearly,
231 * (e.g. slam) and instrumental songs.
232 *
233 * The value range is between 0 and 1.
234 */
235#[derive(Default)]
236pub(crate) struct ZeroCrossingRateDesc {
237    values: Vec<u32>,
238    number_samples: usize,
239}
240
241impl ZeroCrossingRateDesc {
242    #[allow(dead_code)]
243    pub fn new(_sample_rate: u32) -> Self {
244        ZeroCrossingRateDesc::default()
245    }
246
247    /// Count the number of zero-crossings for the current `chunk`.
248    pub fn do_(&mut self, chunk: &[f32]) {
249        self.values.push(number_crossings(chunk));
250        self.number_samples += chunk.len();
251    }
252
253    /// Sum the number of zero-crossings witnessed and divide by
254    /// the total number of samples.
255    pub fn get_value(&mut self) -> f32 {
256        self.normalize((self.values.iter().sum::<u32>()) as f32 / self.number_samples as f32)
257    }
258}
259
260impl Normalize for ZeroCrossingRateDesc {
261    const MAX_VALUE: f32 = 1.;
262    const MIN_VALUE: f32 = 0.;
263}
264
265#[cfg(test)]
266mod tests {
267    use super::*;
268    use crate::Song;
269
270    #[test]
271    fn test_zcr_boundaries() {
272        let mut zcr_desc = ZeroCrossingRateDesc::default();
273        let chunk = vec![0.; 1024];
274        zcr_desc.do_(&chunk);
275        assert_eq!(-1., zcr_desc.get_value());
276
277        let one_chunk = vec![-1., 1.];
278        let chunks = std::iter::repeat(one_chunk.iter())
279            .take(512)
280            .flatten()
281            .cloned()
282            .collect::<Vec<f32>>();
283        let mut zcr_desc = ZeroCrossingRateDesc::default();
284        zcr_desc.do_(&chunks);
285        assert!(0.001 > (0.9980469 - zcr_desc.get_value()).abs());
286    }
287
288    #[test]
289    fn test_zcr() {
290        let song = Song::decode("data/s16_mono_22_5kHz.flac").unwrap();
291        let mut zcr_desc = ZeroCrossingRateDesc::default();
292        for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
293            zcr_desc.do_(&chunk);
294        }
295        assert!(0.001 > (-0.85036 - zcr_desc.get_value()).abs());
296    }
297
298    #[test]
299    fn test_spectral_flatness_boundaries() {
300        let mut spectral_desc = SpectralDesc::new(10).unwrap();
301        let chunk = vec![0.; 1024];
302
303        let expected_values = vec![-1., -1.];
304        spectral_desc.do_(&chunk).unwrap();
305        for (expected, actual) in expected_values
306            .iter()
307            .zip(spectral_desc.get_flatness().iter())
308        {
309            assert!(0.0000001 > (expected - actual).abs());
310        }
311
312        let song = Song::decode("data/white_noise.flac").unwrap();
313        let mut spectral_desc = SpectralDesc::new(22050).unwrap();
314        for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
315            spectral_desc.do_(&chunk).unwrap();
316        }
317        // White noise - as close to 1 as possible
318        let expected_values = vec![0.6706717, -0.9685736];
319        for (expected, actual) in expected_values
320            .iter()
321            .zip(spectral_desc.get_flatness().iter())
322        {
323            assert!(0.001 > (expected - actual).abs());
324        }
325    }
326
327    #[test]
328    fn test_spectral_flatness() {
329        let song = Song::decode("data/s16_mono_22_5kHz.flac").unwrap();
330        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
331        for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
332            spectral_desc.do_(&chunk).unwrap();
333        }
334        // Spectral flatness mean value computed here with phase vocoder before normalization: 0.111949615
335        // Essentia value with spectrum / hann window: 0.11197535695207445
336        let expected_values = vec![-0.77610075, -0.8148179];
337        for (expected, actual) in expected_values
338            .iter()
339            .zip(spectral_desc.get_flatness().iter())
340        {
341            assert!(0.01 > (expected - actual).abs());
342        }
343    }
344
345    #[test]
346    fn test_spectral_roll_off_boundaries() {
347        let mut spectral_desc = SpectralDesc::new(10).unwrap();
348        let chunk = vec![0.; 512];
349
350        let expected_values = vec![-1., -1.];
351        spectral_desc.do_(&chunk).unwrap();
352        for (expected, actual) in expected_values
353            .iter()
354            .zip(spectral_desc.get_rolloff().iter())
355        {
356            assert!(0.0000001 > (expected - actual).abs());
357        }
358
359        let song = Song::decode("data/tone_11080Hz.flac").unwrap();
360        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
361        for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
362            spectral_desc.do_(&chunk).unwrap();
363        }
364        let expected_values = vec![0.9967681, -0.99615175];
365        for (expected, actual) in expected_values
366            .iter()
367            .zip(spectral_desc.get_rolloff().iter())
368        {
369            assert!(0.0001 > (expected - actual).abs());
370        }
371    }
372
373    #[test]
374    fn test_spectral_roll_off() {
375        let song = Song::decode("data/s16_mono_22_5kHz.flac").unwrap();
376        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
377        for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
378            spectral_desc.do_(&chunk).unwrap();
379        }
380        let expected_values = vec![-0.6326486, -0.7260933];
381        // Roll-off mean value computed here with phase vocoder before normalization: 2026.7644
382        // Essentia value with spectrum / hann window: 1979.632683520047
383        for (expected, actual) in expected_values
384            .iter()
385            .zip(spectral_desc.get_rolloff().iter())
386        {
387            assert!(0.01 > (expected - actual).abs());
388        }
389    }
390
391    #[test]
392    fn test_spectral_centroid() {
393        let song = Song::decode("data/s16_mono_22_5kHz.flac").unwrap();
394        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
395        for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
396            spectral_desc.do_(&chunk).unwrap();
397        }
398        // Spectral centroid mean value computed here with phase vocoder before normalization: 1354.2273
399        // Essentia value with spectrum / hann window: 1351
400        let expected_values = vec![-0.75483, -0.87916887];
401        for (expected, actual) in expected_values
402            .iter()
403            .zip(spectral_desc.get_centroid().iter())
404        {
405            assert!(0.0001 > (expected - actual).abs());
406        }
407    }
408
409    #[test]
410    fn test_spectral_centroid_boundaries() {
411        let mut spectral_desc = SpectralDesc::new(10).unwrap();
412        let chunk = vec![0.; 512];
413
414        spectral_desc.do_(&chunk).unwrap();
415        let expected_values = vec![-1., -1.];
416        for (expected, actual) in expected_values
417            .iter()
418            .zip(spectral_desc.get_centroid().iter())
419        {
420            assert!(0.0000001 > (expected - actual).abs());
421        }
422        let song = Song::decode("data/tone_11080Hz.flac").unwrap();
423        let mut spectral_desc = SpectralDesc::new(SAMPLE_RATE).unwrap();
424        for chunk in song.sample_array.chunks_exact(SpectralDesc::HOP_SIZE) {
425            spectral_desc.do_(&chunk).unwrap();
426        }
427        let expected_values = vec![0.97266, -0.9609926];
428        for (expected, actual) in expected_values
429            .iter()
430            .zip(spectral_desc.get_centroid().iter())
431        {
432            assert!(0.00001 > (expected - actual).abs());
433        }
434    }
435}
436
437#[cfg(all(feature = "bench", test))]
438mod bench {
439    extern crate test;
440    use crate::timbral::{SpectralDesc, ZeroCrossingRateDesc};
441    use test::Bencher;
442
443    #[bench]
444    fn bench_spectral_desc(b: &mut Bencher) {
445        let mut spectral_desc = SpectralDesc::new(10).unwrap();
446        let chunk = vec![0.; 512];
447
448        b.iter(|| {
449            spectral_desc.do_(&chunk).unwrap();
450        });
451    }
452
453    #[bench]
454    fn bench_zcr_desc(b: &mut Bencher) {
455        let mut zcr_desc = ZeroCrossingRateDesc::new(10);
456        let chunk = vec![0.; 512];
457
458        b.iter(|| {
459            zcr_desc.do_(&chunk);
460        });
461    }
462}