audio_samples 1.0.10

A typed audio processing library for Rust that treats audio as a first-class, invariant-preserving object rather than an unstructured numeric buffer.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
//! Psychoacoustic masking model.
//!
//! Implements the three core steps of a perceptual masking model:
//!
//! 1. **Absolute threshold of hearing (ATH)** — the minimum audible level at each
//!    frequency, below which masking is never needed.
//! 2. **Spreading function** — how a loud masker raises the masking threshold in
//!    neighbouring Bark bands (simultaneous masking). Tonal maskers (sinusoids)
//!    are more effective than noise maskers and use a higher gain value.
//! 3. **Masking-threshold computation** — combines ATH, band energies, masker
//!    classification, and the spreading function to produce a per-band threshold
//!    and signal-to-mask ratio.
//!
//! These primitives follow the structure of the MPEG-1 psychoacoustic model (ISO
//! 11172-3 Annex D), simplified for use as a general-purpose analysis layer.

use super::{BandLayout, BandMetric, BandMetrics, PsychoacousticConfig};
use non_empty_slice::NonEmptySlice;

// ── Masker type ────────────────────────────────────────────────────────────────

/// Classification of a spectral masker as tonal (sinusoidal) or noise-like.
///
/// The masking effectiveness differs between masker types: tonal maskers (pure
/// tones, harmonics) create a tighter but deeper masking threshold, while noise
/// maskers create a broader but shallower threshold. MPEG-1 uses 14.5 dB offset
/// for tonal maskers and 5.5 dB for noise maskers.
///
/// Determined per-band by [`classify_masker_types`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MaskerType {
    /// Tonal masker (sinusoidal, narrow-band). Uses `config.masking_gain`.
    Tonal,
    /// Noise-like masker (broadband). Uses `config.noise_masking_gain`.
    Noise,
}

/// Classifies each band's masker type based on local spectral prominence.
///
/// A band is classified as [`MaskerType::Tonal`] if its energy is at least
/// `tonal_threshold_db` above the average energy of its two immediate neighbours.
/// Bands with no neighbours (the first and last) are compared against their single
/// available neighbour. All other bands are classified as [`MaskerType::Noise`].
///
/// # Arguments
/// - `band_energy_db` – Per-band energy in dB, in band order.
/// - `tonal_threshold_db` – Minimum prominence (in dB) for a band to be tonal.
///   Typical value: `7.0` dB.
///
/// # Returns
/// A `Vec<MaskerType>` of the same length as `band_energy_db`.
#[must_use]
pub fn classify_masker_types(band_energy_db: &[f32], tonal_threshold_db: f32) -> Vec<MaskerType> {
    let n = band_energy_db.len();
    (0..n)
        .map(|i| {
            let neighbours: f32 = match (i.checked_sub(1), (i + 1 < n).then_some(i + 1)) {
                (Some(l), Some(r)) => (band_energy_db[l] + band_energy_db[r]) / 2.0,
                (Some(l), None) => band_energy_db[l],
                (None, Some(r)) => band_energy_db[r],
                (None, None) => band_energy_db[i],
            };
            if band_energy_db[i] - neighbours >= tonal_threshold_db {
                MaskerType::Tonal
            } else {
                MaskerType::Noise
            }
        })
        .collect()
}

// ── Absolute threshold of hearing ────────────────────────────────────────────

/// Returns the absolute threshold of hearing at `freq_hz` in dB SPL.
///
/// Uses the ISO 226 / MPEG-1 polynomial approximation:
///
/// ```text
/// ATH(f) = 3.64·(f/1000)^−0.8 − 6.5·exp(−0.6·(f/1000 − 3.3)²) + 10⁻³·(f/1000)⁴
/// ```
///
/// # Arguments
/// - `freq_hz` – Frequency in hertz. Values ≤ 0 return 60 dB (conservative maximum).
///
/// # Returns
/// Threshold in dB SPL, clamped to [−20, 60].
#[inline]
#[must_use]
pub fn absolute_threshold_of_hearing(freq_hz: f32) -> f32 {
    if freq_hz <= 0.0 {
        return 60.0;
    }
    let f = freq_hz / 1000.0;
    let term1 = 3.64_f32 * f.powf(-0.8_f32);
    let term2 = -6.5_f32 * (-0.6_f32 * (f - 3.3_f32).powi(2)).exp();
    let term3 = 1e-3_f32 * f.powi(4);
    (term1 + term2 + term3).clamp(-20.0, 60.0)
}

// ── Spreading function ────────────────────────────────────────────────────────

/// Returns the total masking attenuation in dB between a masker at `masker_bark`
/// and a maskee at `maskee_bark`.
///
/// The masking threshold the masker creates at `maskee_bark` is:
/// `masker_level_db − spreading_attenuation(masker_bark, maskee_bark, masker_type, config)`.
/// This means:
///
/// - At zero distance: attenuation equals the masker-type gain. For tonal maskers
///   this is `config.masking_gain` (e.g. 14.5 dB); for noise maskers it is
///   `config.noise_masking_gain` (e.g. 5.5 dB). A larger gain means the threshold
///   sits further below the masker, so the masker is more effective.
/// - With increasing Bark distance: attenuation grows by `upward_spread` dB/Bark
///   upward and `downward_spread` dB/Bark downward, reducing the masker's reach.
///
/// A signal is audible when its SMR (energy − threshold) is positive.
///
/// # Arguments
/// - `masker_bark` – Bark position of the masking tone or band.
/// - `maskee_bark` – Bark position of the masked band.
/// - `masker_type` – Whether the masker is tonal or noise-like.
/// - `config` – Psychoacoustic configuration containing spread slopes and gains.
///
/// # Returns
/// Total attenuation in dB.
#[inline]
#[must_use]
pub fn spreading_attenuation(
    masker_bark: f32,
    maskee_bark: f32,
    masker_type: MaskerType,
    config: &PsychoacousticConfig,
) -> f32 {
    let base_gain = match masker_type {
        MaskerType::Tonal => config.masking_gain,
        MaskerType::Noise => config.noise_masking_gain,
    };
    let dz = maskee_bark - masker_bark;
    let distance_decay = if dz >= 0.0 {
        config.upward_spread * dz
    } else {
        config.downward_spread * (-dz)
    };
    base_gain + distance_decay
}

// ── Band-metric computation ───────────────────────────────────────────────────

/// Computes per-band psychoacoustic metrics from a linear-scale bin-energy array.
///
/// Steps:
/// 1. Aggregate bin energies within each band's `[start_bin, end_bin)` range,
///    normalised by band width.
/// 2. Convert aggregated energy to dB, applying `config.epsilon` as a noise floor.
/// 3. Compute ATH for each band's centre frequency.
/// 4. For each target band `j`, the masking threshold = max(ATH[j], max over all
///    bands `i` of `energy[i] − spreading_attenuation(i → j)`). Bands far from
///    the masker contribute a lower threshold; the dominant masker wins.
/// 5. Derive SMR, importance (perceptual weight × SMR), and allowed noise.
///
/// # Arguments
/// - `bin_energies` – Linear power per spectral bin (e.g. `|MDCT[k]|²`).
/// - `band_layout` – Band definitions mapping bins to perceptual positions.
/// - `config` – Psychoacoustic model parameters.
/// - `n_bins` – Total number of spectral bins; must match the length of
///   `bin_energies`.
///
/// # Returns
/// A [`BandMetrics`] with one [`BandMetric`] per band in `band_layout`.
///
/// # Panics
///
/// Panics if `n_bins == 0` or if `config.perceptual_weights` length does not
/// match `band_layout.len()` (use [`PsychoacousticConfig::is_compatible_with`]
/// to validate beforehand).
pub fn compute_band_metrics(
    bin_energies: &[f32],
    band_layout: &BandLayout,
    config: &PsychoacousticConfig,
    n_bins: usize,
) -> BandMetrics {
    use super::bands::hz_to_bark;

    let bands = band_layout.as_slice();
    let n_bands = bands.len().get();
    let weights = config.perceptual_weights.as_non_empty_slice();

    // Step 1 & 2: per-band energy in dB.
    let band_energy_db: Vec<f32> = bands
        .iter()
        .map(|band| {
            let start = band.start_bin.min(n_bins);
            let end = band.end_bin.min(n_bins);
            let width = (end - start).max(1);
            let sum: f32 = bin_energies[start..end].iter().sum();
            10.0_f32 * (sum / width as f32 + config.epsilon).log10()
        })
        .collect();

    // Step 3: ATH per band.
    let ath_db: Vec<f32> = bands
        .iter()
        .map(|band| absolute_threshold_of_hearing(band.centre_frequency))
        .collect();

    // Bark positions for spreading computation.
    let bark_positions: Vec<f32> = bands
        .iter()
        .map(|band| hz_to_bark(band.centre_frequency))
        .collect();

    // Classify each band as tonal or noise-like (7 dB prominence threshold).
    let masker_types = classify_masker_types(&band_energy_db, 7.0);

    // Step 4: masking threshold per band.
    // For each target band j, find the strongest masking contribution from
    // any band i: threshold_contribution(i→j) = energy[i] − attenuation(i,j).
    // The dominant contribution determines the masking threshold (floored by ATH).
    let masking_thresholds: Vec<f32> = (0..n_bands)
        .map(|j| {
            let dominated_by: f32 = (0..n_bands)
                .map(|i| {
                    let attenuation = spreading_attenuation(
                        bark_positions[i],
                        bark_positions[j],
                        masker_types[i],
                        config,
                    );
                    band_energy_db[i] - attenuation
                })
                .fold(f32::NEG_INFINITY, f32::max);
            dominated_by.max(ath_db[j])
        })
        .collect();

    // Step 5: per-band metrics.
    let metrics: Vec<BandMetric> = (0..n_bands)
        .map(|i| {
            let energy = band_energy_db[i];
            let masking_threshold = masking_thresholds[i];
            let smr = compute_smr(energy, masking_threshold);
            let weight = weights[i];
            let importance = weight * smr.max(0.0);
            let allowed_noise = masking_threshold - smr.max(0.0);
            BandMetric::new(
                bands[i].clone(),
                energy,
                masking_threshold,
                smr,
                importance,
                allowed_noise,
            )
        })
        .collect();

    // SAFETY: n_bands >= 1 (BandLayout invariant) so metrics is non-empty.
    let metrics_slice = unsafe { NonEmptySlice::new_unchecked(&metrics) };
    BandMetrics::new(metrics_slice)
}

// ── Temporal masking ──────────────────────────────────────────────────────────

/// Applies temporal (time-domain) masking to a sequence of per-frame band metrics.
///
/// Simultaneous masking (computed by [`compute_band_metrics`]) only considers
/// energy present in the same MDCT frame. Temporal masking extends this:
///
/// - **Post-masking (forward masking)**: a loud event at frame `t` elevates the
///   masking threshold in subsequent frames for roughly 100–200 ms. The threshold
///   decays at `post_masking_decay_db_per_ms` dB per millisecond.
/// - **Pre-masking (backward masking)**: a loud event at frame `t+1` slightly
///   raises the threshold at earlier frames for roughly 20 ms. This is weaker
///   and decays faster at `pre_masking_decay_db_per_ms` dB per millisecond.
///
/// Both effects propagate until the contribution falls below the already-computed
/// simultaneous threshold, so the simultaneous threshold is always the lower bound.
///
/// # Arguments
/// - `frame_metrics` – Per-frame [`BandMetrics`] in temporal order, as produced by
///   repeated [`compute_band_metrics`] calls. Returns an empty `Vec` if empty.
/// - `config` – Psychoacoustic config; used to re-derive `importance` and
///   `allowed_noise` with the updated thresholds.
/// - `hop_duration_ms` – Duration of each MDCT hop in milliseconds
///   (`hop_size / sample_rate × 1000`).
/// - `post_masking_decay_db_per_ms` – Forward masking decay rate.
///   Typical MPEG-1 value: `0.15` dB/ms.
/// - `pre_masking_decay_db_per_ms` – Backward masking decay rate.
///   Typical MPEG-1 value: `3.0` dB/ms.
///
/// # Returns
/// A `Vec<BandMetrics>` of the same length with masking thresholds, SMR,
/// importance, and allowed noise updated to reflect temporal spread.
///
/// # Panics
///
/// Panics if any `BandMetrics` in `frame_metrics` has a different band count
/// than the first frame, or if `config.perceptual_weights` has fewer entries
/// than the number of bands.
pub fn apply_temporal_masking(
    frame_metrics: &[BandMetrics],
    config: &PsychoacousticConfig,
    hop_duration_ms: f32,
    post_masking_decay_db_per_ms: f32,
    pre_masking_decay_db_per_ms: f32,
) -> Vec<BandMetrics> {
    if frame_metrics.is_empty() {
        return Vec::new();
    }

    let n_frames = frame_metrics.len();
    let n_bands = frame_metrics[0].metrics.len().get();

    // Work on a mutable 2D grid [frame][band] of masking thresholds.
    // Initialise from the simultaneous-masking thresholds.
    let mut thresholds: Vec<Vec<f32>> = frame_metrics
        .iter()
        .map(|fm| fm.metrics.iter().map(|m| m.masking_threshold).collect())
        .collect();

    // Post-masking: frame t propagates forward until decay removes the contribution.
    for t in 0..n_frames {
        for b in 0..n_bands {
            let masker = frame_metrics[t].metrics[b].masking_threshold;
            let mut k = 1usize;
            loop {
                let future = t + k;
                if future >= n_frames {
                    break;
                }
                let decay = post_masking_decay_db_per_ms * (k as f32 * hop_duration_ms);
                let contribution = masker - decay;
                if contribution <= thresholds[future][b] {
                    break;
                }
                thresholds[future][b] = contribution;
                k += 1;
            }
        }
    }

    // Pre-masking: frame t propagates backward (weaker, faster decay).
    for t in 1..n_frames {
        for b in 0..n_bands {
            let masker = frame_metrics[t].metrics[b].masking_threshold;
            let mut k = 1usize;
            loop {
                if k > t {
                    break;
                }
                let past = t - k;
                let decay = pre_masking_decay_db_per_ms * (k as f32 * hop_duration_ms);
                let contribution = masker - decay;
                if contribution <= thresholds[past][b] {
                    break;
                }
                thresholds[past][b] = contribution;
                k += 1;
            }
        }
    }

    // Rebuild BandMetrics with updated derived fields.
    frame_metrics
        .iter()
        .zip(thresholds.iter())
        .map(|(fm, thresh_row)| {
            let metrics: Vec<BandMetric> = fm
                .metrics
                .iter()
                .zip(thresh_row.iter())
                .enumerate()
                .map(|(b, (m, &new_threshold))| {
                    let energy = m.energy;
                    let smr = compute_smr(energy, new_threshold);
                    let weight = config.perceptual_weights[b];
                    let importance = weight * smr.max(0.0);
                    let allowed_noise = new_threshold - smr.max(0.0);
                    BandMetric::new(
                        m.band.clone(),
                        energy,
                        new_threshold,
                        smr,
                        importance,
                        allowed_noise,
                    )
                })
                .collect();
            // SAFETY: n_bands >= 1 (BandMetrics invariant, preserved from input).
            let metrics_ne = unsafe { non_empty_slice::NonEmptySlice::new_unchecked(&metrics) };
            BandMetrics::new(metrics_ne)
        })
        .collect()
}

// ── Transient detection ───────────────────────────────────────────────────────

/// Detects transient windows in a mono signal by comparing short-term energy.
///
/// A window is classified as a transient if its RMS power exceeds the previous
/// window's RMS power by more than `threshold_ratio`. This identifies sudden
/// energy onsets (attacks, percussive events) where short windows are needed
/// to prevent pre-echo.
///
/// The first window is never a transient (no prior reference).
///
/// # Arguments
/// - `samples` – Mono f32 signal samples.
/// - `window_size` – Analysis window width in samples.
/// - `hop_size` – Hop between consecutive windows (typically `window_size / 2`).
/// - `threshold_ratio` – Energy ratio that triggers a transient flag.
///   Values of `6.0`–`10.0` correspond to roughly 8–10 dB sudden onsets.
///
/// # Returns
/// A [`NonEmptyVec<bool>`] with one entry per window; `true` = transient detected.
///
/// # Panics
///
/// Panics if `window_size` or `hop_size` is zero (both are `NonZeroUsize`).
#[must_use]
pub fn detect_transient_windows(
    samples: &non_empty_slice::NonEmptySlice<f32>,
    window_size: std::num::NonZeroUsize,
    hop_size: std::num::NonZeroUsize,
    threshold_ratio: f32,
) -> non_empty_slice::NonEmptyVec<bool> {
    let ws = window_size.get();
    let hs = hop_size.get();
    let n = samples.len().get();

    let energies: Vec<f32> = (0..)
        .map(|i: usize| i * hs)
        .take_while(|&pos| pos < n)
        .map(|pos| {
            let end = (pos + ws).min(n);
            let w = &samples[pos..end];
            w.iter().map(|s| s * s).sum::<f32>() / w.len() as f32
        })
        .collect();

    let result: Vec<bool> = energies
        .iter()
        .enumerate()
        .map(|(i, &e)| i > 0 && energies[i - 1] > 1e-10 && e > energies[i - 1] * threshold_ratio)
        .collect();

    // SAFETY: at least one window since n >= 1 (NonEmptySlice).
    unsafe { non_empty_slice::NonEmptyVec::new_unchecked(result) }
}

/// Computes the signal-to-mask ratio for a single band.
///
/// SMR = band energy (dB) − masking threshold (dB). A positive value means the
/// band is audible above the masking threshold; a negative value means it is
/// fully masked.
///
/// # Arguments
/// - `band_energy_db` – Aggregated band energy in dB.
/// - `masking_threshold_db` – Masking threshold for the band in dB.
///
/// # Returns
/// SMR in dB.
#[inline]
#[must_use]
pub fn compute_smr(band_energy_db: f32, masking_threshold_db: f32) -> f32 {
    band_energy_db - masking_threshold_db
}