oximedia-audio-analysis 0.1.0

Audio analysis tools for media including loudness, spectrum, and speech detection
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#![allow(dead_code)]
//! Psychoacoustic modeling for audio analysis.
//!
//! This module implements psychoacoustic models including:
//! - Bark and ERB critical band scales
//! - Equal-loudness contour approximation (ISO 226)
//! - Masking threshold estimation (simultaneous masking)
//! - Specific loudness computation (Zwicker model approximation)

// ── Critical-band scales ──────────────────────────────────────────────

/// Map a frequency in Hz onto the Bark critical-band scale.
///
/// The result is the sum of two arctangent terms: one driven by the
/// frequency itself and one driven by the squared ratio `hz / 7500`.
/// An input of `0.0` maps to `0.0` Bark.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn hz_to_bark(hz: f64) -> f64 {
    // First term: 13 * atan(0.00076 * f).
    let linear_term = 13.0 * (0.00076 * hz).atan();

    // Second term: 3.5 * atan((f / 7500)^2).
    let ratio = hz / 7500.0;
    let squared_term = 3.5 * ratio.powi(2).atan();

    linear_term + squared_term
}

/// Invert [`hz_to_bark`]: find the frequency in Hz whose Bark value is `bark`.
///
/// There is no closed-form inverse, so the frequency is refined with up to
/// 20 Newton steps starting from the seed `bark * 100.0`. The estimate is
/// clamped so it never goes below 0 Hz, and the iteration stops early if the
/// derivative becomes numerically negligible.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn bark_to_hz(bark: f64) -> f64 {
    const MAX_STEPS: usize = 20;
    const MIN_SLOPE: f64 = 1e-15;

    // Rough seed for the iteration.
    let mut estimate = bark * 100.0;

    for _step in 0..MAX_STEPS {
        // Analytic derivative of `hz_to_bark`, one term per arctangent.
        let slope_linear = 13.0 * 0.00076 / (1.0 + (0.00076 * estimate).powi(2));
        let slope_squared = 3.5 * 2.0 * estimate
            / (7500.0 * 7500.0)
            / (1.0 + (estimate / 7500.0).powi(4));
        let slope = slope_linear + slope_squared;

        // A vanishing slope would blow up the Newton step; stop refining.
        if slope.abs() < MIN_SLOPE {
            break;
        }

        let residual = hz_to_bark(estimate) - bark;
        estimate -= residual / slope;

        // Frequencies below zero are not meaningful; pin to 0 Hz.
        if estimate < 0.0 {
            estimate = 0.0;
        }
    }

    estimate
}

/// Map a frequency in Hz onto the ERB-rate (Equivalent Rectangular Bandwidth) scale.
///
/// Computed as `21.4 * log10(0.00437 * hz + 1)`, so `0.0` Hz maps to `0.0`.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn hz_to_erb_rate(hz: f64) -> f64 {
    let scaled = 0.00437 * hz + 1.0;
    21.4 * scaled.log10()
}

/// Width in Hz of the equivalent rectangular band centred on `hz`.
///
/// Computed as `24.7 * (0.00437 * hz + 1)`; at `0.0` Hz the result is `24.7`.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn erb_bandwidth(hz: f64) -> f64 {
    let scaled = 0.00437 * hz + 1.0;
    24.7 * scaled
}

// ── Equal-loudness contour (ISO 226 approximation) ────────────────────

/// A-weighting gain in dB at frequency `hz`.
///
/// Evaluates the rational transfer-function magnitude of the standard
/// A-weighting curve (IEC 61672-1) and adds the 2 dB normalisation so the
/// result is approximately 0 dB at 1 kHz.
///
/// Non-positive frequencies, and any case where the denominator is not
/// positive, return the sentinel `-200.0` dB.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn a_weighting_db(hz: f64) -> f64 {
    const FLOOR_DB: f64 = -200.0;

    if hz <= 0.0 {
        return FLOOR_DB;
    }

    let sq = hz * hz;
    let quad = sq * sq;

    // Pole terms of the weighting curve, each of the form f^2 + f_pole^2.
    let outer_low = sq + 20.6_f64.powi(2);
    let middle = ((sq + 107.7_f64.powi(2)) * (sq + 737.9_f64.powi(2))).sqrt();
    let outer_high = sq + 12194.0_f64.powi(2);

    let numerator = 12194.0_f64.powi(2) * quad;
    let denominator = outer_low * middle * outer_high;
    if denominator <= 0.0 {
        return FLOOR_DB;
    }

    20.0 * (numerator / denominator).log10() + 2.0
}

/// Rough equal-loudness contour: the SPL (dB) at `hz` that is treated as
/// being as loud as `phon`.
///
/// This is a deliberately crude model, not an ISO 226 implementation: the
/// A-weighting gain at `hz` is subtracted from `phon`, so frequencies the
/// A-curve attenuates require a correspondingly higher SPL.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn equal_loudness_spl(hz: f64, phon: f64) -> f64 {
    // Wherever the A-weighting is 0 dB (around 1 kHz) the result equals
    // `phon`; elsewhere the weighting acts as the contour offset.
    phon - a_weighting_db(hz)
}

// ── Masking model ─────────────────────────────────────────────────────

/// Tunable parameters for masking-threshold estimation.
#[derive(Clone, Debug)]
pub struct MaskingConfig {
    /// FFT length behind the analysed spectrum; the bin spacing is derived
    /// as `sample_rate / fft_size`.
    pub fft_size: usize,
    /// Amount in dB subtracted from a bin's level when the bin is classified
    /// as a tonal masker.
    pub tonal_offset_db: f64,
    /// Amount in dB subtracted from a bin's level when the bin is classified
    /// as a noise masker.
    pub noise_offset_db: f64,
}

impl Default for MaskingConfig {
    /// Defaults: 2048-point FFT, 14.5 dB tonal offset, 5.5 dB noise offset.
    fn default() -> Self {
        MaskingConfig {
            noise_offset_db: 5.5,
            tonal_offset_db: 14.5,
            fft_size: 2048,
        }
    }
}

/// A masking threshold at a specific frequency.
///
/// Values of this type are produced by [`estimate_masking_thresholds`], one
/// per analysed spectrum bin.
#[derive(Debug, Clone, Copy)]
pub struct MaskingThreshold {
    /// Center frequency in Hz (bin index times the bin spacing).
    pub frequency_hz: f64,
    /// Masking threshold in dB SPL: the bin level minus the tonal or noise
    /// offset, depending on `is_tonal`.
    pub threshold_db: f64,
    /// Whether the masker is tonal (a local spectral peak above -60 dB).
    pub is_tonal: bool,
}

/// Compute the spreading function for simultaneous masking.
///
/// `bark_diff` is the distance in Bark between masker and maskee (maskee
/// minus masker); negative values lie below the masker in frequency.
///
/// Returns the relative masking level in dB. The value is `0` at the masker
/// itself and never positive elsewhere: a masker cannot mask a neighbouring
/// band more strongly than its own.
///
/// The curve is a three-segment, piecewise-linear approximation:
/// - below -1 Bark: steep lower skirt of 27 dB/Bark,
/// - within ±1 Bark: symmetric 6.7 dB/Bark roll-off around the masker,
/// - above +1 Bark: shallow upper skirt of `-27 + 0.37 * 60` dB/Bark.
///
/// NOTE(review): the segments do not join continuously at ±1 Bark; that
/// property of the original model is intentionally left unchanged here.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn spreading_function(bark_diff: f64) -> f64 {
    /// Attenuation slope (dB/Bark) of the lower skirt.
    const LOWER_SLOPE: f64 = 27.0;
    /// Attenuation slope (dB/Bark) in the immediate vicinity of the masker.
    const NEAR_SLOPE: f64 = 6.7;
    /// Signed slope (dB/Bark) of the upper skirt, with a fixed level term.
    const UPPER_SLOPE: f64 = -27.0 + 0.37 * 60.0;

    if bark_diff < -1.0 {
        // `bark_diff` is negative here, so the product is an attenuation.
        LOWER_SLOPE * bark_diff
    } else if bark_diff <= 1.0 {
        // Use the magnitude: without it, maskees up to 1 Bark *below* the
        // masker would receive a positive gain (up to +6.7 dB).
        -NEAR_SLOPE * bark_diff.abs()
    } else {
        UPPER_SLOPE * bark_diff
    }
}

/// Derive a per-bin masking threshold from a power spectrum.
///
/// `power_spectrum_db` should have `fft_size / 2 + 1` entries in dB and
/// `sample_rate` is in Hz. The bin spacing is `sample_rate / config.fft_size`.
///
/// Every interior bin (the first and last bins are never emitted) whose
/// frequency lies in `20 Hz ..= sample_rate / 2` yields one
/// [`MaskingThreshold`]:
/// - a strict local maximum louder than -60 dB is classified as tonal and
///   its threshold is the bin level minus `config.tonal_offset_db`;
/// - any other bin is classified as noise and uses `config.noise_offset_db`.
///
/// Spectra with fewer than three bins produce an empty result.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn estimate_masking_thresholds(
    power_spectrum_db: &[f64],
    sample_rate: f64,
    config: &MaskingConfig,
) -> Vec<MaskingThreshold> {
    let hz_per_bin = sample_rate / config.fft_size as f64;
    let nyquist = sample_rate / 2.0;

    power_spectrum_db
        .windows(3)
        .enumerate()
        .filter_map(|(offset, neighbourhood)| {
            // Window `offset` is centred on bin `offset + 1`.
            let frequency_hz = (offset + 1) as f64 * hz_per_bin;
            if frequency_hz < 20.0 || frequency_hz > nyquist {
                return None;
            }

            let left = neighbourhood[0];
            let level = neighbourhood[1];
            let right = neighbourhood[2];

            // Tonal maskers are strict local peaks that are not buried in the floor.
            let is_tonal = level > left && level > right && level > -60.0;
            let offset_db = if is_tonal {
                config.tonal_offset_db
            } else {
                config.noise_offset_db
            };

            Some(MaskingThreshold {
                frequency_hz,
                threshold_db: level - offset_db,
                is_tonal,
            })
        })
        .collect()
}

// ── Specific loudness (Zwicker approximation) ─────────────────────────

/// Result of specific loudness computation.
///
/// Returned by [`compute_loudness`].
#[derive(Debug, Clone)]
pub struct LoudnessResult {
    /// Specific loudness per Bark band (sone/Bark), one entry per band in
    /// ascending Bark order.
    pub specific_loudness: Vec<f64>,
    /// Total loudness in sone (the sum of `specific_loudness`).
    pub total_sone: f64,
    /// Total loudness in phon, derived from `total_sone`; `0.0` when
    /// `total_sone` is not positive.
    pub total_phon: f64,
}

/// Compute total loudness from a power spectrum using a simplified Zwicker model.
///
/// `power_spectrum_db` should have `fft_size / 2 + 1` entries in dB SPL.
/// Bin `i` is assigned the frequency `i * sample_rate / fft_size`.
///
/// Processing steps:
/// 1. Each bin's power is accumulated (in the linear domain) into one of 24
///    one-Bark-wide bands selected via [`hz_to_bark`]; anything at or above
///    24 Bark is folded into the last band. Every band starts from a
///    -100 dB floor.
/// 2. A band whose accumulated level exceeds -60 dB contributes
///    `power^0.23` to the specific loudness; quieter bands contribute `0.0`.
/// 3. `total_sone` is the sum over bands, and `total_phon` follows the
///    standard relation `sone = 2^((phon - 40) / 10)`, i.e. 1 sone maps to
///    40 phon. `total_phon` is `0.0` when `total_sone` is not positive.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn compute_loudness(
    power_spectrum_db: &[f64],
    sample_rate: f64,
    fft_size: usize,
) -> LoudnessResult {
    /// Number of one-Bark-wide bands (0..24 Bark).
    const NUM_BANDS: usize = 24;
    /// Level every band starts from before any bin is added.
    const EMPTY_BAND_DB: f64 = -100.0;
    /// Bands at or below this level are treated as inaudible.
    const AUDIBLE_BAND_DB: f64 = -60.0;
    /// Exponent of the compressive power law applied to band power.
    const LOUDNESS_EXPONENT: f64 = 0.23;

    let bin_hz = sample_rate / fft_size as f64;

    // Accumulate directly in the linear power domain; converting to dB and
    // back on every bin only adds rounding error and cost.
    let mut band_power = [10.0_f64.powf(EMPTY_BAND_DB / 10.0); NUM_BANDS];
    for (bin, &level_db) in power_spectrum_db.iter().enumerate() {
        let bark = hz_to_bark(bin as f64 * bin_hz);
        // Float-to-usize `as` saturates (negative and NaN become 0), and the
        // `min` folds everything above the top band into it.
        let band = (bark.floor() as usize).min(NUM_BANDS - 1);
        band_power[band] += 10.0_f64.powf(level_db / 10.0);
    }

    // Simplified specific loudness: compressive power law per audible band.
    let specific_loudness: Vec<f64> = band_power
        .iter()
        .map(|&power| {
            if 10.0 * power.log10() > AUDIBLE_BAND_DB {
                power.powf(LOUDNESS_EXPONENT)
            } else {
                0.0
            }
        })
        .collect();

    let total_sone: f64 = specific_loudness.iter().sum();

    // Inverse of sone = 2^((phon - 40) / 10). The previous code divided by
    // 0.0625 (= 2^-4) first, which shifted every result up by 40 phon.
    let total_phon = if total_sone > 0.0 {
        40.0 + 10.0 * total_sone.log2()
    } else {
        0.0
    };

    LoudnessResult {
        specific_loudness,
        total_sone,
        total_phon,
    }
}

/// Absolute threshold of hearing (threshold in quiet) at `hz`, in dB SPL.
///
/// Evaluates a three-term closed-form curve in kHz: a low-frequency rise
/// (`3.64 * f^-0.8`), a Gaussian-shaped dip centred on 3.3 kHz, and a steep
/// high-frequency rise (`1e-3 * f^4`).
///
/// NOTE(review): the earlier comment attributed this to ISO 226:2003; the
/// coefficients look like the classic Terhardt approximation — confirm.
///
/// Non-positive frequencies return the sentinel `200.0` dB.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn threshold_in_quiet(hz: f64) -> f64 {
    if hz <= 0.0 {
        return 200.0;
    }

    let khz = hz / 1000.0;

    let low_rise = 3.64 * khz.powf(-0.8);
    let sensitivity_dip = 6.5 * (-0.6 * (khz - 3.3).powi(2)).exp();
    let high_rise = 1e-3 * khz.powf(4.0);

    low_rise - sensitivity_dip + high_rise
}

/// Critical bandwidth in Hz around the centre frequency `hz`.
///
/// Computed as `25 + 75 * (1 + 1.4 * (hz / 1000)^2)^0.69`, which gives
/// exactly `100.0` Hz at `0.0` Hz and grows with frequency.
#[allow(clippy::cast_precision_loss)]
#[must_use]
pub fn critical_bandwidth(hz: f64) -> f64 {
    let khz = hz / 1000.0;
    let growth = (1.0 + 1.4 * khz.powi(2)).powf(0.69);
    25.0 + 75.0 * growth
}

/// Report whether a component at `frequency_hz` with level `level_db_spl`
/// lies strictly above the absolute threshold of hearing.
///
/// A level exactly equal to the threshold counts as inaudible.
#[must_use]
pub fn is_audible(frequency_hz: f64, level_db_spl: f64) -> bool {
    let hearing_floor_db = threshold_in_quiet(frequency_hz);
    level_db_spl > hearing_floor_db
}

#[cfg(test)]
mod tests {
    // Sanity checks for the psychoacoustic helpers. Most assertions use
    // generous ranges around textbook values rather than exact numbers,
    // because the functions under test are approximations.
    use super::*;

    // ── Critical-band scales ──

    #[test]
    fn test_hz_to_bark_zero() {
        // 0 Hz must map to (approximately) 0 Bark.
        assert!((hz_to_bark(0.0)).abs() < 0.01);
    }

    #[test]
    fn test_hz_to_bark_1000() {
        let bark = hz_to_bark(1000.0);
        // ~8.5 Bark at 1 kHz
        assert!(bark > 7.5 && bark < 9.5, "Bark at 1kHz = {bark}");
    }

    #[test]
    fn test_bark_to_hz_roundtrip() {
        // Hz -> Bark -> Hz should land within 5 Hz of the starting frequency.
        for &freq in &[100.0, 500.0, 1000.0, 4000.0, 8000.0] {
            let bark = hz_to_bark(freq);
            let back = bark_to_hz(bark);
            assert!(
                (freq - back).abs() < 5.0,
                "Roundtrip failed for {freq} Hz: got {back}"
            );
        }
    }

    #[test]
    fn test_hz_to_erb_rate() {
        let rate = hz_to_erb_rate(1000.0);
        // ~15.6 ERB rate at 1 kHz
        assert!(rate > 14.0 && rate < 17.0, "ERB rate at 1kHz = {rate}");
    }

    #[test]
    fn test_erb_bandwidth() {
        let bw = erb_bandwidth(1000.0);
        // ~132 Hz at 1 kHz
        assert!(bw > 100.0 && bw < 200.0, "ERB bw at 1kHz = {bw}");
    }

    // ── Weighting curves ──

    #[test]
    fn test_a_weighting_1000hz() {
        let w = a_weighting_db(1000.0);
        // A-weighting at 1 kHz should be ~0 dB
        assert!(w.abs() < 1.0, "A-weight at 1kHz = {w}");
    }

    #[test]
    fn test_a_weighting_low_freq() {
        let w = a_weighting_db(50.0);
        // A-weighting strongly attenuates low freqs
        assert!(w < -20.0, "A-weight at 50Hz = {w}");
    }

    // ── Masking model ──

    #[test]
    fn test_spreading_function_zero() {
        // No attenuation at the masker's own position.
        let s = spreading_function(0.0);
        assert!((s - 0.0).abs() < 0.1);
    }

    #[test]
    fn test_spreading_function_symmetric_falloff() {
        let s_pos = spreading_function(2.0);
        let s_neg = spreading_function(-2.0);
        // Both should be negative (attenuation)
        assert!(s_pos < 0.0);
        assert!(s_neg < 0.0);
    }

    // ── Threshold in quiet and related helpers ──

    #[test]
    fn test_threshold_in_quiet_1000hz() {
        let t = threshold_in_quiet(1000.0);
        // Around 3-4 dB at 1 kHz
        assert!(t > -5.0 && t < 15.0, "TiQ at 1kHz = {t}");
    }

    #[test]
    fn test_threshold_in_quiet_100hz() {
        let t = threshold_in_quiet(100.0);
        // Should be much higher at 100 Hz
        assert!(t > 15.0, "TiQ at 100Hz = {t}");
    }

    #[test]
    fn test_critical_bandwidth_low() {
        // Only a loose range check at 100 Hz.
        let cb = critical_bandwidth(100.0);
        assert!(cb > 25.0 && cb < 200.0, "CB at 100Hz = {cb}");
    }

    #[test]
    fn test_is_audible() {
        // 1 kHz at 60 dB SPL should be audible
        assert!(is_audible(1000.0, 60.0));
        // 1 kHz at -50 dB SPL should not be audible
        assert!(!is_audible(1000.0, -50.0));
    }

    // ── Loudness ──

    #[test]
    fn test_compute_loudness_basic() {
        let fft_size = 1024;
        let num_bins = fft_size / 2 + 1;
        // Moderate level spectrum
        let spectrum: Vec<f64> = vec![40.0; num_bins];
        let result = compute_loudness(&spectrum, 44100.0, fft_size);
        // One specific-loudness value per Bark band.
        assert_eq!(result.specific_loudness.len(), 24);
        assert!(result.total_sone > 0.0);
        assert!(result.total_phon > 0.0);
    }

    #[test]
    fn test_compute_loudness_silence() {
        let fft_size = 512;
        let num_bins = fft_size / 2 + 1;
        // Every bin at -100 dB: no band should exceed the audibility floor.
        let spectrum: Vec<f64> = vec![-100.0; num_bins];
        let result = compute_loudness(&spectrum, 44100.0, fft_size);
        assert!((result.total_sone).abs() < 1e-10);
    }

    // ── Masking threshold estimation ──

    #[test]
    fn test_estimate_masking_thresholds() {
        let fft_size = 512;
        let num_bins = fft_size / 2 + 1;
        let mut spectrum = vec![-80.0_f64; num_bins];
        // Add a tonal peak
        spectrum[50] = 60.0;
        let config = MaskingConfig {
            fft_size,
            ..MaskingConfig::default()
        };
        let thresholds = estimate_masking_thresholds(&spectrum, 44100.0, &config);
        assert!(!thresholds.is_empty());
        // The peak should produce a tonal masker
        assert!(thresholds.iter().any(|t| t.is_tonal));
    }
}