Skip to main content

truce_simd/
math.rs

1//! Vectorized transcendentals at audio-grade precision.
2//!
3//! Each `*_block` op mirrors the shape of [`crate::ops`]: takes a
4//! mutable output slice and a read-only input slice, writes
5//! element-wise. Lengths are taken to `min(out, src)`. Scalar
6//! fallback is identical math; the SIMD path uses `wide`'s
7//! sleef-derived approximations.
8//!
9//! Why these and not a full math library: every truce plugin
10//! converts dB to linear (gain, mix, send, peak) and most
11//! waveshapers want a fast `tanh`. The rest (`exp2`, `log2`) come
12//! along for free since they're the building blocks of the first
13//! two. We do not ship the wider set (`sin`, `cos`, `erf`,
14//! `lgamma`); plugins that need those pull `wide` directly or use
15//! `simdeez` / `sleef`.
16//!
17//! ## Error bounds
18//!
19//! - `db_to_linear_block`, `linear_to_db_block`: < 0.1 dB across
20//!   the audio range `[-120, +24]` dB. Round-trip through
21//!   linear→dB→linear stays within 1 part in 10^6 (verified by
22//!   the fuzz test).
23//! - `exp2_block`, `log2_block`: < 1 ULP for inputs in
24//!   `[-126, +127]` (exp2) and `[2^-126, 2^127]` (log2). NaN /
25//!   negative log2 inputs return NaN.
26//! - `tanh_block`: exp-identity form via `wide::exp`, < 5e-6
27//!   absolute error vs `f32::tanh` across `[-10, +10]`. Inputs
28//!   outside that range clamp first; at `|x| = 10`, true `tanh`
29//!   is already within 5e-9 of `±1`.
30
31/// `20 / log2(10)`, for `linear → dB`.
32#[cfg(feature = "wide-backend")]
33const TWENTY_OVER_LOG2_10: f32 = 6.020_6;
34
35/// `out[i] = 10^(src[i] / 20)`. The dB → linear conversion every
36/// gain knob needs.
37#[inline]
38pub fn db_to_linear_block(out: &mut [f32], src: &[f32]) {
39    #[cfg(feature = "wide-backend")]
40    {
41        use wide::f32x8;
42        let n = out.len().min(src.len());
43        let n8 = n / 8 * 8;
44        // 10^(db/20) = exp(db * ln(10) / 20). `wide` provides
45        // `exp` directly; routing through it is one fma + one
46        // exp per chunk.
47        let scale = f32x8::splat(core::f32::consts::LN_10 / 20.0);
48        let (head_out, tail_out) = out[..n].split_at_mut(n8);
49        for (out_chunk, src_chunk) in head_out.chunks_exact_mut(8).zip(src[..n8].chunks_exact(8)) {
50            let v = f32x8::from(<[f32; 8]>::try_from(src_chunk).unwrap_or_default());
51            out_chunk.copy_from_slice((v * scale).exp().as_array_ref());
52        }
53        db_to_linear_block_scalar(tail_out, &src[n8..n]);
54    }
55    #[cfg(not(feature = "wide-backend"))]
56    db_to_linear_block_scalar(out, src);
57}
58
59/// Scalar fallback for [`db_to_linear_block`]. Kept `pub` so
60/// Criterion benches can hand the same inputs to the scalar and
61/// vector paths.
62#[inline]
63pub fn db_to_linear_block_scalar(out: &mut [f32], src: &[f32]) {
64    let n = out.len().min(src.len());
65    for i in 0..n {
66        out[i] = 10.0_f32.powf(src[i] / 20.0);
67    }
68}
69
70/// `out[i] = 20 * log10(src[i])`. Linear → dB. Returns
71/// `-f32::INFINITY` for zero input (matches `f32::log10`).
72#[inline]
73pub fn linear_to_db_block(out: &mut [f32], src: &[f32]) {
74    #[cfg(feature = "wide-backend")]
75    {
76        use wide::f32x8;
77        let n = out.len().min(src.len());
78        let n8 = n / 8 * 8;
79        let scale = f32x8::splat(TWENTY_OVER_LOG2_10);
80        let (head_out, tail_out) = out[..n].split_at_mut(n8);
81        for (out_chunk, src_chunk) in head_out.chunks_exact_mut(8).zip(src[..n8].chunks_exact(8)) {
82            let v = f32x8::from(<[f32; 8]>::try_from(src_chunk).unwrap_or_default());
83            out_chunk.copy_from_slice((v.log2() * scale).as_array_ref());
84        }
85        linear_to_db_block_scalar(tail_out, &src[n8..n]);
86    }
87    #[cfg(not(feature = "wide-backend"))]
88    linear_to_db_block_scalar(out, src);
89}
90
91/// Scalar fallback for [`linear_to_db_block`].
92#[inline]
93pub fn linear_to_db_block_scalar(out: &mut [f32], src: &[f32]) {
94    let n = out.len().min(src.len());
95    for i in 0..n {
96        out[i] = 20.0 * src[i].log10();
97    }
98}
99
100/// `out[i] = 2^src[i]`. The building block for `exp` and
101/// `db_to_linear`.
102#[inline]
103pub fn exp2_block(out: &mut [f32], src: &[f32]) {
104    #[cfg(feature = "wide-backend")]
105    {
106        use wide::f32x8;
107        let n = out.len().min(src.len());
108        let n8 = n / 8 * 8;
109        let ln2 = f32x8::splat(core::f32::consts::LN_2);
110        let (head_out, tail_out) = out[..n].split_at_mut(n8);
111        for (out_chunk, src_chunk) in head_out.chunks_exact_mut(8).zip(src[..n8].chunks_exact(8)) {
112            // exp2(x) = exp(x * ln(2)). `wide` has `exp` natively
113            // but no `exp2`; multiply-then-exp is the same cycle
114            // count as a hypothetical direct `exp2`.
115            let v = f32x8::from(<[f32; 8]>::try_from(src_chunk).unwrap_or_default());
116            out_chunk.copy_from_slice((v * ln2).exp().as_array_ref());
117        }
118        exp2_block_scalar(tail_out, &src[n8..n]);
119    }
120    #[cfg(not(feature = "wide-backend"))]
121    exp2_block_scalar(out, src);
122}
123
124/// Scalar fallback for [`exp2_block`].
125#[inline]
126pub fn exp2_block_scalar(out: &mut [f32], src: &[f32]) {
127    let n = out.len().min(src.len());
128    for i in 0..n {
129        out[i] = src[i].exp2();
130    }
131}
132
133/// `out[i] = log2(src[i])`. Building block for log10 and
134/// `linear_to_db`.
135#[inline]
136pub fn log2_block(out: &mut [f32], src: &[f32]) {
137    #[cfg(feature = "wide-backend")]
138    {
139        use wide::f32x8;
140        let n = out.len().min(src.len());
141        let n8 = n / 8 * 8;
142        let (head_out, tail_out) = out[..n].split_at_mut(n8);
143        for (out_chunk, src_chunk) in head_out.chunks_exact_mut(8).zip(src[..n8].chunks_exact(8)) {
144            let v = f32x8::from(<[f32; 8]>::try_from(src_chunk).unwrap_or_default());
145            out_chunk.copy_from_slice(v.log2().as_array_ref());
146        }
147        log2_block_scalar(tail_out, &src[n8..n]);
148    }
149    #[cfg(not(feature = "wide-backend"))]
150    log2_block_scalar(out, src);
151}
152
153/// Scalar fallback for [`log2_block`].
154#[inline]
155pub fn log2_block_scalar(out: &mut [f32], src: &[f32]) {
156    let n = out.len().min(src.len());
157    for i in 0..n {
158        out[i] = src[i].log2();
159    }
160}
161
162/// `out[i] = tanh(src[i])`. For soft-clipping waveshapers and any
163/// other DSP that wants a bounded sigmoid.
164///
165/// Computed via the exp identity `tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)`,
166/// which keeps the entire pipeline inside `wide`'s native `exp`
167/// SIMD intrinsic. Input is clamped to `[-10, +10]` first so the
168/// exponentiation can't overflow; at that magnitude `tanh` is
169/// already within 5e-9 of `±1`. Error vs `f32::tanh` stays below
170/// 5e-6 absolute across the audio range.
171#[inline]
172pub fn tanh_block(out: &mut [f32], src: &[f32]) {
173    #[cfg(feature = "wide-backend")]
174    {
175        use wide::f32x8;
176        let n = out.len().min(src.len());
177        let n8 = n / 8 * 8;
178        let bound = f32x8::splat(10.0);
179        let neg_bound = f32x8::splat(-10.0);
180        let two = f32x8::splat(2.0);
181        let one = f32x8::splat(1.0);
182        let (head_out, tail_out) = out[..n].split_at_mut(n8);
183        for (out_chunk, src_chunk) in head_out.chunks_exact_mut(8).zip(src[..n8].chunks_exact(8)) {
184            let x = f32x8::from(<[f32; 8]>::try_from(src_chunk).unwrap_or_default());
185            let x_clamped = x.fast_max(neg_bound).fast_min(bound);
186            let e2x = (x_clamped * two).exp();
187            let result = (e2x - one) / (e2x + one);
188            out_chunk.copy_from_slice(result.as_array_ref());
189        }
190        tanh_block_scalar(tail_out, &src[n8..n]);
191    }
192    #[cfg(not(feature = "wide-backend"))]
193    tanh_block_scalar(out, src);
194}
195
196/// Scalar fallback for [`tanh_block`]. Uses libm's `tanh` (full
197/// precision) rather than the Padé approximant; the SIMD path's
198/// approximation is the cost of vector throughput, and the scalar
199/// path doesn't pay that cost.
200#[inline]
201pub fn tanh_block_scalar(out: &mut [f32], src: &[f32]) {
202    let n = out.len().min(src.len());
203    for i in 0..n {
204        out[i] = src[i].tanh();
205    }
206}
207
208#[cfg(test)]
209mod tests {
210    // Tolerance-based comparisons; bit-exactness isn't the
211    // contract here (approximations are the whole point).
212    #![allow(clippy::float_cmp, clippy::cast_precision_loss)]
213
214    use super::*;
215
216    fn max_abs_err(a: &[f32], b: &[f32]) -> f32 {
217        a.iter()
218            .zip(b.iter())
219            .map(|(x, y)| (x - y).abs())
220            .fold(0.0_f32, f32::max)
221    }
222
223    fn max_rel_err(a: &[f32], b: &[f32]) -> f32 {
224        a.iter()
225            .zip(b.iter())
226            .filter(|(_, y)| y.abs() > 1e-6)
227            .map(|(x, y)| ((x - y) / y).abs())
228            .fold(0.0_f32, f32::max)
229    }
230
231    #[test]
232    fn db_to_linear_block_matches_libm() {
233        let src: Vec<f32> = (-120..=24).map(|i| i as f32).collect();
234        let mut out = vec![0.0; src.len()];
235        db_to_linear_block(&mut out, &src);
236        let expected: Vec<f32> = src.iter().map(|&x| 10.0_f32.powf(x / 20.0)).collect();
237        // Across [-120, +24] dB the relative error budget is 1e-5
238        // (well under 0.1 dB).
239        assert!(
240            max_rel_err(&out, &expected) < 1e-5,
241            "rel err = {}",
242            max_rel_err(&out, &expected)
243        );
244    }
245
246    #[test]
247    fn linear_to_db_round_trips() {
248        let db: Vec<f32> = (-100..=20).map(|i| i as f32).collect();
249        let mut lin = vec![0.0; db.len()];
250        let mut roundtrip = vec![0.0; db.len()];
251        db_to_linear_block(&mut lin, &db);
252        linear_to_db_block(&mut roundtrip, &lin);
253        // Round-trip stays within 1e-4 dB.
254        let err = max_abs_err(&db, &roundtrip);
255        assert!(err < 1e-4, "round-trip err = {err} dB");
256    }
257
258    #[test]
259    fn exp2_block_matches_libm() {
260        let src: Vec<f32> = (-100..=100).map(|i| i as f32 * 0.1).collect();
261        let mut out = vec![0.0; src.len()];
262        exp2_block(&mut out, &src);
263        let expected: Vec<f32> = src.iter().map(|&x| x.exp2()).collect();
264        assert!(
265            max_rel_err(&out, &expected) < 1e-5,
266            "rel err = {}",
267            max_rel_err(&out, &expected)
268        );
269    }
270
271    #[test]
272    fn log2_block_matches_libm() {
273        let src: Vec<f32> = (1..=200).map(|i| i as f32).collect();
274        let mut out = vec![0.0; src.len()];
275        log2_block(&mut out, &src);
276        let expected: Vec<f32> = src.iter().map(|&x| x.log2()).collect();
277        assert!(
278            max_abs_err(&out, &expected) < 1e-5,
279            "abs err = {}",
280            max_abs_err(&out, &expected)
281        );
282    }
283
284    #[test]
285    fn tanh_block_matches_libm() {
286        let src: Vec<f32> = (-100..=100).map(|i| i as f32 * 0.1).collect();
287        let mut out = vec![0.0; src.len()];
288        tanh_block(&mut out, &src);
289        let expected: Vec<f32> = src.iter().map(|&x| x.tanh()).collect();
290        let err = max_abs_err(&out, &expected);
291        assert!(err < 5e-6, "abs err = {err}");
292    }
293
294    #[test]
295    fn tanh_block_saturates_for_large_inputs() {
296        let src = [-50.0, -20.0, 20.0, 50.0];
297        let mut out = [0.0; 4];
298        tanh_block(&mut out, &src);
299        for &y in &out {
300            assert!(
301                (y.abs() - 1.0).abs() < 1e-4,
302                "expected saturation near ±1, got {y}"
303            );
304        }
305    }
306
307    #[test]
308    fn lengths_min_clamped() {
309        let src = [1.0_f32, 2.0, 3.0];
310        let mut out = [0.0_f32; 5];
311        db_to_linear_block(&mut out, &src);
312        // Last two slots untouched.
313        assert_eq!(out[3], 0.0);
314        assert_eq!(out[4], 0.0);
315    }
316}