Skip to main content

aether_nodes/
mixer.rs

//! N-input mixer. Sums up to `MAX_INPUTS` signals with per-channel gain.
//!
//! Param layout:
//!   0..MAX_INPUTS = per-channel gain (default 1.0)
//!
//! SIMD strategy: this file uses no explicit SIMD types or intrinsics (no
//! `portable_simd`). The inner loops are plain scalar `f32` code and
//! vectorization is left to LLVM's auto-vectorizer, so the instructions
//! actually emitted (SSE/AVX/NEON, fused or not) depend on the target
//! features enabled for the build.
9
10use aether_core::{node::DspNode, param::ParamBlock, BUFFER_SIZE, MAX_INPUTS};
11
/// Mixer node that sums its connected inputs into a single output buffer.
///
/// The type carries no state of its own: per-channel gains are read from the
/// `ParamBlock` handed to `process` on every call.
#[derive(Debug, Clone, Copy, Default)]
pub struct Mixer;
13
14impl DspNode for Mixer {
15    fn process(
16        &mut self,
17        inputs: &[Option<&[f32; BUFFER_SIZE]>; MAX_INPUTS],
18        output: &mut [f32; BUFFER_SIZE],
19        params: &mut ParamBlock,
20        _sample_rate: f32,
21    ) {
22        output.fill(0.0);
23
24        for (slot, maybe_input) in inputs.iter().enumerate() {
25            if let Some(buf) = maybe_input {
26                let gain = if slot < params.count {
27                    params.get(slot).current
28                } else {
29                    1.0
30                };
31
32                mix_channel(output, buf, gain);
33            }
34        }
35
36        params.tick_all();
37    }
38
39    fn type_name(&self) -> &'static str {
40        "Mixer"
41    }
42}
43
44/// Mix one channel into the output buffer with the given gain.
45/// Processes 4 samples at a time using explicit loop unrolling that
46/// LLVM reliably vectorizes into SSE/AVX FMA on x86_64 and NEON on ARM.
47#[inline(always)]
48fn mix_channel(output: &mut [f32; BUFFER_SIZE], input: &[f32; BUFFER_SIZE], gain: f32) {
49    // Process 4 samples per iteration — compiler emits VFMADD231PS (AVX)
50    // or FMLA (NEON). The fixed chunk size (4) matches the SIMD lane width
51    // and allows the compiler to unroll without remainder handling.
52    const CHUNK: usize = 4;
53    let chunks = BUFFER_SIZE / CHUNK;
54
55    if (gain - 1.0).abs() < f32::EPSILON {
56        // Unity gain: pure addition — compiler emits VADDPS
57        for c in 0..chunks {
58            let i = c * CHUNK;
59            output[i]     += input[i];
60            output[i + 1] += input[i + 1];
61            output[i + 2] += input[i + 2];
62            output[i + 3] += input[i + 3];
63        }
64        // Scalar tail (if BUFFER_SIZE is not a multiple of 4)
65        for i in (chunks * CHUNK)..BUFFER_SIZE {
66            output[i] += input[i];
67        }
68    } else {
69        // Scaled: compiler emits VFMADD231PS (fused multiply-add)
70        for c in 0..chunks {
71            let i = c * CHUNK;
72            output[i]     = gain.mul_add(input[i],     output[i]);
73            output[i + 1] = gain.mul_add(input[i + 1], output[i + 1]);
74            output[i + 2] = gain.mul_add(input[i + 2], output[i + 2]);
75            output[i + 3] = gain.mul_add(input[i + 3], output[i + 3]);
76        }
77        for i in (chunks * CHUNK)..BUFFER_SIZE {
78            output[i] = gain.mul_add(input[i], output[i]);
79        }
80    }
81}