maolan_engine/simd.rs

//! Portable SIMD helper routines for buffer math.
//!
//! Uses runtime CPU feature detection to dispatch:
//! - AVX (`__m256` intrinsics) on x86_64/x86 when available
//! - SSE (`wide::f32x4`) as fallback on x86_64/x86
//! - Scalar loops on all other architectures
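//!
//! # Example
//!
//! A minimal usage sketch (assuming this file is compiled as the `simd`
//! module of the `maolan_engine` crate):
//!
//! ```ignore
//! let mut mix = vec![0.0f32; 256];
//! let voice = vec![0.25f32; 256];
//! // Mixes `voice` into `mix` at half gain; the fastest available
//! // instruction set is chosen at runtime.
//! maolan_engine::simd::add_scaled_inplace(&mut mix, &voice, 0.5);
//! ```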

#![allow(unsafe_op_in_unsafe_fn)]

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
mod x86 {
    #[cfg(target_arch = "x86")]
    pub use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    pub use std::arch::x86_64::*;
    pub use wide::f32x4;
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use x86::*;

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/// dst[i] += src[i]
pub fn add_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_inplace_sse(dst, src);
            return;
        }
    }
    add_inplace_scalar(dst, src);
}

/// dst[i] *= gain
pub fn mul_inplace(dst: &mut [f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            mul_inplace_avx(dst, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            mul_inplace_sse(dst, gain);
            return;
        }
    }
    mul_inplace_scalar(dst, gain);
}

/// dst[i] += src[i] * gain
pub fn add_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    add_scaled_inplace_scalar(dst, src, gain);
}

/// dst[i] = src[i] * gain
pub fn copy_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    copy_scaled_inplace_scalar(dst, src, gain);
}

/// dst[i] += sanitize_finite(src[i])
pub fn add_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    add_sanitized_inplace_scalar(dst, src);
}

/// dst[i] = sanitize_finite(src[i])
pub fn copy_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    copy_sanitized_inplace_scalar(dst, src);
}

/// Replace NaN / ±Inf with 0.0 in place.
pub fn sanitize_finite_inplace(buf: &mut [f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            sanitize_finite_inplace_avx(buf);
            return;
        }
        if is_x86_feature_detected!("sse") {
            sanitize_finite_inplace_sse(buf);
            return;
        }
    }
    sanitize_finite_inplace_scalar(buf);
}

/// Horizontal max of abs(buf[i]).
pub fn peak_abs(buf: &[f32]) -> f32 {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            return peak_abs_avx(buf);
        }
        if is_x86_feature_detected!("sse") {
            return peak_abs_sse(buf);
        }
    }
    peak_abs_scalar(buf)
}

/// Clamp every element to [min, max].
pub fn clamp_inplace(buf: &mut [f32], min: f32, max: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            clamp_inplace_avx(buf, min, max);
            return;
        }
        if is_x86_feature_detected!("sse") {
            clamp_inplace_sse(buf, min, max);
            return;
        }
    }
    clamp_inplace_scalar(buf, min, max);
}

// ---------------------------------------------------------------------------
// Scalar fallbacks
// ---------------------------------------------------------------------------

fn add_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s;
    }
}

fn mul_inplace_scalar(dst: &mut [f32], gain: f32) {
    for d in dst.iter_mut() {
        *d *= gain;
    }
}

fn add_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s * gain;
    }
}

fn copy_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = *s * gain;
    }
}

fn add_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

fn copy_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

fn sanitize_finite_inplace_scalar(buf: &mut [f32]) {
    for s in buf.iter_mut() {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

fn peak_abs_scalar(buf: &[f32]) -> f32 {
    buf.iter().fold(0.0f32, |acc, s| acc.max(s.abs()))
}

fn clamp_inplace_scalar(buf: &mut [f32], min: f32, max: f32) {
    for s in buf.iter_mut() {
        *s = s.clamp(min, max);
    }
}

// ---------------------------------------------------------------------------
// SSE paths (wide::f32x4)
// ---------------------------------------------------------------------------
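//
// Note: `wide::f32x4` lowers to SSE instructions when the target supports
// them and to scalar code otherwise, so these helpers are safe to call on
// any CPU; the runtime "sse" check in the dispatchers above only selects
// which path runs.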

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
unsafe fn add_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        let r = d + s;
        d_chunk.copy_from_slice(&r.to_array());
    }
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
unsafe fn mul_inplace_sse(dst: &mut [f32], gain: f32) {
    let n = dst.len() / 4;
    let g: f32x4 = [gain; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst[i * 4..(i + 1) * 4];
        let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
        let r = d * g;
        d_chunk.copy_from_slice(&r.to_array());
    }
    for d in &mut dst[n * 4..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
unsafe fn add_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    let g: f32x4 = [gain; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        let r = d + s * g;
        d_chunk.copy_from_slice(&r.to_array());
    }
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
unsafe fn copy_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    let g: f32x4 = [gain; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        let r = s * g;
        d_chunk.copy_from_slice(&r.to_array());
    }
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d = *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
unsafe fn add_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    let zero: f32x4 = [0.0; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        let mask = s.is_finite();
        let sanitized = mask.blend(s, zero);
        let r = d + sanitized;
        d_chunk.copy_from_slice(&r.to_array());
    }
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
unsafe fn copy_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    let zero: f32x4 = [0.0; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        let mask = s.is_finite();
        let r = mask.blend(s, zero);
        d_chunk.copy_from_slice(&r.to_array());
    }
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
unsafe fn sanitize_finite_inplace_sse(buf: &mut [f32]) {
    let n = buf.len() / 4;
    let zero: f32x4 = [0.0; 4].into();
    for i in 0..n {
        let chunk = &mut buf[i * 4..(i + 1) * 4];
        let v: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        let mask = v.is_finite();
        let r = mask.blend(v, zero);
        chunk.copy_from_slice(&r.to_array());
    }
    for s in &mut buf[n * 4..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
unsafe fn peak_abs_sse(buf: &[f32]) -> f32 {
    let n = buf.len() / 4;
    let mut peak: f32x4 = [0.0; 4].into();
    for i in 0..n {
        let chunk = &buf[i * 4..(i + 1) * 4];
        let v: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        peak = peak.max(v.abs());
    }
    let mut max_scalar = peak.to_array().into_iter().fold(0.0f32, |a, b| a.max(b));
    for s in &buf[n * 4..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
unsafe fn clamp_inplace_sse(buf: &mut [f32], min: f32, max: f32) {
    let n = buf.len() / 4;
    let vmin: f32x4 = [min; 4].into();
    let vmax: f32x4 = [max; 4].into();
    for i in 0..n {
        let chunk = &mut buf[i * 4..(i + 1) * 4];
        let v: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        let r = v.clamp(vmin, vmax);
        chunk.copy_from_slice(&r.to_array());
    }
    for s in &mut buf[n * 4..] {
        *s = s.clamp(min, max);
    }
}

// ---------------------------------------------------------------------------
// AVX paths (std::arch::x86_64::__m256)
// ---------------------------------------------------------------------------
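//
// Safety: each function below is compiled with `#[target_feature(enable =
// "avx")]`, so it must only be called after a successful runtime
// `is_x86_feature_detected!("avx")` check, as the dispatchers above ensure.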

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, s);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn mul_inplace_avx(dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst.len() {
        let d = _mm256_loadu_ps(dst.as_ptr().add(i));
        let r = _mm256_mul_ps(d, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for d in &mut dst[i..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, _mm256_mul_ps(s, g));
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
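        // `andnot(-0.0, s)` clears the sign bit (abs). The ordered compare
        // `abs(s) <= f32::MAX` is false for NaN and ±Inf, so the mask
        // selects only the finite lanes.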
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps::<_CMP_LE_OQ>(abs_s, max_val);
        let sanitized = _mm256_blendv_ps(zero, s, mask);
        let r = _mm256_add_ps(d, sanitized);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps::<_CMP_LE_OQ>(abs_s, max_val);
        let r = _mm256_blendv_ps(zero, s, mask);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn sanitize_finite_inplace_avx(buf: &mut [f32]) {
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        let mask = _mm256_cmp_ps::<_CMP_LE_OQ>(abs_v, max_val);
        let r = _mm256_blendv_ps(zero, v, mask);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn peak_abs_avx(buf: &[f32]) -> f32 {
    let mut peak = _mm256_setzero_ps();
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        peak = _mm256_max_ps(peak, abs_v);
        i += 8;
    }
    let mut arr = [0.0f32; 8];
    _mm256_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.iter().fold(0.0f32, |a, &b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn clamp_inplace_avx(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm256_set1_ps(min);
    let vmax = _mm256_set1_ps(max);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let r = _mm256_max_ps(vmin, _mm256_min_ps(vmax, v));
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn add_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_inplace(&mut a, &b);
        assert_eq!(a, [11.0, 22.0, 33.0, 44.0, 55.0]);
    }

    #[test]
    fn mul_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        mul_inplace(&mut a, 2.0);
        assert_eq!(a, [2.0, 4.0, 6.0, 8.0, 10.0]);
    }

    #[test]
    fn add_scaled_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [6.0, 12.0, 18.0, 24.0, 30.0]);
    }

    #[test]
    fn copy_scaled_inplace_basic() {
        let mut a = [0.0f32; 5];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        copy_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [5.0, 10.0, 15.0, 20.0, 25.0]);
    }

    #[test]
    fn add_sanitized_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        add_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 1.5);
        assert!(a[1].is_finite() && a[1] == 2.0);
        assert!(a[2].is_finite() && a[2] == 3.0);
        assert!(a[3].is_finite() && a[3] == 5.0);
    }

    #[test]
    fn copy_sanitized_inplace_basic() {
        let mut a = [0.0f32; 4];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        copy_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 0.5);
        assert!(a[1].is_finite() && a[1] == 0.0);
        assert!(a[2].is_finite() && a[2] == 0.0);
        assert!(a[3].is_finite() && a[3] == 1.0);
    }

    #[test]
    fn sanitize_finite_inplace_basic() {
        let mut a = [1.0f32, f32::NAN, f32::INFINITY, 4.0, f32::NEG_INFINITY];
        sanitize_finite_inplace(&mut a);
        assert!(a[0].is_finite() && a[0] == 1.0);
        assert_eq!(a[1], 0.0);
        assert_eq!(a[2], 0.0);
        assert!(a[3].is_finite() && a[3] == 4.0);
        assert_eq!(a[4], 0.0);
    }

    #[test]
    fn peak_abs_basic() {
        let a = [1.0f32, -3.0, 2.0, 0.5];
        assert_eq!(peak_abs(&a), 3.0);
    }

    #[test]
    fn clamp_inplace_basic() {
        let mut a = [-2.0f32, -0.5, 0.0, 0.5, 2.0];
        clamp_inplace(&mut a, -1.0, 1.0);
        assert_eq!(a, [-1.0, -0.5, 0.0, 0.5, 1.0]);
    }
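
    // Supplementary checks (not part of the original suite): exercise
    // length-mismatch handling, 24-bit sign extension, and fade endpoints.

    #[test]
    fn add_inplace_length_mismatch() {
        // Only the shared prefix is touched; extra `dst` samples stay put.
        let mut a = [1.0f32, 2.0, 3.0];
        let b = [10.0f32, 20.0];
        add_inplace(&mut a, &b);
        assert_eq!(a, [11.0, 22.0, 3.0]);
    }

    #[test]
    fn convert_i24_sign_extension() {
        // 0x00FF_FFFF is -1 in 24-bit two's complement; 0x0080_0000 is the
        // most negative 24-bit value.
        let src = [0x0000_0001i32, 0x00FF_FFFF, 0x007F_FFFF, 0x0080_0000];
        let mut dst = [0.0f32; 4];
        convert_i24_to_f32(&src, &mut dst, 1.0);
        assert_eq!(dst, [1.0, -1.0, 8_388_607.0, -8_388_608.0]);
    }

    #[test]
    fn fade_in_endpoints() {
        // t ramps 0 -> 1 across the buffer: first sample ~0, last ~1.
        let mut a = [1.0f32; 8];
        apply_fade_in_inplace(&mut a, 0.0, 1.0 / 7.0);
        assert!(a[0].abs() < 1e-6);
        assert!((a[7] - 1.0).abs() < 1e-3);
    }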
}

// ---------------------------------------------------------------------------
// Integer -> f32 conversion
// ---------------------------------------------------------------------------

/// Convert i32 samples to f32 and scale by `gain`.
/// `dst` must be at least as long as `src`.
pub fn convert_i32_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_i32_to_f32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_i32_to_f32_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i32_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i32_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_i32_to_f32_sse2(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let f = _mm_cvtepi32_ps(s);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_i32_to_f32_avx(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let f = _mm256_cvtepi32_ps(s);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

/// Convert i16 samples to f32 and scale by `gain`.
/// `dst` must be at least as long as `src`.
pub fn convert_i16_to_f32(src: &[i16], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i16_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i16_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i16_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i16_to_f32_scalar(src: &[i16], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i16_to_f32_sse41(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(bytes));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), g);
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i16_to_f32_avx2(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        // Two 128-bit loads cover all 16 samples per iteration; each is
        // widened i16 -> i32 -> f32 in a 256-bit register.
        let lo16 = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let hi16 = _mm_loadu_si128(src.as_ptr().add(i + 8) as *const __m128i);
        let low = _mm256_cvtepi16_epi32(lo16);
        let high = _mm256_cvtepi16_epi32(hi16);
        let low_f = _mm256_mul_ps(_mm256_cvtepi32_ps(low), g);
        let high_f = _mm256_mul_ps(_mm256_cvtepi32_ps(high), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i + 8), high_f);
        i += 16;
    }
    if i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(bytes));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), _mm_set1_ps(gain));
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

/// Convert i8 samples to f32 and scale by `gain`.
/// `dst` must be at least as long as `src`.
pub fn convert_i8_to_f32(src: &[i8], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i8_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i8_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i8_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i8_to_f32_scalar(src: &[i8], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i8_to_f32_sse41(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        // `read_unaligned` avoids UB: an `&[i8]` has no i32 alignment guarantee.
        let bytes = _mm_cvtsi32_si128((src.as_ptr().add(i) as *const i32).read_unaligned());
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i8_to_f32_avx2(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        // `_mm_loadl_epi64` performs an unaligned 8-byte load (and, unlike
        // `_mm_cvtsi64_si128`, also exists on 32-bit x86).
        let bytes = _mm_loadl_epi64(src.as_ptr().add(i) as *const __m128i);
        let i32s = _mm256_cvtepi8_epi32(bytes);
        let f32s = _mm256_mul_ps(_mm256_cvtepi32_ps(i32s), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 8;
    }
    if i + 4 <= src.len() {
        let bytes = _mm_cvtsi32_si128((src.as_ptr().add(i) as *const i32).read_unaligned());
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

/// Convert i32 samples with lower 24 bits valid to f32 and scale by `gain`.
/// Sign-extends the lower 24 bits of each i32 before conversion.
/// `dst` must be at least as long as `src`.
pub fn convert_i24_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i24_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i24_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i24_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i24_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i24_to_f32_sse41(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
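        // Shift left 8, then arithmetic-shift right 8: this sign-extends
        // bit 23 through the top byte, matching the scalar mask-and-or path.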
        let extended = _mm_srai_epi32::<8>(_mm_slli_epi32::<8>(s));
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i24_to_f32_avx2(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let extended = _mm256_srai_epi32::<8>(_mm256_slli_epi32::<8>(s));
        let f = _mm256_cvtepi32_ps(extended);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let extended = _mm_srai_epi32::<8>(_mm_slli_epi32::<8>(s));
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

/// Convert f32 samples to i32 and scale by `gain`, masking to lower 24 bits.
/// Uses truncation toward zero (matching Rust `as i32` for in-range values;
/// NaN and out-of-range inputs may differ between the SIMD and scalar paths).
/// `dst` must be at least as long as `src`.
pub fn convert_f32_to_i24(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i24_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i24_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i24_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i24_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i24_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mask = _mm_set1_epi32(0x00FF_FFFF);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let m = _mm_and_si128(v, mask);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i24_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x00FF_FFFF));
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        let m = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), mask));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, m);
        i += 8;
    }
    if i + 4 <= src.len() {
        let g_sse = _mm_set1_ps(gain);
        let mask_sse = _mm_set1_epi32(0x00FF_FFFF);
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g_sse));
        let m = _mm_and_si128(v, mask_sse);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

/// Convert f32 samples to i32 and scale by `gain`.
/// Uses truncation toward zero (matching Rust `as i32` for in-range values;
/// NaN and out-of-range inputs may differ between the SIMD and scalar paths).
/// `dst` must be at least as long as `src`.
pub fn convert_f32_to_i32(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i32_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i32_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i32_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i32_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, v);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

/// Convert f32 samples to i16 and scale by `gain`.
/// Uses truncation toward zero (matching Rust `as i16` for in-range values;
/// NaN and out-of-range inputs may differ between the SIMD and scalar paths).
/// `dst` must be at least as long as `src`.
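/// For input normalized to [-1.0, 1.0], a gain of `32767.0` maps full scale
/// to the i16 range (a common convention, not a requirement of this API).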
pub fn convert_f32_to_i16(src: &[f32], dst: &mut [i16], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i16_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i16_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i16_scalar(src: &[f32], dst: &mut [i16], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i16_sse2(src: &[f32], dst: &mut [i16], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let packed = _mm_packs_epi32(v0, v1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

/// Convert f32 samples to i8 and scale by `gain`.
/// Uses truncation toward zero (matching Rust `as i8` for in-range values;
/// NaN and out-of-range inputs may differ between the SIMD and scalar paths).
/// `dst` must be at least as long as `src`.
pub fn convert_f32_to_i8(src: &[f32], dst: &mut [i8], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i8_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i8_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i8_scalar(src: &[f32], dst: &mut [i8], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i8_sse2(src: &[f32], dst: &mut [i8], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let s2 = _mm_loadu_ps(src.as_ptr().add(i + 8));
        let s3 = _mm_loadu_ps(src.as_ptr().add(i + 12));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let v2 = _mm_cvttps_epi32(_mm_mul_ps(s2, g));
        let v3 = _mm_cvttps_epi32(_mm_mul_ps(s3, g));
        let p0 = _mm_packs_epi32(v0, v1);
        let p1 = _mm_packs_epi32(v2, v3);
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

// ---------------------------------------------------------------------------
// Fade ramps (sin/cos based)
// ---------------------------------------------------------------------------

/// Apply a sine-based fade-in gain ramp in place: `gain = sin(t * π/2)`.
/// `t` for sample `i` is `(start_t + i as f32 * dt).clamp(0.0, 1.0)`.
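///
/// For example, a 480-sample fade ramp would use `start_t = 0.0` and
/// `dt = 1.0 / 480.0`: the first sample is scaled by `sin(0) = 0` and the
/// last by approximately `sin(π/2) = 1`.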
pub fn apply_fade_in_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    use wide::f32x4;
    let pi_2 = f32x4::splat(std::f32::consts::FRAC_PI_2);
    let dt_vec = f32x4::from([0.0, dt, 2.0 * dt, 3.0 * dt]);
    let mut i = 0;
    while i + 4 <= buf.len() {
        let base_t = f32x4::splat(start_t + i as f32 * dt);
        let t = (base_t + dt_vec).clamp(f32x4::splat(0.0), f32x4::splat(1.0));
        let gain = (t * pi_2).sin();
        let chunk = &mut buf[i..i + 4];
        let s: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        let r = s * gain;
        chunk.copy_from_slice(&r.to_array());
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (((i + j) as f32 * dt) + start_t).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

/// Apply a cosine-based fade-out gain ramp in place: `gain = cos(t * π/2)`.
/// `t` for sample `i` is `(start_t + i as f32 * dt).clamp(0.0, 1.0)`.
pub fn apply_fade_out_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    use wide::f32x4;
    let pi_2 = f32x4::splat(std::f32::consts::FRAC_PI_2);
    let dt_vec = f32x4::from([0.0, dt, 2.0 * dt, 3.0 * dt]);
    let mut i = 0;
    while i + 4 <= buf.len() {
        let base_t = f32x4::splat(start_t + i as f32 * dt);
        let t = (base_t + dt_vec).clamp(f32x4::splat(0.0), f32x4::splat(1.0));
        let gain = (t * pi_2).cos();
        let chunk = &mut buf[i..i + 4];
        let s: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        let r = s * gain;
        chunk.copy_from_slice(&r.to_array());
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (((i + j) as f32 * dt) + start_t).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}