
maolan_engine/simd.rs

//! Portable SIMD helper routines for buffer math.
//!
//! Uses runtime CPU feature detection to dispatch:
//! - AVX (`f32x8`) on x86_64/x86 when available
//! - SSE intrinsics as fallback on x86_64/x86
//! - Scalar loops on all other architectures
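//!
//! A minimal usage sketch (the paths assume this file is exposed as the
//! `simd` module of the `maolan_engine` crate):
//!
//! ```ignore
//! use maolan_engine::simd;
//!
//! let mut mix = vec![0.0f32; 256];
//! let voice = vec![0.25f32; 256];
//! simd::add_scaled_inplace(&mut mix, &voice, 0.5); // mix[i] += voice[i] * 0.5
//! simd::sanitize_finite_inplace(&mut mix);         // drop any NaN / ±Inf
//! let peak = simd::peak_abs(&mix);
//! assert!(peak <= 1.0);
//! ```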

#![allow(unsafe_op_in_unsafe_fn)]

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
mod x86 {
    #[cfg(target_arch = "x86")]
    pub use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    pub use std::arch::x86_64::*;
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use x86::*;

/// dst[i] += src[i]
pub fn add_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_inplace_sse(dst, src);
            return;
        }
    }
    add_inplace_scalar(dst, src);
}

/// dst[i] *= gain
pub fn mul_inplace(dst: &mut [f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            mul_inplace_avx(dst, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            mul_inplace_sse(dst, gain);
            return;
        }
    }
    mul_inplace_scalar(dst, gain);
}

/// dst[i] += src[i] * gain
pub fn add_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") && is_x86_feature_detected!("fma") {
            add_scaled_inplace_avx_fma(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("avx") {
            add_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    add_scaled_inplace_scalar(dst, src, gain);
}

/// dst[i] = src[i] * gain
pub fn copy_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    copy_scaled_inplace_scalar(dst, src, gain);
}

/// dst[i] += sanitize_finite(src[i])
pub fn add_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    add_sanitized_inplace_scalar(dst, src);
}

/// dst[i] = sanitize_finite(src[i])
pub fn copy_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    copy_sanitized_inplace_scalar(dst, src);
}

/// Replace NaN / ±Inf with 0.0 in place.
pub fn sanitize_finite_inplace(buf: &mut [f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            sanitize_finite_inplace_avx(buf);
            return;
        }
        if is_x86_feature_detected!("sse") {
            sanitize_finite_inplace_sse(buf);
            return;
        }
    }
    sanitize_finite_inplace_scalar(buf);
}

/// Horizontal max of abs(buf[i]).
pub fn peak_abs(buf: &[f32]) -> f32 {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            return peak_abs_avx(buf);
        }
        if is_x86_feature_detected!("sse") {
            return peak_abs_sse(buf);
        }
    }
    peak_abs_scalar(buf)
}

/// Clamp every element to [min, max].
pub fn clamp_inplace(buf: &mut [f32], min: f32, max: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            clamp_inplace_avx(buf, min, max);
            return;
        }
        if is_x86_feature_detected!("sse") {
            clamp_inplace_sse(buf, min, max);
            return;
        }
    }
    clamp_inplace_scalar(buf, min, max);
}

fn add_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s;
    }
}

fn mul_inplace_scalar(dst: &mut [f32], gain: f32) {
    for d in dst.iter_mut() {
        *d *= gain;
    }
}

fn add_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s * gain;
    }
}

fn copy_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = *s * gain;
    }
}

fn add_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

fn copy_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

fn sanitize_finite_inplace_scalar(buf: &mut [f32]) {
    for s in buf.iter_mut() {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

fn peak_abs_scalar(buf: &[f32]) -> f32 {
    buf.iter().fold(0.0f32, |acc, s| acc.max(s.abs()))
}

fn clamp_inplace_scalar(buf: &mut [f32], min: f32, max: f32) {
    for s in buf.iter_mut() {
        *s = s.clamp(min, max);
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_add_ps(d, s);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn mul_inplace_sse(dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst.len() {
        let d = _mm_loadu_ps(dst.as_ptr().add(i));
        let r = _mm_mul_ps(d, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for d in &mut dst[i..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_add_ps(d, _mm_mul_ps(s, g));
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        // `abs(s) <= f32::MAX` is false for NaN and ±Inf, so the comparison
        // mask zeroes exactly the non-finite lanes.
        let abs_s = _mm_andnot_ps(sign_mask, s);
        let finite_mask = _mm_cmple_ps(abs_s, finite_max);
        let sanitized = _mm_and_ps(s, finite_mask);
        let r = _mm_add_ps(d, sanitized);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm_andnot_ps(sign_mask, s);
        let finite_mask = _mm_cmple_ps(abs_s, finite_max);
        let r = _mm_and_ps(s, finite_mask);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn sanitize_finite_inplace_sse(buf: &mut [f32]) {
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm_andnot_ps(sign_mask, v);
        let finite_mask = _mm_cmple_ps(abs_v, finite_max);
        let r = _mm_and_ps(v, finite_mask);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn peak_abs_sse(buf: &[f32]) -> f32 {
    let sign_mask = _mm_set1_ps(-0.0);
    let mut peak = _mm_setzero_ps();
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm_andnot_ps(sign_mask, v);
        peak = _mm_max_ps(peak, abs_v);
        i += 4;
    }
    let mut arr = [0.0f32; 4];
    _mm_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.into_iter().fold(0.0f32, |a, b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn clamp_inplace_sse(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm_set1_ps(min);
    let vmax = _mm_set1_ps(max);
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let r = _mm_min_ps(_mm_max_ps(v, vmin), vmax);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, s);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn mul_inplace_avx(dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst.len() {
        let d = _mm256_loadu_ps(dst.as_ptr().add(i));
        let r = _mm256_mul_ps(d, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for d in &mut dst[i..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, _mm256_mul_ps(s, g));
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx,fma")]
unsafe fn add_scaled_inplace_avx_fma(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_fmadd_ps(s, g, d);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps(abs_s, max_val, _CMP_LE_OQ);
        let sanitized = _mm256_blendv_ps(zero, s, mask);
        let r = _mm256_add_ps(d, sanitized);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps(abs_s, max_val, _CMP_LE_OQ);
        let r = _mm256_blendv_ps(zero, s, mask);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn sanitize_finite_inplace_avx(buf: &mut [f32]) {
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        let mask = _mm256_cmp_ps(abs_v, max_val, _CMP_LE_OQ);
        let r = _mm256_blendv_ps(zero, v, mask);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn peak_abs_avx(buf: &[f32]) -> f32 {
    let mut peak = _mm256_setzero_ps();
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        peak = _mm256_max_ps(peak, abs_v);
        i += 8;
    }
    let mut arr = [0.0f32; 8];
    _mm256_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.iter().fold(0.0f32, |a, &b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn clamp_inplace_avx(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm256_set1_ps(min);
    let vmax = _mm256_set1_ps(max);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let r = _mm256_max_ps(vmin, _mm256_min_ps(vmax, v));
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn add_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_inplace(&mut a, &b);
        assert_eq!(a, [11.0, 22.0, 33.0, 44.0, 55.0]);
    }

    #[test]
    fn mul_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        mul_inplace(&mut a, 2.0);
        assert_eq!(a, [2.0, 4.0, 6.0, 8.0, 10.0]);
    }

    #[test]
    fn add_scaled_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [6.0, 12.0, 18.0, 24.0, 30.0]);
    }

    #[test]
    fn copy_scaled_inplace_basic() {
        let mut a = [0.0f32; 5];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        copy_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [5.0, 10.0, 15.0, 20.0, 25.0]);
    }

    #[test]
    fn add_sanitized_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        add_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 1.5);
        assert!(a[1].is_finite() && a[1] == 2.0);
        assert!(a[2].is_finite() && a[2] == 3.0);
        assert!(a[3].is_finite() && a[3] == 5.0);
    }

    #[test]
    fn copy_sanitized_inplace_basic() {
        let mut a = [0.0f32; 4];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        copy_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 0.5);
        assert!(a[1].is_finite() && a[1] == 0.0);
        assert!(a[2].is_finite() && a[2] == 0.0);
        assert!(a[3].is_finite() && a[3] == 1.0);
    }

    #[test]
    fn sanitize_finite_inplace_basic() {
        let mut a = [1.0f32, f32::NAN, f32::INFINITY, 4.0, f32::NEG_INFINITY];
        sanitize_finite_inplace(&mut a);
        assert!(a[0].is_finite() && a[0] == 1.0);
        assert_eq!(a[1], 0.0);
        assert_eq!(a[2], 0.0);
        assert!(a[3].is_finite() && a[3] == 4.0);
        assert_eq!(a[4], 0.0);
    }

    #[test]
    fn peak_abs_basic() {
        let a = [1.0f32, -3.0, 2.0, 0.5];
        assert_eq!(peak_abs(&a), 3.0);
    }

    #[test]
    fn clamp_inplace_basic() {
        let mut a = [-2.0f32, -0.5, 0.0, 0.5, 2.0];
        clamp_inplace(&mut a, -1.0, 1.0);
        assert_eq!(a, [-1.0, -0.5, 0.0, 0.5, 1.0]);
    }
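
    // Basic sanity checks for the sample-format converters and fades defined
    // later in this file; they exercise whichever SIMD path the host CPU selects.
    #[test]
    fn convert_i16_f32_round_trip() {
        let src = [-32768i16, -1, 0, 1, 12345, 32767];
        let mut as_f32 = [0.0f32; 6];
        convert_i16_to_f32(&src, &mut as_f32, 1.0 / 32768.0);
        let mut back = [0i16; 6];
        convert_f32_to_i16(&as_f32, &mut back, 32768.0);
        assert_eq!(src, back);
    }

    #[test]
    fn convert_i24_to_f32_sign_extends() {
        let src = [0x0000_0001i32, 0x007F_FFFF, 0x0080_0000, 0x00FF_FFFF];
        let mut dst = [0.0f32; 4];
        convert_i24_to_f32(&src, &mut dst, 1.0);
        assert_eq!(dst, [1.0, 8_388_607.0, -8_388_608.0, -1.0]);
    }

    #[test]
    fn fade_in_at_full_gain_is_identity() {
        // With t clamped to 1.0 the gain is sin(pi/2) = 1.0 for every sample.
        let mut a = [0.25f32, -0.5, 0.75, -1.0, 0.1];
        let expected = a;
        apply_fade_in_inplace(&mut a, 1.0, 0.0);
        for (got, want) in a.iter().zip(expected.iter()) {
            assert!((*got - *want).abs() < 1e-6);
        }
    }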
}

/// Convert i32 samples to f32 and scale by `gain`.
/// `dst` must be at least as long as `src`.
pub fn convert_i32_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_i32_to_f32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        // The SSE path uses SSE2 integer intrinsics, so detect "sse2" here.
        if is_x86_feature_detected!("sse2") {
            convert_i32_to_f32_sse(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i32_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i32_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_i32_to_f32_sse(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let f = _mm_cvtepi32_ps(s);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_i32_to_f32_avx(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let f = _mm256_cvtepi32_ps(s);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

/// Convert i16 samples to f32 and scale by `gain`.
/// `dst` must be at least as long as `src`.
pub fn convert_i16_to_f32(src: &[i16], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i16_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i16_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i16_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i16_to_f32_scalar(src: &[i16], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i16_to_f32_sse41(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128(bytes, 8));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), g);
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i16_to_f32_avx2(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        // Each 16-sample iteration needs two 128-bit loads (8 i16 each).
        let lo16 = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let hi16 = _mm_loadu_si128(src.as_ptr().add(i + 8) as *const __m128i);
        let low = _mm256_cvtepi16_epi32(lo16);
        let high = _mm256_cvtepi16_epi32(hi16);
        let low_f = _mm256_mul_ps(_mm256_cvtepi32_ps(low), g);
        let high_f = _mm256_mul_ps(_mm256_cvtepi32_ps(high), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i + 8), high_f);
        i += 16;
    }
    if i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128(bytes, 8));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), _mm_set1_ps(gain));
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

/// Convert i8 samples to f32 and scale by `gain`.
/// `dst` must be at least as long as `src`.
pub fn convert_i8_to_f32(src: &[i8], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i8_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i8_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i8_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i8_to_f32_scalar(src: &[i8], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i8_to_f32_sse41(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        // Unaligned 32-bit read of four i8 samples.
        let word = (src.as_ptr().add(i) as *const i32).read_unaligned();
        let bytes = _mm_cvtsi32_si128(word);
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i8_to_f32_avx2(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        // Unaligned 64-bit load of eight i8 samples into the low half of the vector.
        let bytes = _mm_loadl_epi64(src.as_ptr().add(i) as *const __m128i);
        let i32s = _mm256_cvtepi8_epi32(bytes);
        let f32s = _mm256_mul_ps(_mm256_cvtepi32_ps(i32s), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 8;
    }
    if i + 4 <= src.len() {
        let word = (src.as_ptr().add(i) as *const i32).read_unaligned();
        let bytes = _mm_cvtsi32_si128(word);
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

/// Convert i32 samples with lower 24 bits valid to f32 and scale by `gain`.
/// Sign-extends the lower 24 bits of each i32 before conversion.
/// `dst` must be at least as long as `src`.
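/// For example, an input word of `0x00FF_FFFF` sign-extends to `-1`, so with
/// `gain = 1.0` it converts to `-1.0`.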
pub fn convert_i24_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i24_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i24_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i24_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i24_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i24_to_f32_sse41(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let extended = _mm_srai_epi32(_mm_slli_epi32(s, 8), 8);
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i24_to_f32_avx2(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let extended = _mm256_srai_epi32(_mm256_slli_epi32(s, 8), 8);
        let f = _mm256_cvtepi32_ps(extended);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let extended = _mm_srai_epi32(_mm_slli_epi32(s, 8), 8);
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

/// Convert f32 samples to i32 and scale by `gain`, masking to lower 24 bits.
/// Uses truncation toward zero (matching Rust `as i32`).
/// `dst` must be at least as long as `src`.
pub fn convert_f32_to_i24(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i24_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i24_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i24_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i24_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i24_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mask = _mm_set1_epi32(0x00FF_FFFF);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let m = _mm_and_si128(v, mask);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i24_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x00FF_FFFF));
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        let m = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), mask));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, m);
        i += 8;
    }
    if i + 4 <= src.len() {
        let g_sse = _mm_set1_ps(gain);
        let mask_sse = _mm_set1_epi32(0x00FF_FFFF);
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g_sse));
        let m = _mm_and_si128(v, mask_sse);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

/// Convert f32 samples to i32 and scale by `gain`.
/// Uses truncation toward zero (matching Rust `as i32`).
/// `dst` must be at least as long as `src`.
pub fn convert_f32_to_i32(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i32_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i32_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i32_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i32_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, v);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

/// Convert f32 samples to i16 and scale by `gain`.
/// Uses truncation toward zero (matching Rust `as i16`).
/// `dst` must be at least as long as `src`.
pub fn convert_f32_to_i16(src: &[f32], dst: &mut [i16], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i16_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i16_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i16_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i16_scalar(src: &[f32], dst: &mut [i16], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i16_sse2(src: &[f32], dst: &mut [i16], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let packed = _mm_packs_epi32(v0, v1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i16_avx(src: &[f32], dst: &mut [i16], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        let vlo = _mm256_castsi256_si128(v);
        // vextractf128 (AVX) rather than vextracti128 (AVX2): the dispatcher
        // only checks for AVX before taking this path.
        let vhi = _mm256_extractf128_si256(v, 1);
        let packed = _mm_packs_epi32(vlo, vhi);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

/// Convert f32 samples to i8 and scale by `gain`.
/// Uses truncation toward zero (matching Rust `as i8`).
/// `dst` must be at least as long as `src`.
pub fn convert_f32_to_i8(src: &[f32], dst: &mut [i8], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i8_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i8_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i8_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i8_scalar(src: &[f32], dst: &mut [i8], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i8_sse2(src: &[f32], dst: &mut [i8], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let s2 = _mm_loadu_ps(src.as_ptr().add(i + 8));
        let s3 = _mm_loadu_ps(src.as_ptr().add(i + 12));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let v2 = _mm_cvttps_epi32(_mm_mul_ps(s2, g));
        let v3 = _mm_cvttps_epi32(_mm_mul_ps(s3, g));
        let p0 = _mm_packs_epi32(v0, v1);
        let p1 = _mm_packs_epi32(v2, v3);
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i8_avx(src: &[f32], dst: &mut [i8], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 16 <= src.len() {
        let s0 = _mm256_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm256_loadu_ps(src.as_ptr().add(i + 8));
        let v0 = _mm256_cvttps_epi32(_mm256_mul_ps(s0, g));
        let v1 = _mm256_cvttps_epi32(_mm256_mul_ps(s1, g));
        let p0 = _mm_packs_epi32(_mm256_castsi256_si128(v0), _mm256_extractf128_si256(v0, 1));
        let p1 = _mm_packs_epi32(_mm256_castsi256_si128(v1), _mm256_extractf128_si256(v1, 1));
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

/// Apply a sine-based fade-in gain ramp in place: `gain = sin(t * π/2)`.
/// `t` for sample `i` is `(start_t + i as f32 * dt).clamp(0.0, 1.0)`.
pub fn apply_fade_in_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            apply_fade_in_inplace_avx(buf, start_t, dt);
            return;
        }
        if is_x86_feature_detected!("sse") {
            apply_fade_in_inplace_sse(buf, start_t, dt);
            return;
        }
    }
    for (i, v) in buf.iter_mut().enumerate() {
        let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

/// Apply a cosine-based fade-out gain ramp in place: `gain = cos(t * π/2)`.
/// `t` for sample `i` is `(start_t + i as f32 * dt).clamp(0.0, 1.0)`.
pub fn apply_fade_out_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            apply_fade_out_inplace_avx(buf, start_t, dt);
            return;
        }
        if is_x86_feature_detected!("sse") {
            apply_fade_out_inplace_sse(buf, start_t, dt);
            return;
        }
    }
    for (i, v) in buf.iter_mut().enumerate() {
        let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_in_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 8 <= buf.len() {
        let mut gain = [0.0f32; 8];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).sin();
        }
        let s = _mm256_loadu_ps(buf.as_ptr().add(i));
        let g = _mm256_loadu_ps(gain.as_ptr());
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_in_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let mut gain = [0.0f32; 4];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).sin();
        }
        let s = _mm_loadu_ps(buf.as_ptr().add(i));
        let g = _mm_loadu_ps(gain.as_ptr());
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_out_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 8 <= buf.len() {
        let mut gain = [0.0f32; 8];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).cos();
        }
        let s = _mm256_loadu_ps(buf.as_ptr().add(i));
        let g = _mm256_loadu_ps(gain.as_ptr());
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_out_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let mut gain = [0.0f32; 4];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).cos();
        }
        let s = _mm_loadu_ps(buf.as_ptr().add(i));
        let g = _mm_loadu_ps(gain.as_ptr());
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}