// maolan_engine/simd.rs

//! SIMD helper routines for buffer math.
//!
//! Uses runtime CPU feature detection to dispatch:
//! - AVX / AVX2 / FMA (`_mm256_*`, 8 lanes) on x86_64/x86 when available
//! - SSE / SSE2 / SSE4.1 (`_mm_*`, 4 lanes) as fallback on x86_64/x86
//! - Scalar loops on all other architectures
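//!
//! A minimal usage sketch (the `maolan_engine::simd` path is an assumption;
//! adjust to wherever this module is mounted):
//!
//! ```ignore
//! use maolan_engine::simd;
//!
//! let mut out = vec![0.0f32; 256];
//! let voice = vec![0.25f32; 256];
//! simd::add_scaled_inplace(&mut out, &voice, 0.5); // out[i] += voice[i] * 0.5
//! assert!(simd::peak_abs(&out) <= 1.0);
//! ```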

#![allow(unsafe_op_in_unsafe_fn)]

// Re-export the intrinsics for whichever x86 flavour is being compiled;
// `std::arch::x86_64` does not exist on 32-bit x86 targets.
#[cfg(target_arch = "x86_64")]
mod x86 {
    pub use std::arch::x86_64::*;
}

#[cfg(target_arch = "x86")]
mod x86 {
    pub use std::arch::x86::*;
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use x86::*;

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
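//
// Every entry point re-runs feature detection; `is_x86_feature_detected!`
// caches its CPUID results, so the steady-state cost is a cached load and a
// well-predicted branch. The two-slice routines operate on the overlapping
// prefix of length `min(dst.len(), src.len())`.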

/// dst[i] += src[i]
pub fn add_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_inplace_sse(dst, src);
            return;
        }
    }
    add_inplace_scalar(dst, src);
}

/// dst[i] *= gain
pub fn mul_inplace(dst: &mut [f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            mul_inplace_avx(dst, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            mul_inplace_sse(dst, gain);
            return;
        }
    }
    mul_inplace_scalar(dst, gain);
}

/// dst[i] += src[i] * gain
pub fn add_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") && is_x86_feature_detected!("fma") {
            add_scaled_inplace_avx_fma(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("avx") {
            add_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    add_scaled_inplace_scalar(dst, src, gain);
}

/// dst[i] = src[i] * gain
pub fn copy_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    copy_scaled_inplace_scalar(dst, src, gain);
}

/// dst[i] += sanitize_finite(src[i])
pub fn add_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    add_sanitized_inplace_scalar(dst, src);
}

/// dst[i] = sanitize_finite(src[i])
pub fn copy_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    copy_sanitized_inplace_scalar(dst, src);
}

/// Replace NaN / ±Inf with 0.0 in place.
pub fn sanitize_finite_inplace(buf: &mut [f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            sanitize_finite_inplace_avx(buf);
            return;
        }
        if is_x86_feature_detected!("sse") {
            sanitize_finite_inplace_sse(buf);
            return;
        }
    }
    sanitize_finite_inplace_scalar(buf);
}

/// Horizontal max of abs(buf[i]).
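/// Assumes sanitized input: `maxps` returns its second operand when either
/// input is NaN, so a NaN lane can drop a previously accumulated maximum in
/// the SSE/AVX paths.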
pub fn peak_abs(buf: &[f32]) -> f32 {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            return peak_abs_avx(buf);
        }
        if is_x86_feature_detected!("sse") {
            return peak_abs_sse(buf);
        }
    }
    peak_abs_scalar(buf)
}

/// Clamp every element to [min, max].
pub fn clamp_inplace(buf: &mut [f32], min: f32, max: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            clamp_inplace_avx(buf, min, max);
            return;
        }
        if is_x86_feature_detected!("sse") {
            clamp_inplace_sse(buf, min, max);
            return;
        }
    }
    clamp_inplace_scalar(buf, min, max);
}

// ---------------------------------------------------------------------------
// Scalar fallbacks
// ---------------------------------------------------------------------------

fn add_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s;
    }
}

fn mul_inplace_scalar(dst: &mut [f32], gain: f32) {
    for d in dst.iter_mut() {
        *d *= gain;
    }
}

fn add_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s * gain;
    }
}

fn copy_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = *s * gain;
    }
}

fn add_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

fn copy_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

fn sanitize_finite_inplace_scalar(buf: &mut [f32]) {
    for s in buf.iter_mut() {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

fn peak_abs_scalar(buf: &[f32]) -> f32 {
    buf.iter().fold(0.0f32, |acc, s| acc.max(s.abs()))
}

fn clamp_inplace_scalar(buf: &mut [f32], min: f32, max: f32) {
    for s in buf.iter_mut() {
        *s = s.clamp(min, max);
    }
}

// ---------------------------------------------------------------------------
// SSE paths (`_mm_*` intrinsics)
// ---------------------------------------------------------------------------

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_add_ps(d, s);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn mul_inplace_sse(dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst.len() {
        let d = _mm_loadu_ps(dst.as_ptr().add(i));
        let r = _mm_mul_ps(d, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for d in &mut dst[i..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_add_ps(d, _mm_mul_ps(s, g));
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}

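// Finite-value masking trick used below: `andnot(-0.0, x)` clears the sign
// bit (abs), and `cmple(abs, f32::MAX)` is true only for finite lanes, since
// +Inf exceeds f32::MAX and NaN compares false. ANDing the source with that
// all-ones/all-zeros mask zeroes every non-finite lane.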
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm_andnot_ps(sign_mask, s);
        let finite_mask = _mm_cmple_ps(abs_s, finite_max);
        let sanitized = _mm_and_ps(s, finite_mask);
        let r = _mm_add_ps(d, sanitized);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm_andnot_ps(sign_mask, s);
        let finite_mask = _mm_cmple_ps(abs_s, finite_max);
        let r = _mm_and_ps(s, finite_mask);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn sanitize_finite_inplace_sse(buf: &mut [f32]) {
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm_andnot_ps(sign_mask, v);
        let finite_mask = _mm_cmple_ps(abs_v, finite_max);
        let r = _mm_and_ps(v, finite_mask);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn peak_abs_sse(buf: &[f32]) -> f32 {
    let sign_mask = _mm_set1_ps(-0.0);
    let mut peak = _mm_setzero_ps();
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm_andnot_ps(sign_mask, v);
        peak = _mm_max_ps(peak, abs_v);
        i += 4;
    }
    let mut arr = [0.0f32; 4];
    _mm_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.into_iter().fold(0.0f32, |a, b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn clamp_inplace_sse(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm_set1_ps(min);
    let vmax = _mm_set1_ps(max);
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let r = _mm_min_ps(_mm_max_ps(v, vmin), vmax);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}

// ---------------------------------------------------------------------------
// AVX paths (`_mm256_*` intrinsics)
// ---------------------------------------------------------------------------

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, s);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn mul_inplace_avx(dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst.len() {
        let d = _mm256_loadu_ps(dst.as_ptr().add(i));
        let r = _mm256_mul_ps(d, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for d in &mut dst[i..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, _mm256_mul_ps(s, g));
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

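// Fused multiply-add: `_mm256_fmadd_ps` computes `s * g + d` with a single
// rounding step, so this path can differ from the mul-then-add AVX path by
// up to 1 ulp per element.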
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx,fma")]
unsafe fn add_scaled_inplace_avx_fma(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_fmadd_ps(s, g, d);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps::<_CMP_LE_OQ>(abs_s, max_val);
        let sanitized = _mm256_blendv_ps(zero, s, mask);
        let r = _mm256_add_ps(d, sanitized);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps::<_CMP_LE_OQ>(abs_s, max_val);
        let r = _mm256_blendv_ps(zero, s, mask);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn sanitize_finite_inplace_avx(buf: &mut [f32]) {
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        let mask = _mm256_cmp_ps::<_CMP_LE_OQ>(abs_v, max_val);
        let r = _mm256_blendv_ps(zero, v, mask);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn peak_abs_avx(buf: &[f32]) -> f32 {
    let mut peak = _mm256_setzero_ps();
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        peak = _mm256_max_ps(peak, abs_v);
        i += 8;
    }
    let mut arr = [0.0f32; 8];
    _mm256_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.iter().fold(0.0f32, |a, &b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn clamp_inplace_avx(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm256_set1_ps(min);
    let vmax = _mm256_set1_ps(max);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let r = _mm256_max_ps(vmin, _mm256_min_ps(vmax, v));
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn add_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_inplace(&mut a, &b);
        assert_eq!(a, [11.0, 22.0, 33.0, 44.0, 55.0]);
    }

    #[test]
    fn mul_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        mul_inplace(&mut a, 2.0);
        assert_eq!(a, [2.0, 4.0, 6.0, 8.0, 10.0]);
    }

    #[test]
    fn add_scaled_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [6.0, 12.0, 18.0, 24.0, 30.0]);
    }

    #[test]
    fn copy_scaled_inplace_basic() {
        let mut a = [0.0f32; 5];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        copy_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [5.0, 10.0, 15.0, 20.0, 25.0]);
    }

    #[test]
    fn add_sanitized_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        add_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 1.5);
        assert!(a[1].is_finite() && a[1] == 2.0);
        assert!(a[2].is_finite() && a[2] == 3.0);
        assert!(a[3].is_finite() && a[3] == 5.0);
    }

    #[test]
    fn copy_sanitized_inplace_basic() {
        let mut a = [0.0f32; 4];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        copy_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 0.5);
        assert!(a[1].is_finite() && a[1] == 0.0);
        assert!(a[2].is_finite() && a[2] == 0.0);
        assert!(a[3].is_finite() && a[3] == 1.0);
    }

    #[test]
    fn sanitize_finite_inplace_basic() {
        let mut a = [1.0f32, f32::NAN, f32::INFINITY, 4.0, f32::NEG_INFINITY];
        sanitize_finite_inplace(&mut a);
        assert!(a[0].is_finite() && a[0] == 1.0);
        assert_eq!(a[1], 0.0);
        assert_eq!(a[2], 0.0);
        assert!(a[3].is_finite() && a[3] == 4.0);
        assert_eq!(a[4], 0.0);
    }

    #[test]
    fn peak_abs_basic() {
        let a = [1.0f32, -3.0, 2.0, 0.5];
        assert_eq!(peak_abs(&a), 3.0);
    }

    #[test]
    fn clamp_inplace_basic() {
        let mut a = [-2.0f32, -0.5, 0.0, 0.5, 2.0];
        clamp_inplace(&mut a, -1.0, 1.0);
        assert_eq!(a, [-1.0, -0.5, 0.0, 0.5, 1.0]);
    }
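
    // Exercises both the vector body and the scalar tail by using a length
    // that is not a multiple of any SIMD width (a minimal sketch; the values
    // are arbitrary).
    #[test]
    fn add_inplace_tail_handling() {
        let mut a = vec![1.0f32; 19];
        let b = vec![2.0f32; 19];
        add_inplace(&mut a, &b);
        assert!(a.iter().all(|&x| x == 3.0));
    }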
}

// ---------------------------------------------------------------------------
// Integer -> f32 conversion
// ---------------------------------------------------------------------------
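//
// Typical normalization gains (a convention assumed by callers, not enforced
// here): 1.0 / 32_768.0 for i16, 1.0 / 8_388_608.0 for 24-bit, and
// 1.0 / 2_147_483_648.0 for i32, mapping full scale to roughly [-1.0, 1.0].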

/// Convert i32 samples to f32 and scale by `gain`.
/// Processes the overlapping prefix of length `min(src.len(), dst.len())`.
pub fn convert_i32_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_i32_to_f32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_i32_to_f32_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i32_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i32_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

// The integer loads/conversions need SSE2 (the `_mm_*_si128` family), so this
// path detects and enables "sse2" rather than bare "sse".
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_i32_to_f32_sse2(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let f = _mm_cvtepi32_ps(s);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_i32_to_f32_avx(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let f = _mm256_cvtepi32_ps(s);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

/// Convert i16 samples to f32 and scale by `gain`.
/// Processes the overlapping prefix of length `min(src.len(), dst.len())`.
pub fn convert_i16_to_f32(src: &[i16], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i16_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i16_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i16_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i16_to_f32_scalar(src: &[i16], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i16_to_f32_sse41(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(bytes));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), g);
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i16_to_f32_avx2(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        // Each 16-lane block needs two 128-bit loads: `_mm256_cvtepi16_epi32`
        // widens eight i16 at a time.
        let bytes0 = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let bytes1 = _mm_loadu_si128(src.as_ptr().add(i + 8) as *const __m128i);
        let low = _mm256_cvtepi16_epi32(bytes0);
        let high = _mm256_cvtepi16_epi32(bytes1);
        let low_f = _mm256_mul_ps(_mm256_cvtepi32_ps(low), g);
        let high_f = _mm256_mul_ps(_mm256_cvtepi32_ps(high), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i + 8), high_f);
        i += 16;
    }
    if i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(bytes));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), _mm_set1_ps(gain));
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

/// Convert i8 samples to f32 and scale by `gain`.
/// Processes the overlapping prefix of length `min(src.len(), dst.len())`.
pub fn convert_i8_to_f32(src: &[i8], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i8_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i8_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i8_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i8_to_f32_scalar(src: &[i8], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i8_to_f32_sse41(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        // Unaligned 4-byte load; `src` is only 1-byte aligned.
        let word = (src.as_ptr().add(i) as *const i32).read_unaligned();
        let bytes = _mm_cvtsi32_si128(word);
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i8_to_f32_avx2(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        // Unaligned 8-byte load into the low half of an xmm register.
        let bytes = _mm_loadl_epi64(src.as_ptr().add(i) as *const __m128i);
        let i32s = _mm256_cvtepi8_epi32(bytes);
        let f32s = _mm256_mul_ps(_mm256_cvtepi32_ps(i32s), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 8;
    }
    if i + 4 <= src.len() {
        let word = (src.as_ptr().add(i) as *const i32).read_unaligned();
        let bytes = _mm_cvtsi32_si128(word);
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

/// Convert i32 samples with the lower 24 bits valid to f32 and scale by `gain`.
/// Sign-extends the lower 24 bits of each i32 before conversion.
/// Processes the overlapping prefix of length `min(src.len(), dst.len())`.
pub fn convert_i24_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i24_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_i24_to_f32_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i24_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i24_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

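// The SIMD paths below sign-extend with `(v << 8) >> 8` (logical shift left,
// then arithmetic shift right), which is equivalent to the mask-and-or in the
// scalar fallback above.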
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_i24_to_f32_sse2(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let extended = _mm_srai_epi32::<8>(_mm_slli_epi32::<8>(s));
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i24_to_f32_avx2(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let extended = _mm256_srai_epi32::<8>(_mm256_slli_epi32::<8>(s));
        let f = _mm256_cvtepi32_ps(extended);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let extended = _mm_srai_epi32::<8>(_mm_slli_epi32::<8>(s));
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

/// Convert f32 samples to i32 (scaled by `gain`), masking to the lower 24 bits.
/// Uses truncation toward zero (matching Rust `as i32`).
/// Processes the overlapping prefix of length `min(src.len(), dst.len())`.
pub fn convert_f32_to_i24(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i24_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i24_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i24_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i24_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i24_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mask = _mm_set1_epi32(0x00FF_FFFF);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let m = _mm_and_si128(v, mask);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i24_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x00FF_FFFF));
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        let m = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), mask));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, m);
        i += 8;
    }
    if i + 4 <= src.len() {
        let g_sse = _mm_set1_ps(gain);
        let mask_sse = _mm_set1_epi32(0x00FF_FFFF);
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g_sse));
        let m = _mm_and_si128(v, mask_sse);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

/// Convert f32 samples to i32 and scale by `gain`.
/// Uses truncation toward zero (matching Rust `as i32`).
/// Processes the overlapping prefix of length `min(src.len(), dst.len())`.
pub fn convert_f32_to_i32(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i32_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i32_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i32_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i32_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, v);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

/// Convert f32 samples to i16 and scale by `gain`.
/// Uses truncation toward zero (matching Rust `as i16`).
/// Processes the overlapping prefix of length `min(src.len(), dst.len())`.
pub fn convert_f32_to_i16(src: &[f32], dst: &mut [i16], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i16_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i16_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i16_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i16_scalar(src: &[f32], dst: &mut [i16], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i16_sse2(src: &[f32], dst: &mut [i16], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let packed = _mm_packs_epi32(v0, v1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i16_avx(src: &[f32], dst: &mut [i16], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        let vlo = _mm256_castsi256_si128(v);
        // `_mm256_extractf128_si256` is the AVX form; `_mm256_extracti128_si256`
        // would require AVX2, which this path does not detect.
        let vhi = _mm256_extractf128_si256::<1>(v);
        let packed = _mm_packs_epi32(vlo, vhi);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

/// Convert f32 samples to i8 and scale by `gain`.
/// Uses truncation toward zero (matching Rust `as i8`).
/// Processes the overlapping prefix of length `min(src.len(), dst.len())`.
pub fn convert_f32_to_i8(src: &[f32], dst: &mut [i8], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i8_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i8_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i8_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i8_scalar(src: &[f32], dst: &mut [i8], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i8_sse2(src: &[f32], dst: &mut [i8], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let s2 = _mm_loadu_ps(src.as_ptr().add(i + 8));
        let s3 = _mm_loadu_ps(src.as_ptr().add(i + 12));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let v2 = _mm_cvttps_epi32(_mm_mul_ps(s2, g));
        let v3 = _mm_cvttps_epi32(_mm_mul_ps(s3, g));
        let p0 = _mm_packs_epi32(v0, v1);
        let p1 = _mm_packs_epi32(v2, v3);
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i8_avx(src: &[f32], dst: &mut [i8], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 16 <= src.len() {
        let s0 = _mm256_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm256_loadu_ps(src.as_ptr().add(i + 8));
        let v0 = _mm256_cvttps_epi32(_mm256_mul_ps(s0, g));
        let v1 = _mm256_cvttps_epi32(_mm256_mul_ps(s1, g));
        let p0 = _mm_packs_epi32(_mm256_castsi256_si128(v0), _mm256_extractf128_si256::<1>(v0));
        let p1 = _mm_packs_epi32(_mm256_castsi256_si128(v1), _mm256_extractf128_si256::<1>(v1));
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

// ---------------------------------------------------------------------------
// Fade ramps (sin/cos based)
// ---------------------------------------------------------------------------
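//
// sin(t·π/2) and cos(t·π/2) form an equal-power pair: their squared gains sum
// to 1 for every t, so a simultaneous fade-out/fade-in crossfade keeps the
// combined signal power constant.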

/// Apply a sine-based fade-in gain ramp in place: `gain = sin(t * π/2)`.
/// `t` for sample `i` is `(start_t + i as f32 * dt).clamp(0.0, 1.0)`.
pub fn apply_fade_in_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            apply_fade_in_inplace_avx(buf, start_t, dt);
            return;
        }
        if is_x86_feature_detected!("sse") {
            apply_fade_in_inplace_sse(buf, start_t, dt);
            return;
        }
    }
    for (i, v) in buf.iter_mut().enumerate() {
        let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

/// Apply a cosine-based fade-out gain ramp in place: `gain = cos(t * π/2)`.
/// `t` for sample `i` is `(start_t + i as f32 * dt).clamp(0.0, 1.0)`.
pub fn apply_fade_out_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            apply_fade_out_inplace_avx(buf, start_t, dt);
            return;
        }
        if is_x86_feature_detected!("sse") {
            apply_fade_out_inplace_sse(buf, start_t, dt);
            return;
        }
    }
    for (i, v) in buf.iter_mut().enumerate() {
        let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_in_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 8 <= buf.len() {
        let mut gain = [0.0f32; 8];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).sin();
        }
        let s = _mm256_loadu_ps(buf.as_ptr().add(i));
        let g = _mm256_loadu_ps(gain.as_ptr());
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_in_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let mut gain = [0.0f32; 4];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).sin();
        }
        let s = _mm_loadu_ps(buf.as_ptr().add(i));
        let g = _mm_loadu_ps(gain.as_ptr());
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_out_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 8 <= buf.len() {
        let mut gain = [0.0f32; 8];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).cos();
        }
        let s = _mm256_loadu_ps(buf.as_ptr().add(i));
        let g = _mm256_loadu_ps(gain.as_ptr());
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_out_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let mut gain = [0.0f32; 4];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).cos();
        }
        let s = _mm_loadu_ps(buf.as_ptr().add(i));
        let g = _mm_loadu_ps(gain.as_ptr());
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}
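
#[cfg(test)]
mod conversion_tests {
    use super::*;

    // A minimal round-trip sketch: every value below is exactly representable
    // in f32, so strict equality is safe.
    #[test]
    fn i16_round_trip() {
        let src = [0i16, 1, -1, 100, -100, i16::MAX, i16::MIN + 1];
        let mut mid = [0.0f32; 7];
        let mut out = [0i16; 7];
        convert_i16_to_f32(&src, &mut mid, 1.0);
        convert_f32_to_i16(&mid, &mut out, 1.0);
        assert_eq!(src, out);
    }

    // 0x0080_0000 has bit 23 (the 24-bit sign bit) set, so it decodes to the
    // most negative 24-bit value; 0x00FF_FFFF decodes to -1.
    #[test]
    fn i24_sign_extension() {
        let src = [0x0000_0001i32, 0x00FF_FFFF, 0x0080_0000];
        let mut out = [0.0f32; 3];
        convert_i24_to_f32(&src, &mut out, 1.0);
        assert_eq!(out, [1.0, -1.0, -8_388_608.0]);
    }
}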