1#![allow(unsafe_op_in_unsafe_fn)]
9
10#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
mod x86 {
    #[cfg(target_arch = "x86")]
    pub use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    pub use std::arch::x86_64::*;
    pub use wide::f32x4;
}
15
16#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
17use x86::*;
18
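/// Adds `src` to `dst` element-wise (`dst[i] += src[i]`) over the shorter of
/// the two slices, picking an AVX, SSE, or scalar path at runtime.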
19pub fn add_inplace(dst: &mut [f32], src: &[f32]) {
25 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
26 unsafe {
27 if is_x86_feature_detected!("avx") {
28 add_inplace_avx(dst, src);
29 return;
30 }
31 if is_x86_feature_detected!("sse") {
32 add_inplace_sse(dst, src);
33 return;
34 }
35 }
36 add_inplace_scalar(dst, src);
37}
38
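/// Multiplies every sample in `dst` by `gain` in place.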
39pub fn mul_inplace(dst: &mut [f32], gain: f32) {
41 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
42 unsafe {
43 if is_x86_feature_detected!("avx") {
44 mul_inplace_avx(dst, gain);
45 return;
46 }
47 if is_x86_feature_detected!("sse") {
48 mul_inplace_sse(dst, gain);
49 return;
50 }
51 }
52 mul_inplace_scalar(dst, gain);
53}
54
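/// Mix-with-gain: `dst[i] += src[i] * gain` over the shorter of the two slices.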
55pub fn add_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
57 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
58 unsafe {
59 if is_x86_feature_detected!("avx") {
60 add_scaled_inplace_avx(dst, src, gain);
61 return;
62 }
63 if is_x86_feature_detected!("sse") {
64 add_scaled_inplace_sse(dst, src, gain);
65 return;
66 }
67 }
68 add_scaled_inplace_scalar(dst, src, gain);
69}
70
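/// Copy-with-gain: `dst[i] = src[i] * gain` over the shorter of the two slices.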
71pub fn copy_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
73 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
74 unsafe {
75 if is_x86_feature_detected!("avx") {
76 copy_scaled_inplace_avx(dst, src, gain);
77 return;
78 }
79 if is_x86_feature_detected!("sse") {
80 copy_scaled_inplace_sse(dst, src, gain);
81 return;
82 }
83 }
84 copy_scaled_inplace_scalar(dst, src, gain);
85}
86
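/// Like `add_inplace`, but non-finite source samples (NaN, ±inf) contribute 0.0.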
87pub fn add_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
89 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
90 unsafe {
91 if is_x86_feature_detected!("avx") {
92 add_sanitized_inplace_avx(dst, src);
93 return;
94 }
95 if is_x86_feature_detected!("sse") {
96 add_sanitized_inplace_sse(dst, src);
97 return;
98 }
99 }
100 add_sanitized_inplace_scalar(dst, src);
101}
102
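/// Copies `src` into `dst`, replacing non-finite source samples (NaN, ±inf) with 0.0.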
103pub fn copy_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
105 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
106 unsafe {
107 if is_x86_feature_detected!("avx") {
108 copy_sanitized_inplace_avx(dst, src);
109 return;
110 }
111 if is_x86_feature_detected!("sse") {
112 copy_sanitized_inplace_sse(dst, src);
113 return;
114 }
115 }
116 copy_sanitized_inplace_scalar(dst, src);
117}
118
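/// Replaces every non-finite sample in `buf` with 0.0.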
119pub fn sanitize_finite_inplace(buf: &mut [f32]) {
121 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
122 unsafe {
123 if is_x86_feature_detected!("avx") {
124 sanitize_finite_inplace_avx(buf);
125 return;
126 }
127 if is_x86_feature_detected!("sse") {
128 sanitize_finite_inplace_sse(buf);
129 return;
130 }
131 }
132 sanitize_finite_inplace_scalar(buf);
133}
134
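/// Returns the largest absolute sample value in `buf` (0.0 for an empty slice).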
135pub fn peak_abs(buf: &[f32]) -> f32 {
137 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
138 unsafe {
139 if is_x86_feature_detected!("avx") {
140 return peak_abs_avx(buf);
141 }
142 if is_x86_feature_detected!("sse") {
143 return peak_abs_sse(buf);
144 }
145 }
146 peak_abs_scalar(buf)
147}
148
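/// Clamps every sample in `buf` to the `[min, max]` range in place.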
149pub fn clamp_inplace(buf: &mut [f32], min: f32, max: f32) {
151 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
152 unsafe {
153 if is_x86_feature_detected!("avx") {
154 clamp_inplace_avx(buf, min, max);
155 return;
156 }
157 if is_x86_feature_detected!("sse") {
158 clamp_inplace_sse(buf, min, max);
159 return;
160 }
161 }
162 clamp_inplace_scalar(buf, min, max);
163}
164
165fn add_inplace_scalar(dst: &mut [f32], src: &[f32]) {
170 for (d, s) in dst.iter_mut().zip(src.iter()) {
171 *d += *s;
172 }
173}
174
175fn mul_inplace_scalar(dst: &mut [f32], gain: f32) {
176 for d in dst.iter_mut() {
177 *d *= gain;
178 }
179}
180
181fn add_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
182 for (d, s) in dst.iter_mut().zip(src.iter()) {
183 *d += *s * gain;
184 }
185}
186
187fn copy_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
188 for (d, s) in dst.iter_mut().zip(src.iter()) {
189 *d = *s * gain;
190 }
191}
192
193fn add_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
194 for (d, s) in dst.iter_mut().zip(src.iter()) {
195 *d += if s.is_finite() { *s } else { 0.0 };
196 }
197}
198
199fn copy_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
200 for (d, s) in dst.iter_mut().zip(src.iter()) {
201 *d = if s.is_finite() { *s } else { 0.0 };
202 }
203}
204
205fn sanitize_finite_inplace_scalar(buf: &mut [f32]) {
206 for s in buf.iter_mut() {
207 if !s.is_finite() {
208 *s = 0.0;
209 }
210 }
211}
212
213fn peak_abs_scalar(buf: &[f32]) -> f32 {
214 buf.iter().fold(0.0f32, |acc, s| acc.max(s.abs()))
215}
216
217fn clamp_inplace_scalar(buf: &mut [f32], min: f32, max: f32) {
218 for s in buf.iter_mut() {
219 *s = s.clamp(min, max);
220 }
221}
222
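// The `_sse` variants below use the portable `wide::f32x4` type to process four
// samples per iteration; they do not call raw SSE intrinsics directly.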
223#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
228unsafe fn add_inplace_sse(dst: &mut [f32], src: &[f32]) {
229 let len = dst.len().min(src.len());
230 let dst_head = &mut dst[..len];
231 let src_head = &src[..len];
232 let n = dst_head.len() / 4;
233 for i in 0..n {
234 let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
235 let s_chunk = &src_head[i * 4..(i + 1) * 4];
236 let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
237 let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
238 let r = d + s;
239 d_chunk.copy_from_slice(&r.to_array());
240 }
241 for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
242 *d += *s;
243 }
244}
245
246#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
247unsafe fn mul_inplace_sse(dst: &mut [f32], gain: f32) {
248 let n = dst.len() / 4;
249 let g: f32x4 = [gain; 4].into();
250 for i in 0..n {
251 let d_chunk = &mut dst[i * 4..(i + 1) * 4];
252 let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
253 let r = d * g;
254 d_chunk.copy_from_slice(&r.to_array());
255 }
256 for d in &mut dst[n * 4..] {
257 *d *= gain;
258 }
259}
260
261#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
262unsafe fn add_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
263 let len = dst.len().min(src.len());
264 let dst_head = &mut dst[..len];
265 let src_head = &src[..len];
266 let n = dst_head.len() / 4;
267 let g: f32x4 = [gain; 4].into();
268 for i in 0..n {
269 let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
270 let s_chunk = &src_head[i * 4..(i + 1) * 4];
271 let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
272 let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
273 let r = d + s * g;
274 d_chunk.copy_from_slice(&r.to_array());
275 }
276 for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
277 *d += *s * gain;
278 }
279}
280
281#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
282unsafe fn copy_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
283 let len = dst.len().min(src.len());
284 let dst_head = &mut dst[..len];
285 let src_head = &src[..len];
286 let n = dst_head.len() / 4;
287 let g: f32x4 = [gain; 4].into();
288 for i in 0..n {
289 let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
290 let s_chunk = &src_head[i * 4..(i + 1) * 4];
291 let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
292 let r = s * g;
293 d_chunk.copy_from_slice(&r.to_array());
294 }
295 for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
296 *d = *s * gain;
297 }
298}
299
300#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
301unsafe fn add_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
302 let len = dst.len().min(src.len());
303 let dst_head = &mut dst[..len];
304 let src_head = &src[..len];
305 let n = dst_head.len() / 4;
306 let zero: f32x4 = [0.0; 4].into();
307 for i in 0..n {
308 let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
309 let s_chunk = &src_head[i * 4..(i + 1) * 4];
310 let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
311 let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
312 let mask = s.is_finite();
313 let sanitized = mask.blend(s, zero);
314 let r = d + sanitized;
315 d_chunk.copy_from_slice(&r.to_array());
316 }
317 for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
318 *d += if s.is_finite() { *s } else { 0.0 };
319 }
320}
321
322#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
323unsafe fn copy_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
324 let len = dst.len().min(src.len());
325 let dst_head = &mut dst[..len];
326 let src_head = &src[..len];
327 let n = dst_head.len() / 4;
328 let zero: f32x4 = [0.0; 4].into();
329 for i in 0..n {
330 let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
331 let s_chunk = &src_head[i * 4..(i + 1) * 4];
332 let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
333 let mask = s.is_finite();
334 let r = mask.blend(s, zero);
335 d_chunk.copy_from_slice(&r.to_array());
336 }
337 for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
338 *d = if s.is_finite() { *s } else { 0.0 };
339 }
340}
341
342#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
343unsafe fn sanitize_finite_inplace_sse(buf: &mut [f32]) {
344 let n = buf.len() / 4;
345 let zero: f32x4 = [0.0; 4].into();
346 for i in 0..n {
347 let chunk = &mut buf[i * 4..(i + 1) * 4];
348 let v: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
349 let mask = v.is_finite();
350 let r = mask.blend(v, zero);
351 chunk.copy_from_slice(&r.to_array());
352 }
353 for s in &mut buf[n * 4..] {
354 if !s.is_finite() {
355 *s = 0.0;
356 }
357 }
358}
359
360#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
361unsafe fn peak_abs_sse(buf: &[f32]) -> f32 {
362 let n = buf.len() / 4;
363 let mut peak: f32x4 = [0.0; 4].into();
364 for i in 0..n {
365 let chunk = &buf[i * 4..(i + 1) * 4];
366 let v: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
367 peak = peak.max(v.abs());
368 }
369 let mut max_scalar = peak.to_array().into_iter().fold(0.0f32, |a, b| a.max(b));
370 for s in &buf[n * 4..] {
371 max_scalar = max_scalar.max(s.abs());
372 }
373 max_scalar
374}
375
376#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
377unsafe fn clamp_inplace_sse(buf: &mut [f32], min: f32, max: f32) {
378 let n = buf.len() / 4;
379 let vmin: f32x4 = [min; 4].into();
380 let vmax: f32x4 = [max; 4].into();
381 for i in 0..n {
382 let chunk = &mut buf[i * 4..(i + 1) * 4];
383 let v: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
384 let r = v.clamp(vmin, vmax);
385 chunk.copy_from_slice(&r.to_array());
386 }
387 for s in &mut buf[n * 4..] {
388 *s = s.clamp(min, max);
389 }
390}
391
392#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
397#[target_feature(enable = "avx")]
398unsafe fn add_inplace_avx(dst: &mut [f32], src: &[f32]) {
399 let len = dst.len().min(src.len());
400 let dst_head = &mut dst[..len];
401 let src_head = &src[..len];
402 let mut i = 0;
403 while i + 8 <= dst_head.len() {
404 let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
405 let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
406 let r = _mm256_add_ps(d, s);
407 _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
408 i += 8;
409 }
410 for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
411 *d += *s;
412 }
413}
414
415#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
416#[target_feature(enable = "avx")]
417unsafe fn mul_inplace_avx(dst: &mut [f32], gain: f32) {
418 let g = _mm256_set1_ps(gain);
419 let mut i = 0;
420 while i + 8 <= dst.len() {
421 let d = _mm256_loadu_ps(dst.as_ptr().add(i));
422 let r = _mm256_mul_ps(d, g);
423 _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
424 i += 8;
425 }
426 for d in &mut dst[i..] {
427 *d *= gain;
428 }
429}
430
431#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
432#[target_feature(enable = "avx")]
433unsafe fn add_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
434 let len = dst.len().min(src.len());
435 let dst_head = &mut dst[..len];
436 let src_head = &src[..len];
437 let g = _mm256_set1_ps(gain);
438 let mut i = 0;
439 while i + 8 <= dst_head.len() {
440 let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
441 let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
442 let r = _mm256_add_ps(d, _mm256_mul_ps(s, g));
443 _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
444 i += 8;
445 }
446 for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
447 *d += *s * gain;
448 }
449}
450
451#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
452#[target_feature(enable = "avx")]
453unsafe fn copy_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
454 let len = dst.len().min(src.len());
455 let dst_head = &mut dst[..len];
456 let src_head = &src[..len];
457 let g = _mm256_set1_ps(gain);
458 let mut i = 0;
459 while i + 8 <= dst_head.len() {
460 let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
461 let r = _mm256_mul_ps(s, g);
462 _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
463 i += 8;
464 }
465 for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
466 *d = *s * gain;
467 }
468}
469
470#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
471#[target_feature(enable = "avx")]
472unsafe fn add_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
473 let len = dst.len().min(src.len());
474 let dst_head = &mut dst[..len];
475 let src_head = &src[..len];
476 let zero = _mm256_setzero_ps();
477 let max_val = _mm256_set1_ps(f32::MAX);
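    // andnot with -0.0 clears the sign bit, giving |s|; |s| <= f32::MAX holds
    // only for finite values (NaN fails the ordered compare, ±inf exceeds MAX).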
478 let mut i = 0;
479 while i + 8 <= dst_head.len() {
480 let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
481 let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
482 let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps::<_CMP_LE_OQ>(abs_s, max_val);
484 let sanitized = _mm256_blendv_ps(zero, s, mask);
485 let r = _mm256_add_ps(d, sanitized);
486 _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
487 i += 8;
488 }
489 for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
490 *d += if s.is_finite() { *s } else { 0.0 };
491 }
492}
493
494#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
495#[target_feature(enable = "avx")]
496unsafe fn copy_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
497 let len = dst.len().min(src.len());
498 let dst_head = &mut dst[..len];
499 let src_head = &src[..len];
500 let zero = _mm256_setzero_ps();
501 let max_val = _mm256_set1_ps(f32::MAX);
502 let mut i = 0;
503 while i + 8 <= dst_head.len() {
504 let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
505 let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps::<_CMP_LE_OQ>(abs_s, max_val);
507 let r = _mm256_blendv_ps(zero, s, mask);
508 _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
509 i += 8;
510 }
511 for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
512 *d = if s.is_finite() { *s } else { 0.0 };
513 }
514}
515
516#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
517#[target_feature(enable = "avx")]
518unsafe fn sanitize_finite_inplace_avx(buf: &mut [f32]) {
519 let zero = _mm256_setzero_ps();
520 let max_val = _mm256_set1_ps(f32::MAX);
521 let mut i = 0;
522 while i + 8 <= buf.len() {
523 let v = _mm256_loadu_ps(buf.as_ptr().add(i));
524 let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        let mask = _mm256_cmp_ps::<_CMP_LE_OQ>(abs_v, max_val);
526 let r = _mm256_blendv_ps(zero, v, mask);
527 _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
528 i += 8;
529 }
530 for s in &mut buf[i..] {
531 if !s.is_finite() {
532 *s = 0.0;
533 }
534 }
535}
536
537#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
538#[target_feature(enable = "avx")]
539unsafe fn peak_abs_avx(buf: &[f32]) -> f32 {
540 let mut peak = _mm256_setzero_ps();
541 let mut i = 0;
542 while i + 8 <= buf.len() {
543 let v = _mm256_loadu_ps(buf.as_ptr().add(i));
544 let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
545 peak = _mm256_max_ps(peak, abs_v);
546 i += 8;
547 }
548 let mut arr = [0.0f32; 8];
549 _mm256_storeu_ps(arr.as_mut_ptr(), peak);
550 let mut max_scalar = arr.iter().fold(0.0f32, |a, &b| a.max(b));
551 for s in &buf[i..] {
552 max_scalar = max_scalar.max(s.abs());
553 }
554 max_scalar
555}
556
557#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
558#[target_feature(enable = "avx")]
559unsafe fn clamp_inplace_avx(buf: &mut [f32], min: f32, max: f32) {
560 let vmin = _mm256_set1_ps(min);
561 let vmax = _mm256_set1_ps(max);
562 let mut i = 0;
563 while i + 8 <= buf.len() {
564 let v = _mm256_loadu_ps(buf.as_ptr().add(i));
565 let r = _mm256_max_ps(vmin, _mm256_min_ps(vmax, v));
566 _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
567 i += 8;
568 }
569 for s in &mut buf[i..] {
570 *s = s.clamp(min, max);
571 }
572}
573
574#[cfg(test)]
575mod tests {
576 use super::*;
577
578 #[test]
579 fn add_inplace_basic() {
580 let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
581 let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
582 add_inplace(&mut a, &b);
583 assert_eq!(a, [11.0, 22.0, 33.0, 44.0, 55.0]);
584 }
585
586 #[test]
587 fn mul_inplace_basic() {
588 let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
589 mul_inplace(&mut a, 2.0);
590 assert_eq!(a, [2.0, 4.0, 6.0, 8.0, 10.0]);
591 }
592
593 #[test]
594 fn add_scaled_inplace_basic() {
595 let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
596 let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
597 add_scaled_inplace(&mut a, &b, 0.5);
598 assert_eq!(a, [6.0, 12.0, 18.0, 24.0, 30.0]);
599 }
600
601 #[test]
602 fn copy_scaled_inplace_basic() {
603 let mut a = [0.0f32; 5];
604 let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
605 copy_scaled_inplace(&mut a, &b, 0.5);
606 assert_eq!(a, [5.0, 10.0, 15.0, 20.0, 25.0]);
607 }
608
609 #[test]
610 fn add_sanitized_inplace_basic() {
611 let mut a = [1.0f32, 2.0, 3.0, 4.0];
612 let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
613 add_sanitized_inplace(&mut a, &b);
614 assert!(a[0].is_finite() && a[0] == 1.5);
615 assert!(a[1].is_finite() && a[1] == 2.0);
616 assert!(a[2].is_finite() && a[2] == 3.0);
617 assert!(a[3].is_finite() && a[3] == 5.0);
618 }
619
620 #[test]
621 fn copy_sanitized_inplace_basic() {
622 let mut a = [0.0f32; 4];
623 let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
624 copy_sanitized_inplace(&mut a, &b);
625 assert!(a[0].is_finite() && a[0] == 0.5);
626 assert!(a[1].is_finite() && a[1] == 0.0);
627 assert!(a[2].is_finite() && a[2] == 0.0);
628 assert!(a[3].is_finite() && a[3] == 1.0);
629 }
630
631 #[test]
632 fn sanitize_finite_inplace_basic() {
633 let mut a = [1.0f32, f32::NAN, f32::INFINITY, 4.0, f32::NEG_INFINITY];
634 sanitize_finite_inplace(&mut a);
635 assert!(a[0].is_finite() && a[0] == 1.0);
636 assert_eq!(a[1], 0.0);
637 assert_eq!(a[2], 0.0);
638 assert!(a[3].is_finite() && a[3] == 4.0);
639 assert_eq!(a[4], 0.0);
640 }
641
642 #[test]
643 fn peak_abs_basic() {
644 let a = [1.0f32, -3.0, 2.0, 0.5];
645 assert_eq!(peak_abs(&a), 3.0);
646 }
647
648 #[test]
649 fn clamp_inplace_basic() {
650 let mut a = [-2.0f32, -0.5, 0.0, 0.5, 2.0];
651 clamp_inplace(&mut a, -1.0, 1.0);
652 assert_eq!(a, [-1.0, -0.5, 0.0, 0.5, 1.0]);
653 }
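
    // Illustrative converter checks added for coverage; they assume a
    // full-scale gain of 1.0 / 32768.0 for i16 input and 24-bit samples stored
    // in the low bits of each i32.
    #[test]
    fn convert_i16_to_f32_basic() {
        let src = [0i16, 16384, -16384, -32768];
        let mut dst = [0.0f32; 4];
        convert_i16_to_f32(&src, &mut dst, 1.0 / 32768.0);
        assert_eq!(dst, [0.0, 0.5, -0.5, -1.0]);
    }

    #[test]
    fn convert_i24_to_f32_sign_extension() {
        let src = [0x0040_0000i32, 0x00FF_FFFF, 0x0000_0001];
        let mut dst = [0.0f32; 3];
        convert_i24_to_f32(&src, &mut dst, 1.0);
        // 0xFFFFFF is -1 once the 24-bit value is sign-extended.
        assert_eq!(dst, [4_194_304.0, -1.0, 1.0]);
    }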
654}
655
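/// Converts `i32` samples to `f32`, scaling by `gain`; processes the shorter of
/// the two slices and dispatches to AVX, SSE2, or scalar code at runtime.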
656pub fn convert_i32_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
663 let n = src.len().min(dst.len());
664 if n == 0 {
665 return;
666 }
667 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
668 unsafe {
669 if is_x86_feature_detected!("avx") {
670 convert_i32_to_f32_avx(&src[..n], &mut dst[..n], gain);
671 return;
672 }
        if is_x86_feature_detected!("sse2") {
674 convert_i32_to_f32_sse(&src[..n], &mut dst[..n], gain);
675 return;
676 }
677 }
678 convert_i32_to_f32_scalar(&src[..n], &mut dst[..n], gain);
679}
680
681fn convert_i32_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
682 for (s, d) in src.iter().zip(dst.iter_mut()) {
683 *d = *s as f32 * gain;
684 }
685}
686
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_i32_to_f32_sse(src: &[i32], dst: &mut [f32], gain: f32) {
689 let g = _mm_set1_ps(gain);
690 let mut i = 0;
691 while i + 4 <= src.len() {
692 let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
693 let f = _mm_cvtepi32_ps(s);
694 let r = _mm_mul_ps(f, g);
695 _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
696 i += 4;
697 }
698 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
699 *d = *s as f32 * gain;
700 }
701}
702
703#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
704#[target_feature(enable = "avx")]
705unsafe fn convert_i32_to_f32_avx(src: &[i32], dst: &mut [f32], gain: f32) {
706 let g = _mm256_set1_ps(gain);
707 let mut i = 0;
708 while i + 8 <= src.len() {
709 let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
710 let f = _mm256_cvtepi32_ps(s);
711 let r = _mm256_mul_ps(f, g);
712 _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
713 i += 8;
714 }
715 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
716 *d = *s as f32 * gain;
717 }
718}
719
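/// Converts `i16` samples to `f32`, scaling by `gain`; processes the shorter of
/// the two slices and dispatches to AVX2, SSE4.1, or scalar code at runtime.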
720pub fn convert_i16_to_f32(src: &[i16], dst: &mut [f32], gain: f32) {
723 let n = src.len().min(dst.len());
724 if n == 0 {
725 return;
726 }
727 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
728 unsafe {
729 if is_x86_feature_detected!("avx2") {
730 convert_i16_to_f32_avx2(&src[..n], &mut dst[..n], gain);
731 return;
732 }
733 if is_x86_feature_detected!("sse4.1") {
734 convert_i16_to_f32_sse41(&src[..n], &mut dst[..n], gain);
735 return;
736 }
737 }
738 convert_i16_to_f32_scalar(&src[..n], &mut dst[..n], gain);
739}
740
741fn convert_i16_to_f32_scalar(src: &[i16], dst: &mut [f32], gain: f32) {
742 for (s, d) in src.iter().zip(dst.iter_mut()) {
743 *d = *s as f32 * gain;
744 }
745}
746
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i16_to_f32_sse41(src: &[i16], dst: &mut [f32], gain: f32) {
749 let g = _mm_set1_ps(gain);
750 let mut i = 0;
751 while i + 8 <= src.len() {
752 let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
753 let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(bytes));
755 let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), g);
756 let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), g);
757 _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
758 _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
759 i += 8;
760 }
761 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
762 *d = *s as f32 * gain;
763 }
764}
765
766#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
767#[target_feature(enable = "avx2")]
768unsafe fn convert_i16_to_f32_avx2(src: &[i16], dst: &mut [f32], gain: f32) {
769 let g = _mm256_set1_ps(gain);
770 let mut i = 0;
771 while i + 16 <= src.len() {
        let bytes_lo = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let bytes_hi = _mm_loadu_si128(src.as_ptr().add(i + 8) as *const __m128i);
        let low = _mm256_cvtepi16_epi32(bytes_lo);
        let high = _mm256_cvtepi16_epi32(bytes_hi);
775 let low_f = _mm256_mul_ps(_mm256_cvtepi32_ps(low), g);
776 let high_f = _mm256_mul_ps(_mm256_cvtepi32_ps(high), g);
777 _mm256_storeu_ps(dst.as_mut_ptr().add(i), low_f);
778 _mm256_storeu_ps(dst.as_mut_ptr().add(i + 8), high_f);
779 i += 16;
780 }
781 if i + 8 <= src.len() {
782 let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
783 let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(bytes));
785 let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), _mm_set1_ps(gain));
786 let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), _mm_set1_ps(gain));
787 _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
788 _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
789 i += 8;
790 }
791 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
792 *d = *s as f32 * gain;
793 }
794}
795
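/// Converts `i8` samples to `f32`, scaling by `gain`; processes the shorter of
/// the two slices and dispatches to AVX2, SSE4.1, or scalar code at runtime.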
796pub fn convert_i8_to_f32(src: &[i8], dst: &mut [f32], gain: f32) {
799 let n = src.len().min(dst.len());
800 if n == 0 {
801 return;
802 }
803 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
804 unsafe {
805 if is_x86_feature_detected!("avx2") {
806 convert_i8_to_f32_avx2(&src[..n], &mut dst[..n], gain);
807 return;
808 }
809 if is_x86_feature_detected!("sse4.1") {
810 convert_i8_to_f32_sse41(&src[..n], &mut dst[..n], gain);
811 return;
812 }
813 }
814 convert_i8_to_f32_scalar(&src[..n], &mut dst[..n], gain);
815}
816
817fn convert_i8_to_f32_scalar(src: &[i8], dst: &mut [f32], gain: f32) {
818 for (s, d) in src.iter().zip(dst.iter_mut()) {
819 *d = *s as f32 * gain;
820 }
821}
822
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i8_to_f32_sse41(src: &[i8], dst: &mut [f32], gain: f32) {
825 let g = _mm_set1_ps(gain);
826 let mut i = 0;
827 while i + 4 <= src.len() {
        let bytes = _mm_cvtsi32_si128((src.as_ptr().add(i) as *const i32).read_unaligned());
829 let i32s = _mm_cvtepi8_epi32(bytes);
830 let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), g);
831 _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
832 i += 4;
833 }
834 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
835 *d = *s as f32 * gain;
836 }
837}
838
839#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
840#[target_feature(enable = "avx2")]
841unsafe fn convert_i8_to_f32_avx2(src: &[i8], dst: &mut [f32], gain: f32) {
842 let g = _mm256_set1_ps(gain);
843 let mut i = 0;
844 while i + 8 <= src.len() {
        let bytes = _mm_loadl_epi64(src.as_ptr().add(i) as *const __m128i);
846 let i32s = _mm256_cvtepi8_epi32(bytes);
847 let f32s = _mm256_mul_ps(_mm256_cvtepi32_ps(i32s), g);
848 _mm256_storeu_ps(dst.as_mut_ptr().add(i), f32s);
849 i += 8;
850 }
851 if i + 4 <= src.len() {
        let bytes = _mm_cvtsi32_si128((src.as_ptr().add(i) as *const i32).read_unaligned());
853 let i32s = _mm_cvtepi8_epi32(bytes);
854 let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), _mm_set1_ps(gain));
855 _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
856 i += 4;
857 }
858 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
859 *d = *s as f32 * gain;
860 }
861}
862
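/// Converts 24-bit samples stored in the low 24 bits of each `i32` to `f32`
/// (sign-extending bit 23), scaling by `gain`.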
863pub fn convert_i24_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
867 let n = src.len().min(dst.len());
868 if n == 0 {
869 return;
870 }
871 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
872 unsafe {
873 if is_x86_feature_detected!("avx2") {
874 convert_i24_to_f32_avx2(&src[..n], &mut dst[..n], gain);
875 return;
876 }
877 if is_x86_feature_detected!("sse4.1") {
878 convert_i24_to_f32_sse41(&src[..n], &mut dst[..n], gain);
879 return;
880 }
881 }
882 convert_i24_to_f32_scalar(&src[..n], &mut dst[..n], gain);
883}
884
885fn convert_i24_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
886 for (s, d) in src.iter().zip(dst.iter_mut()) {
887 let mut v = *s & 0x00FF_FFFF;
888 if (v & 0x0080_0000) != 0 {
889 v |= 0xFF00_0000u32 as i32;
890 }
891 *d = v as f32 * gain;
892 }
893}
894
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i24_to_f32_sse41(src: &[i32], dst: &mut [f32], gain: f32) {
897 let g = _mm_set1_ps(gain);
898 let mut i = 0;
899 while i + 4 <= src.len() {
900 let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
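        // Shift left by 8 and arithmetic-shift right by 8 to sign-extend the
        // 24-bit sample held in the low bits of each lane.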
        let extended = _mm_srai_epi32::<8>(_mm_slli_epi32::<8>(s));
902 let f = _mm_cvtepi32_ps(extended);
903 let r = _mm_mul_ps(f, g);
904 _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
905 i += 4;
906 }
907 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
908 let mut v = *s & 0x00FF_FFFF;
909 if (v & 0x0080_0000) != 0 {
910 v |= 0xFF00_0000u32 as i32;
911 }
912 *d = v as f32 * gain;
913 }
914}
915
916#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
917#[target_feature(enable = "avx2")]
918unsafe fn convert_i24_to_f32_avx2(src: &[i32], dst: &mut [f32], gain: f32) {
919 let g = _mm256_set1_ps(gain);
920 let mut i = 0;
921 while i + 8 <= src.len() {
922 let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let extended = _mm256_srai_epi32::<8>(_mm256_slli_epi32::<8>(s));
924 let f = _mm256_cvtepi32_ps(extended);
925 let r = _mm256_mul_ps(f, g);
926 _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
927 i += 8;
928 }
929 if i + 4 <= src.len() {
930 let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let extended = _mm_srai_epi32::<8>(_mm_slli_epi32::<8>(s));
932 let f = _mm_cvtepi32_ps(extended);
933 let r = _mm_mul_ps(f, _mm_set1_ps(gain));
934 _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
935 i += 4;
936 }
937 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
938 let mut v = *s & 0x00FF_FFFF;
939 if (v & 0x0080_0000) != 0 {
940 v |= 0xFF00_0000u32 as i32;
941 }
942 *d = v as f32 * gain;
943 }
944}
945
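/// Converts `f32` samples to 24-bit integers: scales by `gain`, truncates toward
/// zero, and masks the result into the low 24 bits of each `i32`.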
946pub fn convert_f32_to_i24(src: &[f32], dst: &mut [i32], gain: f32) {
950 let n = src.len().min(dst.len());
951 if n == 0 {
952 return;
953 }
954 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
955 unsafe {
956 if is_x86_feature_detected!("avx") {
957 convert_f32_to_i24_avx(&src[..n], &mut dst[..n], gain);
958 return;
959 }
960 if is_x86_feature_detected!("sse2") {
961 convert_f32_to_i24_sse2(&src[..n], &mut dst[..n], gain);
962 return;
963 }
964 }
965 convert_f32_to_i24_scalar(&src[..n], &mut dst[..n], gain);
966}
967
968fn convert_f32_to_i24_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
969 for (s, d) in src.iter().zip(dst.iter_mut()) {
970 *d = (*s * gain) as i32 & 0x00FF_FFFF;
971 }
972}
973
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i24_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
976 let g = _mm_set1_ps(gain);
977 let mask = _mm_set1_epi32(0x00FF_FFFF);
978 let mut i = 0;
979 while i + 4 <= src.len() {
980 let s = _mm_loadu_ps(src.as_ptr().add(i));
981 let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
982 let m = _mm_and_si128(v, mask);
983 _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
984 i += 4;
985 }
986 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
987 *d = (*s * gain) as i32 & 0x00FF_FFFF;
988 }
989}
990
991#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
992#[target_feature(enable = "avx")]
993unsafe fn convert_f32_to_i24_avx(src: &[f32], dst: &mut [i32], gain: f32) {
994 let g = _mm256_set1_ps(gain);
995 let mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x00FF_FFFF));
996 let mut i = 0;
997 while i + 8 <= src.len() {
998 let s = _mm256_loadu_ps(src.as_ptr().add(i));
999 let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
1000 let m = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), mask));
1001 _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, m);
1002 i += 8;
1003 }
1004 if i + 4 <= src.len() {
1005 let g_sse = _mm_set1_ps(gain);
1006 let mask_sse = _mm_set1_epi32(0x00FF_FFFF);
1007 let s = _mm_loadu_ps(src.as_ptr().add(i));
1008 let v = _mm_cvttps_epi32(_mm_mul_ps(s, g_sse));
1009 let m = _mm_and_si128(v, mask_sse);
1010 _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
1011 i += 4;
1012 }
1013 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
1014 *d = (*s * gain) as i32 & 0x00FF_FFFF;
1015 }
1016}
1017
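/// Converts `f32` samples to `i32`, scaling by `gain` and truncating toward zero.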
1018pub fn convert_f32_to_i32(src: &[f32], dst: &mut [i32], gain: f32) {
1022 let n = src.len().min(dst.len());
1023 if n == 0 {
1024 return;
1025 }
1026 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1027 unsafe {
1028 if is_x86_feature_detected!("avx") {
1029 convert_f32_to_i32_avx(&src[..n], &mut dst[..n], gain);
1030 return;
1031 }
1032 if is_x86_feature_detected!("sse2") {
1033 convert_f32_to_i32_sse2(&src[..n], &mut dst[..n], gain);
1034 return;
1035 }
1036 }
1037 convert_f32_to_i32_scalar(&src[..n], &mut dst[..n], gain);
1038}
1039
1040fn convert_f32_to_i32_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
1041 for (s, d) in src.iter().zip(dst.iter_mut()) {
1042 *d = (*s * gain) as i32;
1043 }
1044}
1045
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i32_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
1048 let g = _mm_set1_ps(gain);
1049 let mut i = 0;
1050 while i + 4 <= src.len() {
1051 let s = _mm_loadu_ps(src.as_ptr().add(i));
1052 let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
1053 _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
1054 i += 4;
1055 }
1056 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
1057 *d = (*s * gain) as i32;
1058 }
1059}
1060
1061#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1062#[target_feature(enable = "avx")]
1063unsafe fn convert_f32_to_i32_avx(src: &[f32], dst: &mut [i32], gain: f32) {
1064 let g = _mm256_set1_ps(gain);
1065 let mut i = 0;
1066 while i + 8 <= src.len() {
1067 let s = _mm256_loadu_ps(src.as_ptr().add(i));
1068 let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
1069 _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, v);
1070 i += 8;
1071 }
1072 if i + 4 <= src.len() {
1073 let s = _mm_loadu_ps(src.as_ptr().add(i));
1074 let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
1075 _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
1076 i += 4;
1077 }
1078 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
1079 *d = (*s * gain) as i32;
1080 }
1081}
1082
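/// Converts `f32` samples to `i16`, scaling by `gain` before the integer conversion.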
1083pub fn convert_f32_to_i16(src: &[f32], dst: &mut [i16], gain: f32) {
1087 let n = src.len().min(dst.len());
1088 if n == 0 {
1089 return;
1090 }
1091 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1092 unsafe {
1093 if is_x86_feature_detected!("sse2") {
1094 convert_f32_to_i16_sse2(&src[..n], &mut dst[..n], gain);
1095 return;
1096 }
1097 }
1098 convert_f32_to_i16_scalar(&src[..n], &mut dst[..n], gain);
1099}
1100
1101fn convert_f32_to_i16_scalar(src: &[f32], dst: &mut [i16], gain: f32) {
1102 for (s, d) in src.iter().zip(dst.iter_mut()) {
1103 *d = (*s * gain) as i16;
1104 }
1105}
1106
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i16_sse2(src: &[f32], dst: &mut [i16], gain: f32) {
1109 let g = _mm_set1_ps(gain);
1110 let mut i = 0;
1111 while i + 8 <= src.len() {
1112 let s0 = _mm_loadu_ps(src.as_ptr().add(i));
1113 let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
1114 let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
1115 let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
1116 let packed = _mm_packs_epi32(v0, v1);
1117 _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
1118 i += 8;
1119 }
1120 if i + 4 <= src.len() {
1121 let s = _mm_loadu_ps(src.as_ptr().add(i));
1122 let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
1123 let packed = _mm_packs_epi32(v, _mm_setzero_si128());
1124 _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
1125 i += 4;
1126 }
1127 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
1128 *d = (*s * gain) as i16;
1129 }
1130}
1131
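/// Converts `f32` samples to `i8`, scaling by `gain` before the integer conversion.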
1132pub fn convert_f32_to_i8(src: &[f32], dst: &mut [i8], gain: f32) {
1136 let n = src.len().min(dst.len());
1137 if n == 0 {
1138 return;
1139 }
1140 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1141 unsafe {
1142 if is_x86_feature_detected!("sse2") {
1143 convert_f32_to_i8_sse2(&src[..n], &mut dst[..n], gain);
1144 return;
1145 }
1146 }
1147 convert_f32_to_i8_scalar(&src[..n], &mut dst[..n], gain);
1148}
1149
1150fn convert_f32_to_i8_scalar(src: &[f32], dst: &mut [i8], gain: f32) {
1151 for (s, d) in src.iter().zip(dst.iter_mut()) {
1152 *d = (*s * gain) as i8;
1153 }
1154}
1155
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i8_sse2(src: &[f32], dst: &mut [i8], gain: f32) {
1158 let g = _mm_set1_ps(gain);
1159 let mut i = 0;
1160 while i + 16 <= src.len() {
1161 let s0 = _mm_loadu_ps(src.as_ptr().add(i));
1162 let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
1163 let s2 = _mm_loadu_ps(src.as_ptr().add(i + 8));
1164 let s3 = _mm_loadu_ps(src.as_ptr().add(i + 12));
1165 let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
1166 let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
1167 let v2 = _mm_cvttps_epi32(_mm_mul_ps(s2, g));
1168 let v3 = _mm_cvttps_epi32(_mm_mul_ps(s3, g));
1169 let p0 = _mm_packs_epi32(v0, v1);
1170 let p1 = _mm_packs_epi32(v2, v3);
1171 let packed = _mm_packs_epi16(p0, p1);
1172 _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
1173 i += 16;
1174 }
1175 for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
1176 *d = (*s * gain) as i8;
1177 }
1178}
1179
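/// Applies a quarter-sine fade-in: each sample is scaled by `sin(t · π/2)`, where
/// `t` starts at `start_t`, advances by `dt` per sample, and is clamped to `[0, 1]`.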
1180pub fn apply_fade_in_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
1187 use wide::f32x4;
1188 let pi_2 = f32x4::splat(std::f32::consts::FRAC_PI_2);
1189 let dt_vec = f32x4::from([0.0, dt, 2.0 * dt, 3.0 * dt]);
1190 let mut i = 0;
1191 while i + 4 <= buf.len() {
1192 let base_t = f32x4::splat(start_t + i as f32 * dt);
1193 let t = (base_t + dt_vec).clamp(f32x4::splat(0.0), f32x4::splat(1.0));
1194 let gain = (t * pi_2).sin();
1195 let chunk = &mut buf[i..i + 4];
1196 let s: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
1197 let r = s * gain;
1198 chunk.copy_from_slice(&r.to_array());
1199 i += 4;
1200 }
1201 for (j, v) in buf[i..].iter_mut().enumerate() {
1202 let t = (((i + j) as f32 * dt) + start_t).clamp(0.0, 1.0);
1203 *v *= (t * std::f32::consts::FRAC_PI_2).sin();
1204 }
1205}
1206
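/// Applies a quarter-cosine fade-out: each sample is scaled by `cos(t · π/2)`, where
/// `t` starts at `start_t`, advances by `dt` per sample, and is clamped to `[0, 1]`.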
1207pub fn apply_fade_out_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
1210 use wide::f32x4;
1211 let pi_2 = f32x4::splat(std::f32::consts::FRAC_PI_2);
1212 let dt_vec = f32x4::from([0.0, dt, 2.0 * dt, 3.0 * dt]);
1213 let mut i = 0;
1214 while i + 4 <= buf.len() {
1215 let base_t = f32x4::splat(start_t + i as f32 * dt);
1216 let t = (base_t + dt_vec).clamp(f32x4::splat(0.0), f32x4::splat(1.0));
1217 let gain = (t * pi_2).cos();
1218 let chunk = &mut buf[i..i + 4];
1219 let s: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
1220 let r = s * gain;
1221 chunk.copy_from_slice(&r.to_array());
1222 i += 4;
1223 }
1224 for (j, v) in buf[i..].iter_mut().enumerate() {
1225 let t = (((i + j) as f32 * dt) + start_t).clamp(0.0, 1.0);
1226 *v *= (t * std::f32::consts::FRAC_PI_2).cos();
1227 }
1228}