#![allow(unsafe_op_in_unsafe_fn)]

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
mod x86 {
    #[cfg(target_arch = "x86_64")]
    pub use std::arch::x86_64::*;
    #[cfg(target_arch = "x86")]
    pub use std::arch::x86::*;
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use x86::*;

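/// Adds `src` into `dst` element-wise over the common prefix
/// `min(dst.len(), src.len())`, dispatching to an AVX or SSE kernel at
/// runtime and falling back to the scalar loop otherwise.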
pub fn add_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_inplace_sse(dst, src);
            return;
        }
    }
    add_inplace_scalar(dst, src);
}

pub fn mul_inplace(dst: &mut [f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            mul_inplace_avx(dst, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            mul_inplace_sse(dst, gain);
            return;
        }
    }
    mul_inplace_scalar(dst, gain);
}

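/// Accumulates `src * gain` into `dst` over the common prefix, preferring an
/// FMA kernel when both AVX and FMA are available.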
pub fn add_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") && is_x86_feature_detected!("fma") {
            add_scaled_inplace_avx_fma(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("avx") {
            add_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    add_scaled_inplace_scalar(dst, src, gain);
}

pub fn copy_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    copy_scaled_inplace_scalar(dst, src, gain);
}

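/// Adds `src` into `dst` element-wise, treating non-finite source samples
/// (NaN and ±infinity) as silence (0.0).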
pub fn add_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    add_sanitized_inplace_scalar(dst, src);
}

pub fn copy_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    copy_sanitized_inplace_scalar(dst, src);
}

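/// Replaces every non-finite sample (NaN, ±infinity) in `buf` with 0.0.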
pub fn sanitize_finite_inplace(buf: &mut [f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            sanitize_finite_inplace_avx(buf);
            return;
        }
        if is_x86_feature_detected!("sse") {
            sanitize_finite_inplace_sse(buf);
            return;
        }
    }
    sanitize_finite_inplace_scalar(buf);
}

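/// Returns the largest absolute sample value in `buf`, or 0.0 for an empty
/// buffer.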
pub fn peak_abs(buf: &[f32]) -> f32 {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            return peak_abs_avx(buf);
        }
        if is_x86_feature_detected!("sse") {
            return peak_abs_sse(buf);
        }
    }
    peak_abs_scalar(buf)
}

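/// Clamps every sample in `buf` to the inclusive range `[min, max]`.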
pub fn clamp_inplace(buf: &mut [f32], min: f32, max: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            clamp_inplace_avx(buf, min, max);
            return;
        }
        if is_x86_feature_detected!("sse") {
            clamp_inplace_sse(buf, min, max);
            return;
        }
    }
    clamp_inplace_scalar(buf, min, max);
}

fn add_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s;
    }
}

fn mul_inplace_scalar(dst: &mut [f32], gain: f32) {
    for d in dst.iter_mut() {
        *d *= gain;
    }
}

fn add_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s * gain;
    }
}

fn copy_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = *s * gain;
    }
}

fn add_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

fn copy_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

fn sanitize_finite_inplace_scalar(buf: &mut [f32]) {
    for s in buf.iter_mut() {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

fn peak_abs_scalar(buf: &[f32]) -> f32 {
    buf.iter().fold(0.0f32, |acc, s| acc.max(s.abs()))
}

fn clamp_inplace_scalar(buf: &mut [f32], min: f32, max: f32) {
    for s in buf.iter_mut() {
        *s = s.clamp(min, max);
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_add_ps(d, s);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn mul_inplace_sse(dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst.len() {
        let d = _mm_loadu_ps(dst.as_ptr().add(i));
        let r = _mm_mul_ps(d, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for d in &mut dst[i..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_add_ps(d, _mm_mul_ps(s, g));
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        // |s| <= f32::MAX is false for NaN and +/-infinity, so the compare
        // mask is all-ones only for finite lanes; AND-ing zeroes out the rest.
        let abs_s = _mm_andnot_ps(sign_mask, s);
        let finite_mask = _mm_cmple_ps(abs_s, finite_max);
        let sanitized = _mm_and_ps(s, finite_mask);
        let r = _mm_add_ps(d, sanitized);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm_andnot_ps(sign_mask, s);
        let finite_mask = _mm_cmple_ps(abs_s, finite_max);
        let r = _mm_and_ps(s, finite_mask);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn sanitize_finite_inplace_sse(buf: &mut [f32]) {
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm_andnot_ps(sign_mask, v);
        let finite_mask = _mm_cmple_ps(abs_v, finite_max);
        let r = _mm_and_ps(v, finite_mask);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn peak_abs_sse(buf: &[f32]) -> f32 {
    let sign_mask = _mm_set1_ps(-0.0);
    let mut peak = _mm_setzero_ps();
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm_andnot_ps(sign_mask, v);
        peak = _mm_max_ps(peak, abs_v);
        i += 4;
    }
    let mut arr = [0.0f32; 4];
    _mm_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.into_iter().fold(0.0f32, |a, b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn clamp_inplace_sse(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm_set1_ps(min);
    let vmax = _mm_set1_ps(max);
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let r = _mm_min_ps(_mm_max_ps(v, vmin), vmax);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, s);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn mul_inplace_avx(dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst.len() {
        let d = _mm256_loadu_ps(dst.as_ptr().add(i));
        let r = _mm256_mul_ps(d, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for d in &mut dst[i..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, _mm256_mul_ps(s, g));
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx,fma")]
unsafe fn add_scaled_inplace_avx_fma(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_fmadd_ps(s, g, d);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps(abs_s, max_val, _CMP_LE_OQ);
        let sanitized = _mm256_blendv_ps(zero, s, mask);
        let r = _mm256_add_ps(d, sanitized);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps(abs_s, max_val, _CMP_LE_OQ);
        let r = _mm256_blendv_ps(zero, s, mask);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn sanitize_finite_inplace_avx(buf: &mut [f32]) {
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        let mask = _mm256_cmp_ps(abs_v, max_val, _CMP_LE_OQ);
        let r = _mm256_blendv_ps(zero, v, mask);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn peak_abs_avx(buf: &[f32]) -> f32 {
    let mut peak = _mm256_setzero_ps();
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        peak = _mm256_max_ps(peak, abs_v);
        i += 8;
    }
    let mut arr = [0.0f32; 8];
    _mm256_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.iter().fold(0.0f32, |a, &b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn clamp_inplace_avx(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm256_set1_ps(min);
    let vmax = _mm256_set1_ps(max);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let r = _mm256_max_ps(vmin, _mm256_min_ps(vmax, v));
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn add_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_inplace(&mut a, &b);
        assert_eq!(a, [11.0, 22.0, 33.0, 44.0, 55.0]);
    }

    #[test]
    fn mul_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        mul_inplace(&mut a, 2.0);
        assert_eq!(a, [2.0, 4.0, 6.0, 8.0, 10.0]);
    }

    #[test]
    fn add_scaled_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [6.0, 12.0, 18.0, 24.0, 30.0]);
    }

    #[test]
    fn copy_scaled_inplace_basic() {
        let mut a = [0.0f32; 5];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        copy_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [5.0, 10.0, 15.0, 20.0, 25.0]);
    }

    #[test]
    fn add_sanitized_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        add_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 1.5);
        assert!(a[1].is_finite() && a[1] == 2.0);
        assert!(a[2].is_finite() && a[2] == 3.0);
        assert!(a[3].is_finite() && a[3] == 5.0);
    }

    #[test]
    fn copy_sanitized_inplace_basic() {
        let mut a = [0.0f32; 4];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        copy_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 0.5);
        assert!(a[1].is_finite() && a[1] == 0.0);
        assert!(a[2].is_finite() && a[2] == 0.0);
        assert!(a[3].is_finite() && a[3] == 1.0);
    }

    #[test]
    fn sanitize_finite_inplace_basic() {
        let mut a = [1.0f32, f32::NAN, f32::INFINITY, 4.0, f32::NEG_INFINITY];
        sanitize_finite_inplace(&mut a);
        assert!(a[0].is_finite() && a[0] == 1.0);
        assert_eq!(a[1], 0.0);
        assert_eq!(a[2], 0.0);
        assert!(a[3].is_finite() && a[3] == 4.0);
        assert_eq!(a[4], 0.0);
    }

    #[test]
    fn peak_abs_basic() {
        let a = [1.0f32, -3.0, 2.0, 0.5];
        assert_eq!(peak_abs(&a), 3.0);
    }

    #[test]
    fn clamp_inplace_basic() {
        let mut a = [-2.0f32, -0.5, 0.0, 0.5, 2.0];
        clamp_inplace(&mut a, -1.0, 1.0);
        assert_eq!(a, [-1.0, -0.5, 0.0, 0.5, 1.0]);
    }
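
    // Illustrative sanity check for the sample-format converters defined later
    // in this file; a minimal sketch assuming a plain 1/32768 normalization
    // gain (the gain value is an example, not something the crate mandates).
    #[test]
    fn convert_i16_to_f32_basic() {
        let src = [0i16, 16384, -16384, i16::MIN, 8192, 4096, -8192, 32767, 1];
        let mut dst = [0.0f32; 9];
        convert_i16_to_f32(&src, &mut dst, 1.0 / 32768.0);
        assert_eq!(dst[0], 0.0);
        assert_eq!(dst[1], 0.5);
        assert_eq!(dst[2], -0.5);
        assert_eq!(dst[3], -1.0);
        assert_eq!(dst[4], 0.25);
    }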
}

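/// Converts `i32` samples to `f32`, multiplying by `gain`. Only the common
/// prefix `min(src.len(), dst.len())` is written.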
pub fn convert_i32_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_i32_to_f32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_i32_to_f32_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i32_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i32_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_i32_to_f32_sse2(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let f = _mm_cvtepi32_ps(s);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_i32_to_f32_avx(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let f = _mm256_cvtepi32_ps(s);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

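/// Converts `i16` samples to `f32`, multiplying by `gain`. Only the common
/// prefix `min(src.len(), dst.len())` is written.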
pub fn convert_i16_to_f32(src: &[i16], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i16_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i16_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i16_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i16_to_f32_scalar(src: &[i16], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i16_to_f32_sse41(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128(bytes, 8));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), g);
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i16_to_f32_avx2(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        // 16 samples per iteration: two 128-bit loads of 8 x i16 each, widened
        // to 8 x i32 lanes before the float conversion.
        let lo16 = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let hi16 = _mm_loadu_si128(src.as_ptr().add(i + 8) as *const __m128i);
        let low = _mm256_cvtepi16_epi32(lo16);
        let high = _mm256_cvtepi16_epi32(hi16);
        let low_f = _mm256_mul_ps(_mm256_cvtepi32_ps(low), g);
        let high_f = _mm256_mul_ps(_mm256_cvtepi32_ps(high), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i + 8), high_f);
        i += 16;
    }
    if i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128(bytes, 8));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), _mm_set1_ps(gain));
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

pub fn convert_i8_to_f32(src: &[i8], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i8_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i8_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i8_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i8_to_f32_scalar(src: &[i8], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i8_to_f32_sse41(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        // Unaligned 4-byte read of four i8 samples; a plain deref here would
        // be UB because the slice has no 4-byte alignment guarantee.
        let bytes = _mm_cvtsi32_si128((src.as_ptr().add(i) as *const i32).read_unaligned());
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i8_to_f32_avx2(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        // Unaligned 8-byte load of eight i8 samples into the low half of an
        // XMM register, then sign-extend to eight i32 lanes.
        let bytes = _mm_loadl_epi64(src.as_ptr().add(i) as *const __m128i);
        let i32s = _mm256_cvtepi8_epi32(bytes);
        let f32s = _mm256_mul_ps(_mm256_cvtepi32_ps(i32s), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 8;
    }
    if i + 4 <= src.len() {
        let bytes = _mm_cvtsi32_si128((src.as_ptr().add(i) as *const i32).read_unaligned());
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

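/// Converts 24-bit samples stored in the low 24 bits of each `i32` to `f32`,
/// sign-extending from bit 23 and multiplying by `gain`.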
pub fn convert_i24_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i24_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i24_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i24_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i24_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i24_to_f32_sse41(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        // Shift left then arithmetic-shift right by 8 to sign-extend bit 23.
        let extended = _mm_srai_epi32(_mm_slli_epi32(s, 8), 8);
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i24_to_f32_avx2(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let extended = _mm256_srai_epi32(_mm256_slli_epi32(s, 8), 8);
        let f = _mm256_cvtepi32_ps(extended);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let extended = _mm_srai_epi32(_mm_slli_epi32(s, 8), 8);
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

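/// Converts `f32` samples to 24-bit values packed in the low 24 bits of each
/// `i32`, truncating `sample * gain` toward zero and masking to 24 bits.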
pub fn convert_f32_to_i24(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i24_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i24_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i24_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i24_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i24_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mask = _mm_set1_epi32(0x00FF_FFFF);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let m = _mm_and_si128(v, mask);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i24_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x00FF_FFFF));
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        let m = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), mask));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, m);
        i += 8;
    }
    if i + 4 <= src.len() {
        let g_sse = _mm_set1_ps(gain);
        let mask_sse = _mm_set1_epi32(0x00FF_FFFF);
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g_sse));
        let m = _mm_and_si128(v, mask_sse);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

pub fn convert_f32_to_i32(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i32_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i32_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i32_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i32_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, v);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

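/// Converts `f32` samples to `i16`, truncating `sample * gain` toward zero.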
pub fn convert_f32_to_i16(src: &[f32], dst: &mut [i16], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i16_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i16_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i16_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i16_scalar(src: &[f32], dst: &mut [i16], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i16_sse2(src: &[f32], dst: &mut [i16], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let packed = _mm_packs_epi32(v0, v1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i16_avx(src: &[f32], dst: &mut [i16], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        let vlo = _mm256_castsi256_si128(v);
        // vextractf128 is plain AVX; the AVX2-only vextracti128 would fault on
        // the AVX-only CPUs this path is dispatched to.
        let vhi = _mm256_extractf128_si256(v, 1);
        let packed = _mm_packs_epi32(vlo, vhi);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

pub fn convert_f32_to_i8(src: &[f32], dst: &mut [i8], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i8_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i8_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i8_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i8_scalar(src: &[f32], dst: &mut [i8], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i8_sse2(src: &[f32], dst: &mut [i8], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let s2 = _mm_loadu_ps(src.as_ptr().add(i + 8));
        let s3 = _mm_loadu_ps(src.as_ptr().add(i + 12));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let v2 = _mm_cvttps_epi32(_mm_mul_ps(s2, g));
        let v3 = _mm_cvttps_epi32(_mm_mul_ps(s3, g));
        let p0 = _mm_packs_epi32(v0, v1);
        let p1 = _mm_packs_epi32(v2, v3);
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i8_avx(src: &[f32], dst: &mut [i8], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 16 <= src.len() {
        let s0 = _mm256_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm256_loadu_ps(src.as_ptr().add(i + 8));
        let v0 = _mm256_cvttps_epi32(_mm256_mul_ps(s0, g));
        let v1 = _mm256_cvttps_epi32(_mm256_mul_ps(s1, g));
        // vextractf128 keeps this usable on AVX-only CPUs; the AVX2-only
        // vextracti128 would be an illegal instruction there.
        let p0 = _mm_packs_epi32(_mm256_castsi256_si128(v0), _mm256_extractf128_si256(v0, 1));
        let p1 = _mm_packs_epi32(_mm256_castsi256_si128(v1), _mm256_extractf128_si256(v1, 1));
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

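/// Applies an equal-power (quarter-sine) fade-in to `buf`. `start_t` is the
/// normalized fade position of the first sample and `dt` the per-sample
/// increment; positions are clamped to `[0.0, 1.0]` before the curve is
/// evaluated.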
pub fn apply_fade_in_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            apply_fade_in_inplace_avx(buf, start_t, dt);
            return;
        }
        if is_x86_feature_detected!("sse") {
            apply_fade_in_inplace_sse(buf, start_t, dt);
            return;
        }
    }
    for (i, v) in buf.iter_mut().enumerate() {
        let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

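/// Applies the complementary quarter-cosine fade-out to `buf`, using the same
/// `start_t`/`dt` parameterization as `apply_fade_in_inplace`.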
pub fn apply_fade_out_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            apply_fade_out_inplace_avx(buf, start_t, dt);
            return;
        }
        if is_x86_feature_detected!("sse") {
            apply_fade_out_inplace_sse(buf, start_t, dt);
            return;
        }
    }
    for (i, v) in buf.iter_mut().enumerate() {
        let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_in_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 8 <= buf.len() {
        let mut gain = [0.0f32; 8];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).sin();
        }
        let s = _mm256_loadu_ps(buf.as_ptr().add(i));
        let g = _mm256_loadu_ps(gain.as_ptr());
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_in_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let mut gain = [0.0f32; 4];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).sin();
        }
        let s = _mm_loadu_ps(buf.as_ptr().add(i));
        let g = _mm_loadu_ps(gain.as_ptr());
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_out_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 8 <= buf.len() {
        let mut gain = [0.0f32; 8];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).cos();
        }
        let s = _mm256_loadu_ps(buf.as_ptr().add(i));
        let g = _mm256_loadu_ps(gain.as_ptr());
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_out_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let mut gain = [0.0f32; 4];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).cos();
        }
        let s = _mm_loadu_ps(buf.as_ptr().add(i));
        let g = _mm_loadu_ps(gain.as_ptr());
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}