#![allow(unsafe_op_in_unsafe_fn)]

#[cfg(target_arch = "x86_64")]
mod x86 {
    pub use std::arch::x86_64::*;
}

#[cfg(target_arch = "x86")]
mod x86 {
    pub use std::arch::x86::*;
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use x86::*;

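// Every public routine below probes the CPU with `is_x86_feature_detected!`,
// dispatches to the widest available SIMD implementation, and otherwise falls
// back to the portable scalar loop, so callers never gate on target features.
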
pub fn add_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_inplace_sse(dst, src);
            return;
        }
    }
    add_inplace_scalar(dst, src);
}

pub fn mul_inplace(dst: &mut [f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            mul_inplace_avx(dst, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            mul_inplace_sse(dst, gain);
            return;
        }
    }
    mul_inplace_scalar(dst, gain);
}

pub fn add_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") && is_x86_feature_detected!("fma") {
            add_scaled_inplace_avx_fma(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("avx") {
            add_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    add_scaled_inplace_scalar(dst, src, gain);
}

pub fn copy_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_scaled_inplace_avx(dst, src, gain);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_scaled_inplace_sse(dst, src, gain);
            return;
        }
    }
    copy_scaled_inplace_scalar(dst, src, gain);
}

pub fn add_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            add_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            add_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    add_sanitized_inplace_scalar(dst, src);
}

pub fn copy_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            copy_sanitized_inplace_avx(dst, src);
            return;
        }
        if is_x86_feature_detected!("sse") {
            copy_sanitized_inplace_sse(dst, src);
            return;
        }
    }
    copy_sanitized_inplace_scalar(dst, src);
}

pub fn sanitize_finite_inplace(buf: &mut [f32]) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            sanitize_finite_inplace_avx(buf);
            return;
        }
        if is_x86_feature_detected!("sse") {
            sanitize_finite_inplace_sse(buf);
            return;
        }
    }
    sanitize_finite_inplace_scalar(buf);
}

pub fn peak_abs(buf: &[f32]) -> f32 {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            return peak_abs_avx(buf);
        }
        if is_x86_feature_detected!("sse") {
            return peak_abs_sse(buf);
        }
    }
    peak_abs_scalar(buf)
}

pub fn clamp_inplace(buf: &mut [f32], min: f32, max: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            clamp_inplace_avx(buf, min, max);
            return;
        }
        if is_x86_feature_detected!("sse") {
            clamp_inplace_sse(buf, min, max);
            return;
        }
    }
    clamp_inplace_scalar(buf, min, max);
}

fn add_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s;
    }
}

fn mul_inplace_scalar(dst: &mut [f32], gain: f32) {
    for d in dst.iter_mut() {
        *d *= gain;
    }
}

fn add_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += *s * gain;
    }
}

fn copy_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = *s * gain;
    }
}

fn add_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

fn copy_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

fn sanitize_finite_inplace_scalar(buf: &mut [f32]) {
    for s in buf.iter_mut() {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

fn peak_abs_scalar(buf: &[f32]) -> f32 {
    buf.iter().fold(0.0f32, |acc, s| acc.max(s.abs()))
}

fn clamp_inplace_scalar(buf: &mut [f32], min: f32, max: f32) {
    for s in buf.iter_mut() {
        *s = s.clamp(min, max);
    }
}

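// The SIMD kernels below share one shape: process 4 (SSE) or 8 (AVX) lanes per
// iteration with unaligned loads/stores, then finish the remainder with the
// same scalar logic. Two-slice routines first truncate both slices to the
// shorter length so the vector loop can never read or write out of bounds.
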
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_add_ps(d, s);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn mul_inplace_sse(dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst.len() {
        let d = _mm_loadu_ps(dst.as_ptr().add(i));
        let r = _mm_mul_ps(d, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for d in &mut dst[i..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_add_ps(d, _mm_mul_ps(s, g));
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let d = _mm_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm_andnot_ps(sign_mask, s);
        let finite_mask = _mm_cmple_ps(abs_s, finite_max);
        let sanitized = _mm_and_ps(s, finite_mask);
        let r = _mm_add_ps(d, sanitized);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= dst_head.len() {
        let s = _mm_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm_andnot_ps(sign_mask, s);
        let finite_mask = _mm_cmple_ps(abs_s, finite_max);
        let r = _mm_and_ps(s, finite_mask);
        _mm_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn sanitize_finite_inplace_sse(buf: &mut [f32]) {
    let sign_mask = _mm_set1_ps(-0.0);
    let finite_max = _mm_set1_ps(f32::MAX);
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm_andnot_ps(sign_mask, v);
        let finite_mask = _mm_cmple_ps(abs_v, finite_max);
        let r = _mm_and_ps(v, finite_mask);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn peak_abs_sse(buf: &[f32]) -> f32 {
    let sign_mask = _mm_set1_ps(-0.0);
    let mut peak = _mm_setzero_ps();
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm_andnot_ps(sign_mask, v);
        peak = _mm_max_ps(peak, abs_v);
        i += 4;
    }
    let mut arr = [0.0f32; 4];
    _mm_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.into_iter().fold(0.0f32, |a, b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn clamp_inplace_sse(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm_set1_ps(min);
    let vmax = _mm_set1_ps(max);
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let v = _mm_loadu_ps(buf.as_ptr().add(i));
        let r = _mm_min_ps(_mm_max_ps(v, vmin), vmax);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, s);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn mul_inplace_avx(dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst.len() {
        let d = _mm256_loadu_ps(dst.as_ptr().add(i));
        let r = _mm256_mul_ps(d, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for d in &mut dst[i..] {
        *d *= gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, _mm256_mul_ps(s, g));
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx,fma")]
unsafe fn add_scaled_inplace_avx_fma(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_fmadd_ps(s, g, d);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps(abs_s, max_val, _CMP_LE_OQ);
        let sanitized = _mm256_blendv_ps(zero, s, mask);
        let r = _mm256_add_ps(d, sanitized);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps(abs_s, max_val, _CMP_LE_OQ);
        let r = _mm256_blendv_ps(zero, s, mask);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn sanitize_finite_inplace_avx(buf: &mut [f32]) {
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        let mask = _mm256_cmp_ps(abs_v, max_val, _CMP_LE_OQ);
        let r = _mm256_blendv_ps(zero, v, mask);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn peak_abs_avx(buf: &[f32]) -> f32 {
    let mut peak = _mm256_setzero_ps();
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        peak = _mm256_max_ps(peak, abs_v);
        i += 8;
    }
    let mut arr = [0.0f32; 8];
    _mm256_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.iter().fold(0.0f32, |a, &b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn clamp_inplace_avx(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm256_set1_ps(min);
    let vmax = _mm256_set1_ps(max);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let r = _mm256_max_ps(vmin, _mm256_min_ps(vmax, v));
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn add_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_inplace(&mut a, &b);
        assert_eq!(a, [11.0, 22.0, 33.0, 44.0, 55.0]);
    }

    #[test]
    fn mul_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        mul_inplace(&mut a, 2.0);
        assert_eq!(a, [2.0, 4.0, 6.0, 8.0, 10.0]);
    }

    #[test]
    fn add_scaled_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [6.0, 12.0, 18.0, 24.0, 30.0]);
    }

    #[test]
    fn copy_scaled_inplace_basic() {
        let mut a = [0.0f32; 5];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        copy_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [5.0, 10.0, 15.0, 20.0, 25.0]);
    }

    #[test]
    fn add_sanitized_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        add_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 1.5);
        assert!(a[1].is_finite() && a[1] == 2.0);
        assert!(a[2].is_finite() && a[2] == 3.0);
        assert!(a[3].is_finite() && a[3] == 5.0);
    }

    #[test]
    fn copy_sanitized_inplace_basic() {
        let mut a = [0.0f32; 4];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        copy_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 0.5);
        assert!(a[1].is_finite() && a[1] == 0.0);
        assert!(a[2].is_finite() && a[2] == 0.0);
        assert!(a[3].is_finite() && a[3] == 1.0);
    }

    #[test]
    fn sanitize_finite_inplace_basic() {
        let mut a = [1.0f32, f32::NAN, f32::INFINITY, 4.0, f32::NEG_INFINITY];
        sanitize_finite_inplace(&mut a);
        assert!(a[0].is_finite() && a[0] == 1.0);
        assert_eq!(a[1], 0.0);
        assert_eq!(a[2], 0.0);
        assert!(a[3].is_finite() && a[3] == 4.0);
        assert_eq!(a[4], 0.0);
    }

    #[test]
    fn peak_abs_basic() {
        let a = [1.0f32, -3.0, 2.0, 0.5];
        assert_eq!(peak_abs(&a), 3.0);
    }

    #[test]
    fn clamp_inplace_basic() {
        let mut a = [-2.0f32, -0.5, 0.0, 0.5, 2.0];
        clamp_inplace(&mut a, -1.0, 1.0);
        assert_eq!(a, [-1.0, -0.5, 0.0, 0.5, 1.0]);
    }
}

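// Sample-format conversion: the routines below translate between integer PCM
// layouts (i8/i16/i24-in-i32/i32) and f32 while applying a gain, using the
// same runtime-dispatch pattern as the block operations above.
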
pub fn convert_i32_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_i32_to_f32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        // The packed i32 -> f32 conversion uses SSE2 intrinsics, so detect "sse2".
        if is_x86_feature_detected!("sse2") {
            convert_i32_to_f32_sse(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i32_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i32_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_i32_to_f32_sse(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let f = _mm_cvtepi32_ps(s);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_i32_to_f32_avx(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let f = _mm256_cvtepi32_ps(s);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

pub fn convert_i16_to_f32(src: &[i16], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i16_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i16_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i16_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i16_to_f32_scalar(src: &[i16], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i16_to_f32_sse41(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128(bytes, 8));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), g);
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i16_to_f32_avx2(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        // Each iteration consumes 16 samples, so load two 128-bit chunks and
        // widen each to eight i32 lanes before converting.
        let lo16 = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let hi16 = _mm_loadu_si128(src.as_ptr().add(i + 8) as *const __m128i);
        let low = _mm256_cvtepi16_epi32(lo16);
        let high = _mm256_cvtepi16_epi32(hi16);
        let low_f = _mm256_mul_ps(_mm256_cvtepi32_ps(low), g);
        let high_f = _mm256_mul_ps(_mm256_cvtepi32_ps(high), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i + 8), high_f);
        i += 16;
    }
    if i + 8 <= src.len() {
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128(bytes, 8));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), _mm_set1_ps(gain));
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

pub fn convert_i8_to_f32(src: &[i8], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i8_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i8_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i8_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i8_to_f32_scalar(src: &[i8], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i8_to_f32_sse41(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        // Read the four source bytes without assuming i32 alignment.
        let word = (src.as_ptr().add(i) as *const i32).read_unaligned();
        let bytes = _mm_cvtsi32_si128(word);
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i8_to_f32_avx2(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        // Unaligned 64-bit load; this also avoids _mm_cvtsi64_si128, which is
        // unavailable on 32-bit x86.
        let bytes = _mm_loadl_epi64(src.as_ptr().add(i) as *const __m128i);
        let i32s = _mm256_cvtepi8_epi32(bytes);
        let f32s = _mm256_mul_ps(_mm256_cvtepi32_ps(i32s), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 8;
    }
    if i + 4 <= src.len() {
        let word = (src.as_ptr().add(i) as *const i32).read_unaligned();
        let bytes = _mm_cvtsi32_si128(word);
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}

pub fn convert_i24_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx2") {
            convert_i24_to_f32_avx2(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse4.1") {
            convert_i24_to_f32_sse41(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_i24_to_f32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_i24_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i24_to_f32_sse41(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let extended = _mm_srai_epi32(_mm_slli_epi32(s, 8), 8);
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i24_to_f32_avx2(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let extended = _mm256_srai_epi32(_mm256_slli_epi32(s, 8), 8);
        let f = _mm256_cvtepi32_ps(extended);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let extended = _mm_srai_epi32(_mm_slli_epi32(s, 8), 8);
        let f = _mm_cvtepi32_ps(extended);
        let r = _mm_mul_ps(f, _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}

pub fn convert_f32_to_i24(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i24_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i24_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i24_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i24_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i24_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mask = _mm_set1_epi32(0x00FF_FFFF);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let m = _mm_and_si128(v, mask);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i24_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x00FF_FFFF));
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        let m = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), mask));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, m);
        i += 8;
    }
    if i + 4 <= src.len() {
        let g_sse = _mm_set1_ps(gain);
        let mask_sse = _mm_set1_epi32(0x00FF_FFFF);
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g_sse));
        let m = _mm_and_si128(v, mask_sse);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, m);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}

pub fn convert_f32_to_i32(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i32_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i32_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i32_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i32_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i32_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i32_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        _mm256_storeu_si256(dst.as_mut_ptr().add(i) as *mut __m256i, v);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}

pub fn convert_f32_to_i16(src: &[f32], dst: &mut [i16], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i16_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i16_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i16_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i16_scalar(src: &[f32], dst: &mut [i16], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i16_sse2(src: &[f32], dst: &mut [i16], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let packed = _mm_packs_epi32(v0, v1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i16_avx(src: &[f32], dst: &mut [i16], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_ps(src.as_ptr().add(i));
        let v = _mm256_cvttps_epi32(_mm256_mul_ps(s, g));
        let vlo = _mm256_castsi256_si128(v);
        // _mm256_extractf128_si256 only requires AVX; the AVX2-only
        // _mm256_extracti128_si256 would fault on AVX-only CPUs.
        let vhi = _mm256_extractf128_si256(v, 1);
        let packed = _mm_packs_epi32(vlo, vhi);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, _mm_set1_ps(gain)));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}

pub fn convert_f32_to_i8(src: &[f32], dst: &mut [i8], gain: f32) {
    let n = src.len().min(dst.len());
    if n == 0 {
        return;
    }
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            convert_f32_to_i8_avx(&src[..n], &mut dst[..n], gain);
            return;
        }
        if is_x86_feature_detected!("sse2") {
            convert_f32_to_i8_sse2(&src[..n], &mut dst[..n], gain);
            return;
        }
    }
    convert_f32_to_i8_scalar(&src[..n], &mut dst[..n], gain);
}

fn convert_f32_to_i8_scalar(src: &[f32], dst: &mut [i8], gain: f32) {
    for (s, d) in src.iter().zip(dst.iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i8_sse2(src: &[f32], dst: &mut [i8], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        let s0 = _mm_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm_loadu_ps(src.as_ptr().add(i + 4));
        let s2 = _mm_loadu_ps(src.as_ptr().add(i + 8));
        let s3 = _mm_loadu_ps(src.as_ptr().add(i + 12));
        let v0 = _mm_cvttps_epi32(_mm_mul_ps(s0, g));
        let v1 = _mm_cvttps_epi32(_mm_mul_ps(s1, g));
        let v2 = _mm_cvttps_epi32(_mm_mul_ps(s2, g));
        let v3 = _mm_cvttps_epi32(_mm_mul_ps(s3, g));
        let p0 = _mm_packs_epi32(v0, v1);
        let p1 = _mm_packs_epi32(v2, v3);
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i8_avx(src: &[f32], dst: &mut [i8], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 16 <= src.len() {
        let s0 = _mm256_loadu_ps(src.as_ptr().add(i));
        let s1 = _mm256_loadu_ps(src.as_ptr().add(i + 8));
        let v0 = _mm256_cvttps_epi32(_mm256_mul_ps(s0, g));
        let v1 = _mm256_cvttps_epi32(_mm256_mul_ps(s1, g));
        // Use the AVX extract; _mm256_extracti128_si256 needs AVX2.
        let p0 = _mm_packs_epi32(_mm256_castsi256_si128(v0), _mm256_extractf128_si256(v0, 1));
        let p1 = _mm_packs_epi32(_mm256_castsi256_si128(v1), _mm256_extractf128_si256(v1, 1));
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}

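// A few sanity checks for the format converters. These are a minimal sketch:
// the input values are hypothetical and deliberately kept in range so that
// truncation and saturation cannot differ between the scalar and SIMD paths.
#[cfg(test)]
mod convert_tests {
    use super::*;

    #[test]
    fn i16_to_f32_applies_gain() {
        let src = [100i16, -200, 0, 32000, -32000, 1, 2, 3, 4];
        let mut dst = [0.0f32; 9];
        convert_i16_to_f32(&src, &mut dst, 0.5);
        assert_eq!(dst[0], 50.0);
        assert_eq!(dst[1], -100.0);
        assert_eq!(dst[8], 2.0);
    }

    #[test]
    fn i24_to_f32_sign_extends_bit_23() {
        // 0x00FF_FFFF is -1 as a 24-bit value, 0x0080_0000 is the most negative one.
        let src = [1i32, 0x00FF_FFFF, 0x007F_FFFF, 0x0080_0000, 0, 0, 0, 0, 0];
        let mut dst = [0.0f32; 9];
        convert_i24_to_f32(&src, &mut dst, 1.0);
        assert_eq!(dst[0], 1.0);
        assert_eq!(dst[1], -1.0);
        assert_eq!(dst[2], 8_388_607.0);
        assert_eq!(dst[3], -8_388_608.0);
    }

    #[test]
    fn f32_to_i16_scales_and_truncates() {
        let src = [0.5f32, -0.25, 0.0, 0.125, 1.0, -1.0, 0.75, -0.75, 0.5];
        let mut dst = [0i16; 9];
        convert_f32_to_i16(&src, &mut dst, 16384.0);
        assert_eq!(dst[0], 8192);
        assert_eq!(dst[1], -4096);
        assert_eq!(dst[4], 16384);
        assert_eq!(dst[8], 8192);
    }
}
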
pub fn apply_fade_in_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            apply_fade_in_inplace_avx(buf, start_t, dt);
            return;
        }
        if is_x86_feature_detected!("sse") {
            apply_fade_in_inplace_sse(buf, start_t, dt);
            return;
        }
    }
    for (i, v) in buf.iter_mut().enumerate() {
        let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

pub fn apply_fade_out_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    unsafe {
        if is_x86_feature_detected!("avx") {
            apply_fade_out_inplace_avx(buf, start_t, dt);
            return;
        }
        if is_x86_feature_detected!("sse") {
            apply_fade_out_inplace_sse(buf, start_t, dt);
            return;
        }
    }
    for (i, v) in buf.iter_mut().enumerate() {
        let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_in_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 8 <= buf.len() {
        let mut gain = [0.0f32; 8];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).sin();
        }
        let s = _mm256_loadu_ps(buf.as_ptr().add(i));
        let g = _mm256_loadu_ps(gain.as_ptr());
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_in_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let mut gain = [0.0f32; 4];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).sin();
        }
        let s = _mm_loadu_ps(buf.as_ptr().add(i));
        let g = _mm_loadu_ps(gain.as_ptr());
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_out_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 8 <= buf.len() {
        let mut gain = [0.0f32; 8];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).cos();
        }
        let s = _mm256_loadu_ps(buf.as_ptr().add(i));
        let g = _mm256_loadu_ps(gain.as_ptr());
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_out_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    let mut i = 0usize;
    while i + 4 <= buf.len() {
        let mut gain = [0.0f32; 4];
        for (lane, g) in gain.iter_mut().enumerate() {
            let t = (start_t + (i + lane) as f32 * dt).clamp(0.0, 1.0);
            *g = (t * std::f32::consts::FRAC_PI_2).cos();
        }
        let s = _mm_loadu_ps(buf.as_ptr().add(i));
        let g = _mm_loadu_ps(gain.as_ptr());
        let r = _mm_mul_ps(s, g);
        _mm_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (start_t + (i + j) as f32 * dt).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}
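
// A minimal sanity check for the fade ramps, assuming the sin/cos shape used
// above; comparisons use a small tolerance because f32 sin/cos are not exact.
#[cfg(test)]
mod fade_tests {
    use super::*;

    #[test]
    fn fade_in_goes_from_silence_to_unity() {
        let mut buf = [1.0f32; 12];
        // t advances by 0.25 per sample and clamps to 1.0 from index 4 onwards.
        apply_fade_in_inplace(&mut buf, 0.0, 0.25);
        assert!(buf[0].abs() < 1e-6);
        assert!((buf[4] - 1.0).abs() < 1e-6);
        assert!((buf[11] - 1.0).abs() < 1e-6);
    }

    #[test]
    fn fade_out_goes_from_unity_to_silence() {
        let mut buf = [1.0f32; 12];
        apply_fade_out_inplace(&mut buf, 0.0, 0.25);
        assert!((buf[0] - 1.0).abs() < 1e-6);
        assert!(buf[4].abs() < 1e-6);
        assert!(buf[11].abs() < 1e-6);
    }
}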