1#![allow(unsafe_op_in_unsafe_fn)]
9
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
mod x86 {
    //! Re-exports the SIMD intrinsics for whichever x86 flavour is being compiled.
    //!
    //! `std::arch::x86_64` only exists on 64-bit targets, so 32-bit builds have to
    //! pull the intrinsics from `std::arch::x86` instead.
    #[cfg(target_arch = "x86")]
    pub use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    pub use std::arch::x86_64::*;
}
14
15#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
16use x86::*;
17
18pub fn add_inplace(dst: &mut [f32], src: &[f32]) {
20 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
21 unsafe {
22 if is_x86_feature_detected!("avx") {
23 add_inplace_avx(dst, src);
24 return;
25 }
26 if is_x86_feature_detected!("sse") {
27 add_inplace_sse(dst, src);
28 return;
29 }
30 }
31 add_inplace_scalar(dst, src);
32}
33
34pub fn mul_inplace(dst: &mut [f32], gain: f32) {
36 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
37 unsafe {
38 if is_x86_feature_detected!("avx") {
39 mul_inplace_avx(dst, gain);
40 return;
41 }
42 if is_x86_feature_detected!("sse") {
43 mul_inplace_sse(dst, gain);
44 return;
45 }
46 }
47 mul_inplace_scalar(dst, gain);
48}
49
50pub fn add_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
52 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
53 unsafe {
54 if is_x86_feature_detected!("avx") && is_x86_feature_detected!("fma") {
55 add_scaled_inplace_avx_fma(dst, src, gain);
56 return;
57 }
58 if is_x86_feature_detected!("avx") {
59 add_scaled_inplace_avx(dst, src, gain);
60 return;
61 }
62 if is_x86_feature_detected!("sse") {
63 add_scaled_inplace_sse(dst, src, gain);
64 return;
65 }
66 }
67 add_scaled_inplace_scalar(dst, src, gain);
68}
69
70pub fn copy_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
72 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
73 unsafe {
74 if is_x86_feature_detected!("avx") {
75 copy_scaled_inplace_avx(dst, src, gain);
76 return;
77 }
78 if is_x86_feature_detected!("sse") {
79 copy_scaled_inplace_sse(dst, src, gain);
80 return;
81 }
82 }
83 copy_scaled_inplace_scalar(dst, src, gain);
84}
85
86pub fn add_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
88 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
89 unsafe {
90 if is_x86_feature_detected!("avx") {
91 add_sanitized_inplace_avx(dst, src);
92 return;
93 }
94 if is_x86_feature_detected!("sse") {
95 add_sanitized_inplace_sse(dst, src);
96 return;
97 }
98 }
99 add_sanitized_inplace_scalar(dst, src);
100}
101
102pub fn copy_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
104 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
105 unsafe {
106 if is_x86_feature_detected!("avx") {
107 copy_sanitized_inplace_avx(dst, src);
108 return;
109 }
110 if is_x86_feature_detected!("sse") {
111 copy_sanitized_inplace_sse(dst, src);
112 return;
113 }
114 }
115 copy_sanitized_inplace_scalar(dst, src);
116}
117
118pub fn sanitize_finite_inplace(buf: &mut [f32]) {
120 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
121 unsafe {
122 if is_x86_feature_detected!("avx") {
123 sanitize_finite_inplace_avx(buf);
124 return;
125 }
126 if is_x86_feature_detected!("sse") {
127 sanitize_finite_inplace_sse(buf);
128 return;
129 }
130 }
131 sanitize_finite_inplace_scalar(buf);
132}
133
134pub fn peak_abs(buf: &[f32]) -> f32 {
136 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
137 unsafe {
138 if is_x86_feature_detected!("avx") {
139 return peak_abs_avx(buf);
140 }
141 if is_x86_feature_detected!("sse") {
142 return peak_abs_sse(buf);
143 }
144 }
145 peak_abs_scalar(buf)
146}
147
148pub fn clamp_inplace(buf: &mut [f32], min: f32, max: f32) {
150 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
151 unsafe {
152 if is_x86_feature_detected!("avx") {
153 clamp_inplace_avx(buf, min, max);
154 return;
155 }
156 if is_x86_feature_detected!("sse") {
157 clamp_inplace_sse(buf, min, max);
158 return;
159 }
160 }
161 clamp_inplace_scalar(buf, min, max);
162}
163
/// Portable fallback: `dst[i] += src[i]` over the overlapping prefix of both slices.
fn add_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &inp)| *out += inp);
}
169
/// Portable fallback: scales every element of `dst` by `gain`.
fn mul_inplace_scalar(dst: &mut [f32], gain: f32) {
    dst.iter_mut().for_each(|sample| *sample *= gain);
}
175
/// Portable fallback: `dst[i] += src[i] * gain` over the overlapping prefix.
fn add_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &inp)| *out += inp * gain);
}
181
/// Portable fallback: `dst[i] = src[i] * gain` over the overlapping prefix.
fn copy_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &inp)| *out = inp * gain);
}
187
/// Portable fallback: adds `src` into `dst`, substituting `0.0` for any source
/// sample that is NaN or infinite.
fn add_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (out, &inp) in dst.iter_mut().zip(src) {
        // The addition is performed even for rejected samples (adding 0.0).
        let addend = match inp.is_finite() {
            true => inp,
            false => 0.0,
        };
        *out += addend;
    }
}
193
/// Portable fallback: copies `src` into `dst`, writing `0.0` in place of any
/// source sample that is NaN or infinite.
fn copy_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    for (out, &inp) in dst.iter_mut().zip(src) {
        *out = match inp.is_finite() {
            true => inp,
            false => 0.0,
        };
    }
}
199
/// Portable fallback: overwrites every NaN or infinite element of `buf` with `0.0`.
fn sanitize_finite_inplace_scalar(buf: &mut [f32]) {
    for bad in buf.iter_mut().filter(|sample| !sample.is_finite()) {
        *bad = 0.0;
    }
}
207
/// Portable fallback: largest absolute value in `buf`, or `0.0` when it is empty.
///
/// Uses `f32::max`, which returns the non-NaN operand, so NaN samples are skipped.
fn peak_abs_scalar(buf: &[f32]) -> f32 {
    let mut peak = 0.0f32;
    for sample in buf {
        peak = peak.max(sample.abs());
    }
    peak
}
211
/// Portable fallback: clamps every element of `buf` into `[min, max]`.
///
/// # Panics
///
/// `f32::clamp` panics if `min > max` or either bound is NaN (only reached when
/// `buf` is non-empty).
fn clamp_inplace_scalar(buf: &mut [f32], min: f32, max: f32) {
    buf.iter_mut()
        .for_each(|sample| *sample = (*sample).clamp(min, max));
}
217
/// SSE kernel for `dst[i] += src[i]`, four lanes at a time with a scalar tail.
///
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    // Everything below `vec_end` is covered by whole 4-lane vectors.
    let vec_end = n - n % 4;
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(4) {
        let sum = _mm_add_ps(_mm_loadu_ps(out.add(off)), _mm_loadu_ps(inp.add(off)));
        _mm_storeu_ps(out.add(off), sum);
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d += *s;
    }
}
236
/// SSE kernel for `dst[i] *= gain`, four lanes at a time with a scalar tail.
///
/// # Safety
///
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn mul_inplace_sse(dst: &mut [f32], gain: f32) {
    let factor = _mm_set1_ps(gain);
    let mut quads = dst.chunks_exact_mut(4);
    for quad in quads.by_ref() {
        let scaled = _mm_mul_ps(_mm_loadu_ps(quad.as_ptr()), factor);
        _mm_storeu_ps(quad.as_mut_ptr(), scaled);
    }
    for sample in quads.into_remainder() {
        *sample *= gain;
    }
}
252
/// SSE kernel for `dst[i] += src[i] * gain`, four lanes at a time with a scalar tail.
///
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let n = dst.len().min(src.len());
    let vec_end = n - n % 4;
    let factor = _mm_set1_ps(gain);
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(4) {
        let acc = _mm_loadu_ps(out.add(off));
        let scaled = _mm_mul_ps(_mm_loadu_ps(inp.add(off)), factor);
        _mm_storeu_ps(out.add(off), _mm_add_ps(acc, scaled));
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d += *s * gain;
    }
}
272
/// SSE kernel for `dst[i] = src[i] * gain`, four lanes at a time with a scalar tail.
///
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let n = dst.len().min(src.len());
    let vec_end = n - n % 4;
    let factor = _mm_set1_ps(gain);
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(4) {
        let scaled = _mm_mul_ps(_mm_loadu_ps(inp.add(off)), factor);
        _mm_storeu_ps(out.add(off), scaled);
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d = *s * gain;
    }
}
291
/// SSE kernel that adds `src` into `dst`, zeroing NaN/infinite source lanes first.
///
/// A lane is kept when `|s| <= f32::MAX`; the comparison is false for NaN and for
/// infinities, so those lanes are masked to `0.0` before the add.
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_end = n - n % 4;
    let sign_bit = _mm_set1_ps(-0.0);
    let largest = _mm_set1_ps(f32::MAX);
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(4) {
        let acc = _mm_loadu_ps(out.add(off));
        let v = _mm_loadu_ps(inp.add(off));
        // Clearing the sign bit yields |v|.
        let keep = _mm_cmple_ps(_mm_andnot_ps(sign_bit, v), largest);
        _mm_storeu_ps(out.add(off), _mm_add_ps(acc, _mm_and_ps(v, keep)));
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}
315
/// SSE kernel that copies `src` into `dst`, writing `0.0` for NaN/infinite lanes.
///
/// A lane is kept when `|s| <= f32::MAX`; the comparison is false for NaN and for
/// infinities, so those lanes are masked to `0.0`.
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_end = n - n % 4;
    let sign_bit = _mm_set1_ps(-0.0);
    let largest = _mm_set1_ps(f32::MAX);
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(4) {
        let v = _mm_loadu_ps(inp.add(off));
        let keep = _mm_cmple_ps(_mm_andnot_ps(sign_bit, v), largest);
        _mm_storeu_ps(out.add(off), _mm_and_ps(v, keep));
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}
337
/// SSE kernel that overwrites NaN/infinite elements of `buf` with `0.0`.
///
/// A lane is kept when `|v| <= f32::MAX`; the comparison is false for NaN and for
/// infinities, so those lanes are masked to `0.0`.
///
/// # Safety
///
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn sanitize_finite_inplace_sse(buf: &mut [f32]) {
    let sign_bit = _mm_set1_ps(-0.0);
    let largest = _mm_set1_ps(f32::MAX);
    let mut quads = buf.chunks_exact_mut(4);
    for quad in quads.by_ref() {
        let v = _mm_loadu_ps(quad.as_ptr());
        let keep = _mm_cmple_ps(_mm_andnot_ps(sign_bit, v), largest);
        _mm_storeu_ps(quad.as_mut_ptr(), _mm_and_ps(v, keep));
    }
    for sample in quads.into_remainder() {
        if !sample.is_finite() {
            *sample = 0.0;
        }
    }
}
358
/// SSE kernel returning the largest absolute value in `buf` (`0.0` when empty).
///
/// NaN samples are ignored, matching the scalar fold built on `f32::max`.
///
/// # Safety
///
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn peak_abs_sse(buf: &[f32]) -> f32 {
    let sign_mask = _mm_set1_ps(-0.0);
    let mut peak = _mm_setzero_ps();
    let mut quads = buf.chunks_exact(4);
    for quad in quads.by_ref() {
        // Clearing the sign bit yields |v|.
        let abs_v = _mm_andnot_ps(sign_mask, _mm_loadu_ps(quad.as_ptr()));
        // MAXPS returns its *second* operand when either input is NaN. Keeping the
        // running peak second means a NaN sample leaves the lane's peak untouched
        // instead of replacing it (which would lose every earlier peak in that lane).
        peak = _mm_max_ps(abs_v, peak);
    }
    let mut lanes = [0.0f32; 4];
    _mm_storeu_ps(lanes.as_mut_ptr(), peak);
    let mut max_scalar = 0.0f32;
    for lane in lanes.iter() {
        max_scalar = max_scalar.max(*lane);
    }
    for s in quads.remainder() {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}
379
/// SSE kernel clamping every element of `buf` into `[min, max]`.
///
/// NaN samples are left as NaN, matching `f32::clamp` (used for the scalar tail)
/// and the AVX kernel.
///
/// # Safety
///
/// The running CPU must support SSE.
///
/// # Panics
///
/// The scalar tail uses `f32::clamp`, which panics if `min > max` or either bound
/// is NaN.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn clamp_inplace_sse(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm_set1_ps(min);
    let vmax = _mm_set1_ps(max);
    let mut quads = buf.chunks_exact_mut(4);
    for quad in quads.by_ref() {
        let v = _mm_loadu_ps(quad.as_ptr());
        // MINPS/MAXPS return their *second* operand when either input is NaN, so the
        // sample must travel in the second slot for a NaN to survive the clamp.
        let clamped = _mm_max_ps(vmin, _mm_min_ps(vmax, v));
        _mm_storeu_ps(quad.as_mut_ptr(), clamped);
    }
    for sample in quads.into_remainder() {
        *sample = (*sample).clamp(min, max);
    }
}
396
/// AVX kernel for `dst[i] += src[i]`, eight lanes at a time with a scalar tail.
///
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    // Everything below `vec_end` is covered by whole 8-lane vectors.
    let vec_end = n - n % 8;
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(8) {
        let sum = _mm256_add_ps(_mm256_loadu_ps(out.add(off)), _mm256_loadu_ps(inp.add(off)));
        _mm256_storeu_ps(out.add(off), sum);
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d += *s;
    }
}
415
/// AVX kernel for `dst[i] *= gain`, eight lanes at a time with a scalar tail.
///
/// # Safety
///
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn mul_inplace_avx(dst: &mut [f32], gain: f32) {
    let factor = _mm256_set1_ps(gain);
    let mut octets = dst.chunks_exact_mut(8);
    for octet in octets.by_ref() {
        let scaled = _mm256_mul_ps(_mm256_loadu_ps(octet.as_ptr()), factor);
        _mm256_storeu_ps(octet.as_mut_ptr(), scaled);
    }
    for sample in octets.into_remainder() {
        *sample *= gain;
    }
}
431
/// AVX kernel for `dst[i] += src[i] * gain` using a separate multiply and add.
///
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let n = dst.len().min(src.len());
    let vec_end = n - n % 8;
    let factor = _mm256_set1_ps(gain);
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(8) {
        let acc = _mm256_loadu_ps(out.add(off));
        let scaled = _mm256_mul_ps(_mm256_loadu_ps(inp.add(off)), factor);
        _mm256_storeu_ps(out.add(off), _mm256_add_ps(acc, scaled));
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d += *s * gain;
    }
}
451
/// AVX+FMA kernel for `dst[i] += src[i] * gain` using a fused multiply-add.
///
/// The scalar tail also uses a fused multiply-add (`f32::mul_add`), so every
/// element is rounded the same way regardless of its position in the buffer.
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support both AVX and FMA.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx,fma")]
unsafe fn add_scaled_inplace_avx_fma(dst: &mut [f32], src: &[f32], gain: f32) {
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 8 <= len {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        // Single rounding: s * g + d.
        let r = _mm256_fmadd_ps(s, g, d);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        // Keep the tail fused too; an unfused `*d += *s * gain` rounds twice and
        // can differ from the vector lanes in the last bit.
        *d = s.mul_add(gain, *d);
    }
}
471
/// AVX kernel for `dst[i] = src[i] * gain`, eight lanes at a time with a scalar tail.
///
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let n = dst.len().min(src.len());
    let vec_end = n - n % 8;
    let factor = _mm256_set1_ps(gain);
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(8) {
        let scaled = _mm256_mul_ps(_mm256_loadu_ps(inp.add(off)), factor);
        _mm256_storeu_ps(out.add(off), scaled);
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d = *s * gain;
    }
}
490
/// AVX kernel that adds `src` into `dst`, zeroing NaN/infinite source lanes first.
///
/// A lane is kept when `|s| <= f32::MAX` (ordered compare, so NaN fails); rejected
/// lanes are blended to `0.0` before the add.
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_end = n - n % 8;
    let zeros = _mm256_setzero_ps();
    let largest = _mm256_set1_ps(f32::MAX);
    let sign_bit = _mm256_set1_ps(-0.0);
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(8) {
        let acc = _mm256_loadu_ps(out.add(off));
        let v = _mm256_loadu_ps(inp.add(off));
        // Clearing the sign bit yields |v|.
        let keep = _mm256_cmp_ps(_mm256_andnot_ps(sign_bit, v), largest, _CMP_LE_OQ);
        let cleaned = _mm256_blendv_ps(zeros, v, keep);
        _mm256_storeu_ps(out.add(off), _mm256_add_ps(acc, cleaned));
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}
514
/// AVX kernel that copies `src` into `dst`, writing `0.0` for NaN/infinite lanes.
///
/// A lane is kept when `|s| <= f32::MAX` (ordered compare, so NaN fails); rejected
/// lanes are blended to `0.0`.
/// Only the overlapping prefix of the two slices is processed.
///
/// # Safety
///
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_end = n - n % 8;
    let zeros = _mm256_setzero_ps();
    let largest = _mm256_set1_ps(f32::MAX);
    let sign_bit = _mm256_set1_ps(-0.0);
    let out = dst.as_mut_ptr();
    let inp = src.as_ptr();
    for off in (0..vec_end).step_by(8) {
        let v = _mm256_loadu_ps(inp.add(off));
        let keep = _mm256_cmp_ps(_mm256_andnot_ps(sign_bit, v), largest, _CMP_LE_OQ);
        _mm256_storeu_ps(out.add(off), _mm256_blendv_ps(zeros, v, keep));
    }
    for (d, s) in dst[vec_end..n].iter_mut().zip(&src[vec_end..n]) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}
536
/// AVX kernel that overwrites NaN/infinite elements of `buf` with `0.0`.
///
/// A lane is kept when `|v| <= f32::MAX` (ordered compare, so NaN fails); rejected
/// lanes are blended to `0.0`.
///
/// # Safety
///
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn sanitize_finite_inplace_avx(buf: &mut [f32]) {
    let zeros = _mm256_setzero_ps();
    let largest = _mm256_set1_ps(f32::MAX);
    let sign_bit = _mm256_set1_ps(-0.0);
    let mut octets = buf.chunks_exact_mut(8);
    for octet in octets.by_ref() {
        let v = _mm256_loadu_ps(octet.as_ptr());
        let keep = _mm256_cmp_ps(_mm256_andnot_ps(sign_bit, v), largest, _CMP_LE_OQ);
        _mm256_storeu_ps(octet.as_mut_ptr(), _mm256_blendv_ps(zeros, v, keep));
    }
    for sample in octets.into_remainder() {
        if !sample.is_finite() {
            *sample = 0.0;
        }
    }
}
557
/// AVX kernel returning the largest absolute value in `buf` (`0.0` when empty).
///
/// NaN samples are ignored, matching the scalar fold built on `f32::max`.
///
/// # Safety
///
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn peak_abs_avx(buf: &[f32]) -> f32 {
    let sign_mask = _mm256_set1_ps(-0.0);
    let mut peak = _mm256_setzero_ps();
    let mut octets = buf.chunks_exact(8);
    for octet in octets.by_ref() {
        // Clearing the sign bit yields |v|.
        let abs_v = _mm256_andnot_ps(sign_mask, _mm256_loadu_ps(octet.as_ptr()));
        // VMAXPS returns its *second* operand when either input is NaN. Keeping the
        // running peak second means a NaN sample leaves the lane's peak untouched
        // instead of replacing it (which would lose every earlier peak in that lane).
        peak = _mm256_max_ps(abs_v, peak);
    }
    let mut lanes = [0.0f32; 8];
    _mm256_storeu_ps(lanes.as_mut_ptr(), peak);
    let mut max_scalar = 0.0f32;
    for lane in lanes.iter() {
        max_scalar = max_scalar.max(*lane);
    }
    for s in octets.remainder() {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}
577
/// AVX kernel clamping every element of `buf` into `[min, max]`.
///
/// # Safety
///
/// The running CPU must support AVX.
///
/// # Panics
///
/// The scalar tail uses `f32::clamp`, which panics if `min > max` or either bound
/// is NaN.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn clamp_inplace_avx(buf: &mut [f32], min: f32, max: f32) {
    let lower = _mm256_set1_ps(min);
    let upper = _mm256_set1_ps(max);
    let mut octets = buf.chunks_exact_mut(8);
    for octet in octets.by_ref() {
        let capped = _mm256_min_ps(upper, _mm256_loadu_ps(octet.as_ptr()));
        _mm256_storeu_ps(octet.as_mut_ptr(), _mm256_max_ps(lower, capped));
    }
    for sample in octets.into_remainder() {
        *sample = (*sample).clamp(min, max);
    }
}
594
595pub fn convert_i32_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
598 let n = src.len().min(dst.len());
599 if n == 0 {
600 return;
601 }
602 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
603 unsafe {
604 if is_x86_feature_detected!("avx") {
605 convert_i32_to_f32_avx(&src[..n], &mut dst[..n], gain);
606 return;
607 }
608 if is_x86_feature_detected!("sse") {
609 convert_i32_to_f32_sse(&src[..n], &mut dst[..n], gain);
610 return;
611 }
612 }
613 convert_i32_to_f32_scalar(&src[..n], &mut dst[..n], gain);
614}
615
/// Portable fallback: `dst[i] = src[i] as f32 * gain` over the overlapping prefix.
fn convert_i32_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = sample as f32 * gain);
}
621
/// 128-bit kernel converting `i32` samples to `f32` scaled by `gain`.
///
/// Declares SSE2 because the integer load and the int-to-float conversion used
/// here are SSE2 intrinsics.
///
/// # Safety
///
/// The running CPU must support SSE2, and `dst` must be at least as long as
/// `src`: vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_i32_to_f32_sse(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let r = _mm_mul_ps(_mm_cvtepi32_ps(s), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
637
/// AVX kernel converting `i32` samples to `f32` scaled by `gain`, eight at a time.
///
/// # Safety
///
/// The running CPU must support AVX, and `dst` must be at least as long as `src`:
/// vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_i32_to_f32_avx(src: &[i32], dst: &mut [f32], gain: f32) {
    let factor = _mm256_set1_ps(gain);
    let vec_end = src.len() - src.len() % 8;
    let inp = src.as_ptr();
    let out = dst.as_mut_ptr();
    for off in (0..vec_end).step_by(8) {
        let ints = _mm256_loadu_si256(inp.add(off) as *const __m256i);
        let scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(ints), factor);
        _mm256_storeu_ps(out.add(off), scaled);
    }
    for (s, d) in src[vec_end..].iter().zip(dst[vec_end..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
654
655pub fn convert_i16_to_f32(src: &[i16], dst: &mut [f32], gain: f32) {
658 let n = src.len().min(dst.len());
659 if n == 0 {
660 return;
661 }
662 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
663 unsafe {
664 if is_x86_feature_detected!("avx2") {
665 convert_i16_to_f32_avx2(&src[..n], &mut dst[..n], gain);
666 return;
667 }
668 if is_x86_feature_detected!("sse4.1") {
669 convert_i16_to_f32_sse41(&src[..n], &mut dst[..n], gain);
670 return;
671 }
672 }
673 convert_i16_to_f32_scalar(&src[..n], &mut dst[..n], gain);
674}
675
/// Portable fallback: `dst[i] = src[i] as f32 * gain` over the overlapping prefix.
fn convert_i16_to_f32_scalar(src: &[i16], dst: &mut [f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = sample as f32 * gain);
}
681
/// SSE4.1 kernel converting `i16` samples to `f32` scaled by `gain`, eight at a time.
///
/// # Safety
///
/// The running CPU must support SSE4.1, and `dst` must be at least as long as
/// `src`: vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i16_to_f32_sse41(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 8 <= src.len() {
        // One 128-bit load holds eight i16 samples.
        let raw = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        // Sign-extend the low four, then shift the high four down and do the same.
        let low = _mm_cvtepi16_epi32(raw);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128(raw, 8));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), g);
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
700
/// AVX2 kernel converting `i16` samples to `f32` scaled by `gain`.
///
/// Processes sixteen samples per iteration, then at most one block of eight, then
/// a scalar tail.
///
/// # Safety
///
/// The running CPU must support AVX2, and `dst` must be at least as long as
/// `src`: vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i16_to_f32_avx2(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 16 <= src.len() {
        // A 128-bit load holds only eight i16 samples, and the 256-bit widening
        // conversion consumes all eight of them, so sixteen samples need two
        // separate loads (samples i..i+8 and i+8..i+16).
        let lo_raw = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let hi_raw = _mm_loadu_si128(src.as_ptr().add(i + 8) as *const __m128i);
        let lo_f = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(lo_raw)), g);
        let hi_f = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(hi_raw)), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), lo_f);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i + 8), hi_f);
        i += 16;
    }
    if i + 8 <= src.len() {
        let raw = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let f = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(raw)), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), f);
        i += 8;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
730
731pub fn convert_i8_to_f32(src: &[i8], dst: &mut [f32], gain: f32) {
734 let n = src.len().min(dst.len());
735 if n == 0 {
736 return;
737 }
738 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
739 unsafe {
740 if is_x86_feature_detected!("avx2") {
741 convert_i8_to_f32_avx2(&src[..n], &mut dst[..n], gain);
742 return;
743 }
744 if is_x86_feature_detected!("sse4.1") {
745 convert_i8_to_f32_sse41(&src[..n], &mut dst[..n], gain);
746 return;
747 }
748 }
749 convert_i8_to_f32_scalar(&src[..n], &mut dst[..n], gain);
750}
751
/// Portable fallback: `dst[i] = src[i] as f32 * gain` over the overlapping prefix.
fn convert_i8_to_f32_scalar(src: &[i8], dst: &mut [f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = sample as f32 * gain);
}
757
/// SSE4.1 kernel converting `i8` samples to `f32` scaled by `gain`, four at a time.
///
/// # Safety
///
/// The running CPU must support SSE4.1, and `dst` must be at least as long as
/// `src`: vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i8_to_f32_sse41(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= src.len() {
        // An `i8` slice has alignment 1, so the four bytes must be fetched with an
        // unaligned read; dereferencing the pointer as `*const i32` would be UB.
        let packed = src.as_ptr().add(i).cast::<i32>().read_unaligned();
        let i32s = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(packed));
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
773
/// AVX2 kernel converting `i8` samples to `f32` scaled by `gain`.
///
/// Processes eight samples per iteration, then at most one block of four, then a
/// scalar tail.
///
/// # Safety
///
/// The running CPU must support AVX2, and `dst` must be at least as long as
/// `src`: vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i8_to_f32_avx2(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0usize;
    while i + 8 <= src.len() {
        // `_mm_loadl_epi64` performs an unaligned 64-bit load and exists on both
        // 32- and 64-bit x86. Dereferencing the byte pointer as `*const i64` would
        // be a misaligned read, and `_mm_cvtsi64_si128` is x86_64-only.
        let bytes = _mm_loadl_epi64(src.as_ptr().add(i) as *const __m128i);
        let i32s = _mm256_cvtepi8_epi32(bytes);
        let f32s = _mm256_mul_ps(_mm256_cvtepi32_ps(i32s), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 8;
    }
    if i + 4 <= src.len() {
        // Same alignment concern as above: fetch the four bytes unaligned.
        let packed = src.as_ptr().add(i).cast::<i32>().read_unaligned();
        let i32s = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(packed));
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
797
798pub fn convert_i24_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
802 let n = src.len().min(dst.len());
803 if n == 0 {
804 return;
805 }
806 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
807 unsafe {
808 if is_x86_feature_detected!("avx2") {
809 convert_i24_to_f32_avx2(&src[..n], &mut dst[..n], gain);
810 return;
811 }
812 if is_x86_feature_detected!("sse4.1") {
813 convert_i24_to_f32_sse41(&src[..n], &mut dst[..n], gain);
814 return;
815 }
816 }
817 convert_i24_to_f32_scalar(&src[..n], &mut dst[..n], gain);
818}
819
/// Portable fallback: sign-extends the low 24 bits of each `src` word and writes
/// it to `dst` as `f32` scaled by `gain`.
fn convert_i24_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (out, &word) in dst.iter_mut().zip(src) {
        // Push bit 23 into the sign position, then arithmetic-shift back down so
        // the sign is replicated through the top byte.
        let sample = (word << 8) >> 8;
        *out = sample as f32 * gain;
    }
}
829
/// 128-bit kernel converting 24-bit samples (low three bytes of each `i32`) to
/// `f32` scaled by `gain`, four at a time.
///
/// # Safety
///
/// The running CPU must support SSE4.1 (the feature this kernel is compiled for
/// and the one its dispatcher checks), and `dst` must be at least as long as
/// `src`: vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i24_to_f32_sse41(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        // Shift left then arithmetic-shift right to sign-extend from bit 23.
        let extended = _mm_srai_epi32(_mm_slli_epi32(s, 8), 8);
        let r = _mm_mul_ps(_mm_cvtepi32_ps(extended), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        let mut v = *s & 0x00FF_FFFF;
        if (v & 0x0080_0000) != 0 {
            v |= 0xFF00_0000u32 as i32;
        }
        *d = v as f32 * gain;
    }
}
850
/// AVX2 kernel converting 24-bit samples (low three bytes of each `i32`) to `f32`
/// scaled by `gain`.
///
/// Processes eight samples per iteration, then at most one block of four, then a
/// scalar tail.
///
/// # Safety
///
/// The running CPU must support AVX2, and `dst` must be at least as long as
/// `src`: vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i24_to_f32_avx2(src: &[i32], dst: &mut [f32], gain: f32) {
    let len = src.len();
    let wide_end = len - len % 8;
    let factor = _mm256_set1_ps(gain);
    let inp = src.as_ptr();
    let out = dst.as_mut_ptr();
    for off in (0..wide_end).step_by(8) {
        let words = _mm256_loadu_si256(inp.add(off) as *const __m256i);
        // Shift left then arithmetic-shift right to sign-extend from bit 23.
        let signed = _mm256_srai_epi32(_mm256_slli_epi32(words, 8), 8);
        _mm256_storeu_ps(out.add(off), _mm256_mul_ps(_mm256_cvtepi32_ps(signed), factor));
    }
    let mut done = wide_end;
    if len - done >= 4 {
        let words = _mm_loadu_si128(inp.add(done) as *const __m128i);
        let signed = _mm_srai_epi32(_mm_slli_epi32(words, 8), 8);
        _mm_storeu_ps(out.add(done), _mm_mul_ps(_mm_cvtepi32_ps(signed), _mm_set1_ps(gain)));
        done += 4;
    }
    for (s, d) in src[done..].iter().zip(dst[done..].iter_mut()) {
        let low24 = *s & 0x00FF_FFFF;
        let sample = if (low24 & 0x0080_0000) != 0 {
            low24 | (0xFF00_0000u32 as i32)
        } else {
            low24
        };
        *d = sample as f32 * gain;
    }
}
880
881pub fn convert_f32_to_i24(src: &[f32], dst: &mut [i32], gain: f32) {
885 let n = src.len().min(dst.len());
886 if n == 0 {
887 return;
888 }
889 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
890 unsafe {
891 if is_x86_feature_detected!("avx") {
892 convert_f32_to_i24_avx(&src[..n], &mut dst[..n], gain);
893 return;
894 }
895 if is_x86_feature_detected!("sse2") {
896 convert_f32_to_i24_sse2(&src[..n], &mut dst[..n], gain);
897 return;
898 }
899 }
900 convert_f32_to_i24_scalar(&src[..n], &mut dst[..n], gain);
901}
902
/// Portable fallback: `dst[i] = ((src[i] * gain) as i32) & 0x00FF_FFFF`.
///
/// The `as` cast truncates toward zero (and saturates / maps NaN to 0) before the
/// low 24 bits are kept.
fn convert_f32_to_i24_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (out, &sample) in dst.iter_mut().zip(src) {
        let truncated = (sample * gain) as i32;
        *out = truncated & 0x00FF_FFFF;
    }
}
908
/// SSE2 kernel: scales `f32` samples by `gain`, truncates toward zero and stores
/// the low 24 bits of each result, four at a time.
///
/// NOTE(review): for products outside the `i32` range the vector conversion
/// yields `0x8000_0000` (masked to `0`), whereas the scalar tail's `as` cast
/// saturates first — confirm whether callers rely on either behaviour.
///
/// # Safety
///
/// The running CPU must support SSE2, and `dst` must be at least as long as
/// `src`: vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i24_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mask = _mm_set1_epi32(0x00FF_FFFF);
    let mut i = 0usize;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, _mm_and_si128(v, mask));
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}
925
/// AVX kernel: scales `f32` samples by `gain`, truncates toward zero and stores
/// the low 24 bits of each result.
///
/// Processes eight samples per iteration, then at most one block of four, then a
/// scalar tail. The 256-bit mask is applied through the float-domain AND.
///
/// # Safety
///
/// The running CPU must support AVX, and `dst` must be at least as long as `src`:
/// vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i24_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let len = src.len();
    let wide_end = len - len % 8;
    let factor = _mm256_set1_ps(gain);
    let low24 = _mm256_castsi256_ps(_mm256_set1_epi32(0x00FF_FFFF));
    let inp = src.as_ptr();
    let out = dst.as_mut_ptr();
    for off in (0..wide_end).step_by(8) {
        let scaled = _mm256_mul_ps(_mm256_loadu_ps(inp.add(off)), factor);
        let ints = _mm256_cvttps_epi32(scaled);
        let masked = _mm256_and_ps(_mm256_castsi256_ps(ints), low24);
        _mm256_storeu_si256(out.add(off) as *mut __m256i, _mm256_castps_si256(masked));
    }
    let mut done = wide_end;
    if len - done >= 4 {
        let scaled = _mm_mul_ps(_mm_loadu_ps(inp.add(done)), _mm_set1_ps(gain));
        let masked = _mm_and_si128(_mm_cvttps_epi32(scaled), _mm_set1_epi32(0x00FF_FFFF));
        _mm_storeu_si128(out.add(done) as *mut __m128i, masked);
        done += 4;
    }
    for (s, d) in src[done..].iter().zip(dst[done..].iter_mut()) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}
952
953pub fn convert_f32_to_i32(src: &[f32], dst: &mut [i32], gain: f32) {
957 let n = src.len().min(dst.len());
958 if n == 0 {
959 return;
960 }
961 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
962 unsafe {
963 if is_x86_feature_detected!("avx") {
964 convert_f32_to_i32_avx(&src[..n], &mut dst[..n], gain);
965 return;
966 }
967 if is_x86_feature_detected!("sse2") {
968 convert_f32_to_i32_sse2(&src[..n], &mut dst[..n], gain);
969 return;
970 }
971 }
972 convert_f32_to_i32_scalar(&src[..n], &mut dst[..n], gain);
973}
974
/// Portable fallback: `dst[i] = (src[i] * gain) as i32` over the overlapping prefix.
///
/// The `as` cast truncates toward zero, saturates at the `i32` limits and maps
/// NaN to `0`.
fn convert_f32_to_i32_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = (sample * gain) as i32);
}
980
/// SSE2 kernel: scales `f32` samples by `gain` and truncates them to `i32`, four
/// at a time.
///
/// NOTE(review): for NaN or products outside the `i32` range the vector
/// conversion yields `0x8000_0000`, whereas the scalar tail's `as` cast
/// saturates (and maps NaN to 0) — confirm whether callers pre-clamp.
///
/// # Safety
///
/// The running CPU must support SSE2, and `dst` must be at least as long as
/// `src`: vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i32_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0usize;
    while i + 4 <= src.len() {
        let s = _mm_loadu_ps(src.as_ptr().add(i));
        let v = _mm_cvttps_epi32(_mm_mul_ps(s, g));
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, v);
        i += 4;
    }
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}
995
/// AVX kernel: scales `f32` samples by `gain` and truncates them to `i32`.
///
/// Processes eight samples per iteration, then at most one block of four, then a
/// scalar tail.
///
/// # Safety
///
/// The running CPU must support AVX, and `dst` must be at least as long as `src`:
/// vector stores are bounded by `src.len()` only.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i32_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let len = src.len();
    let wide_end = len - len % 8;
    let factor = _mm256_set1_ps(gain);
    let inp = src.as_ptr();
    let out = dst.as_mut_ptr();
    for off in (0..wide_end).step_by(8) {
        let scaled = _mm256_mul_ps(_mm256_loadu_ps(inp.add(off)), factor);
        _mm256_storeu_si256(out.add(off) as *mut __m256i, _mm256_cvttps_epi32(scaled));
    }
    let mut done = wide_end;
    if len - done >= 4 {
        let scaled = _mm_mul_ps(_mm_loadu_ps(inp.add(done)), _mm_set1_ps(gain));
        _mm_storeu_si128(out.add(done) as *mut __m128i, _mm_cvttps_epi32(scaled));
        done += 4;
    }
    for (s, d) in src[done..].iter().zip(dst[done..].iter_mut()) {
        *d = (*s * gain) as i32;
    }
}
1017
1018pub fn convert_f32_to_i16(src: &[f32], dst: &mut [i16], gain: f32) {
1022 let n = src.len().min(dst.len());
1023 if n == 0 {
1024 return;
1025 }
1026 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1027 unsafe {
1028 if is_x86_feature_detected!("avx") {
1029 convert_f32_to_i16_avx(&src[..n], &mut dst[..n], gain);
1030 return;
1031 }
1032 if is_x86_feature_detected!("sse2") {
1033 convert_f32_to_i16_sse2(&src[..n], &mut dst[..n], gain);
1034 return;
1035 }
1036 }
1037 convert_f32_to_i16_scalar(&src[..n], &mut dst[..n], gain);
1038}
1039
/// Portable fallback: `dst[i] = (src[i] * gain) as i16` over the overlapping prefix.
///
/// The `as` cast truncates toward zero, saturates at the `i16` limits and maps
/// NaN to `0`.
fn convert_f32_to_i16_scalar(src: &[f32], dst: &mut [i16], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = (sample * gain) as i16);
}
1045
/// SSE2 kernel: stores `(src[k] * gain) as i16` into `dst[k]` for every index
/// `k` that exists in both slices (eight lanes per step, one optional
/// four-lane step, scalar tail).
///
/// The scaled samples are sanitised before the truncating conversion: NaN
/// becomes 0 and the value is clamped to the `i16` range. Without that,
/// `_mm_cvttps_epi32` turns NaN and anything beyond the `i32` range into
/// `i32::MIN`, which packs to -32768, whereas the scalar `as i16` cast used
/// for the tail gives 0 and +/-saturation respectively.
///
/// # Safety
///
/// The CPU must support SSE2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i16_sse2(src: &[f32], dst: &mut [i16], gain: f32) {
    // Bound every raw-pointer access by the shorter slice.
    let n = src.len().min(dst.len());
    let g = _mm_set1_ps(gain);
    let lo = _mm_set1_ps(-32768.0);
    let hi = _mm_set1_ps(32767.0);
    let mut i = 0usize;
    while i + 8 <= n {
        let x0 = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i)), g);
        let x1 = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i + 4)), g);
        // NaN lanes compare unordered with themselves; mask them to +0.0.
        let x0 = _mm_and_ps(x0, _mm_cmpord_ps(x0, x0));
        let x1 = _mm_and_ps(x1, _mm_cmpord_ps(x1, x1));
        let v0 = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(x0, lo), hi));
        let v1 = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(x1, lo), hi));
        let packed = _mm_packs_epi32(v0, v1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= n {
        let x = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i)), g);
        let x = _mm_and_ps(x, _mm_cmpord_ps(x, x));
        let v = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(x, lo), hi));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        // Only the low 64 bits (four i16 values) are written.
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..n].iter().zip(dst[i..n].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}
1070
/// AVX kernel: stores `(src[k] * gain) as i16` into `dst[k]` for every index
/// `k` that exists in both slices (eight lanes per step, one optional
/// four-lane step, scalar tail).
///
/// Only AVX instructions are used: the upper 128-bit half is extracted with
/// `_mm256_extractf128_si256` rather than the AVX2-only
/// `_mm256_extracti128_si256`, so the kernel is valid on CPUs that report AVX
/// without AVX2.
///
/// The scaled samples are sanitised before the truncating conversion (NaN
/// becomes 0, values are clamped to the `i16` range) so the vector lanes give
/// the same result as the scalar `as i16` cast used for the tail.
///
/// # Safety
///
/// The CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i16_avx(src: &[f32], dst: &mut [i16], gain: f32) {
    // Bound every raw-pointer access by the shorter slice.
    let n = src.len().min(dst.len());
    let g = _mm256_set1_ps(gain);
    let lo = _mm256_set1_ps(-32768.0);
    let hi = _mm256_set1_ps(32767.0);
    let mut i = 0usize;
    while i + 8 <= n {
        let x = _mm256_mul_ps(_mm256_loadu_ps(src.as_ptr().add(i)), g);
        // NaN lanes compare unordered with themselves; mask them to +0.0.
        let x = _mm256_and_ps(x, _mm256_cmp_ps::<_CMP_ORD_Q>(x, x));
        let v = _mm256_cvttps_epi32(_mm256_min_ps(_mm256_max_ps(x, lo), hi));
        let vlo = _mm256_castsi256_si128(v);
        let vhi = _mm256_extractf128_si256::<1>(v);
        let packed = _mm_packs_epi32(vlo, vhi);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= n {
        let x = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i)), _mm_set1_ps(gain));
        let x = _mm_and_ps(x, _mm_cmpord_ps(x, x));
        let clamped = _mm_min_ps(
            _mm_max_ps(x, _mm_set1_ps(-32768.0)),
            _mm_set1_ps(32767.0),
        );
        let packed = _mm_packs_epi32(_mm_cvttps_epi32(clamped), _mm_setzero_si128());
        // Only the low 64 bits (four i16 values) are written.
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..n].iter().zip(dst[i..n].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}
1096
1097pub fn convert_f32_to_i8(src: &[f32], dst: &mut [i8], gain: f32) {
1101 let n = src.len().min(dst.len());
1102 if n == 0 {
1103 return;
1104 }
1105 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1106 unsafe {
1107 if is_x86_feature_detected!("avx") {
1108 convert_f32_to_i8_avx(&src[..n], &mut dst[..n], gain);
1109 return;
1110 }
1111 if is_x86_feature_detected!("sse2") {
1112 convert_f32_to_i8_sse2(&src[..n], &mut dst[..n], gain);
1113 return;
1114 }
1115 }
1116 convert_f32_to_i8_scalar(&src[..n], &mut dst[..n], gain);
1117}
1118
/// Portable fallback: writes `(sample * gain) as i8` for each pair of
/// elements, stopping at the end of the shorter slice.
fn convert_f32_to_i8_scalar(src: &[f32], dst: &mut [i8], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = (sample * gain) as i8);
}
1124
/// SSE2 kernel: stores `(src[k] * gain) as i8` into `dst[k]` for every index
/// `k` that exists in both slices, sixteen lanes per step with a scalar tail.
///
/// The scaled samples are sanitised before the truncating conversion: NaN
/// becomes 0 and the value is clamped to the `i8` range. Without that,
/// `_mm_cvttps_epi32` turns NaN and anything beyond the `i32` range into
/// `i32::MIN`, which packs down to -128, whereas the scalar `as i8` cast used
/// for the tail gives 0 and +/-saturation respectively.
///
/// # Safety
///
/// The CPU must support SSE2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i8_sse2(src: &[f32], dst: &mut [i8], gain: f32) {
    // Bound every raw-pointer access by the shorter slice.
    let n = src.len().min(dst.len());
    let g = _mm_set1_ps(gain);
    let lo = _mm_set1_ps(-128.0);
    let hi = _mm_set1_ps(127.0);
    let mut i = 0usize;
    while i + 16 <= n {
        let x0 = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i)), g);
        let x1 = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i + 4)), g);
        let x2 = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i + 8)), g);
        let x3 = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i + 12)), g);
        // NaN lanes compare unordered with themselves; mask them to +0.0.
        let x0 = _mm_and_ps(x0, _mm_cmpord_ps(x0, x0));
        let x1 = _mm_and_ps(x1, _mm_cmpord_ps(x1, x1));
        let x2 = _mm_and_ps(x2, _mm_cmpord_ps(x2, x2));
        let x3 = _mm_and_ps(x3, _mm_cmpord_ps(x3, x3));
        let v0 = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(x0, lo), hi));
        let v1 = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(x1, lo), hi));
        let v2 = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(x2, lo), hi));
        let v3 = _mm_cvttps_epi32(_mm_min_ps(_mm_max_ps(x3, lo), hi));
        let p0 = _mm_packs_epi32(v0, v1);
        let p1 = _mm_packs_epi32(v2, v3);
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..n].iter().zip(dst[i..n].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}
1148
/// AVX kernel: stores `(src[k] * gain) as i8` into `dst[k]` for every index
/// `k` that exists in both slices, sixteen lanes per step with a scalar tail.
///
/// Only AVX instructions are used: the upper 128-bit halves are extracted with
/// `_mm256_extractf128_si256` rather than the AVX2-only
/// `_mm256_extracti128_si256`, so the kernel is valid on CPUs that report AVX
/// without AVX2.
///
/// The scaled samples are sanitised before the truncating conversion (NaN
/// becomes 0, values are clamped to the `i8` range) so the vector lanes give
/// the same result as the scalar `as i8` cast used for the tail.
///
/// # Safety
///
/// The CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i8_avx(src: &[f32], dst: &mut [i8], gain: f32) {
    // Bound every raw-pointer access by the shorter slice.
    let n = src.len().min(dst.len());
    let g = _mm256_set1_ps(gain);
    let lo = _mm256_set1_ps(-128.0);
    let hi = _mm256_set1_ps(127.0);
    let mut i = 0usize;
    while i + 16 <= n {
        let x0 = _mm256_mul_ps(_mm256_loadu_ps(src.as_ptr().add(i)), g);
        let x1 = _mm256_mul_ps(_mm256_loadu_ps(src.as_ptr().add(i + 8)), g);
        // NaN lanes compare unordered with themselves; mask them to +0.0.
        let x0 = _mm256_and_ps(x0, _mm256_cmp_ps::<_CMP_ORD_Q>(x0, x0));
        let x1 = _mm256_and_ps(x1, _mm256_cmp_ps::<_CMP_ORD_Q>(x1, x1));
        let v0 = _mm256_cvttps_epi32(_mm256_min_ps(_mm256_max_ps(x0, lo), hi));
        let v1 = _mm256_cvttps_epi32(_mm256_min_ps(_mm256_max_ps(x1, lo), hi));
        let p0 = _mm_packs_epi32(
            _mm256_castsi256_si128(v0),
            _mm256_extractf128_si256::<1>(v0),
        );
        let p1 = _mm_packs_epi32(
            _mm256_castsi256_si128(v1),
            _mm256_extractf128_si256::<1>(v1),
        );
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..n].iter().zip(dst[i..n].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}
1169
1170pub fn apply_fade_in_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
1173 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1174 unsafe {
1175 if is_x86_feature_detected!("avx") {
1176 apply_fade_in_inplace_avx(buf, start_t, dt);
1177 return;
1178 }
1179 if is_x86_feature_detected!("sse") {
1180 apply_fade_in_inplace_sse(buf, start_t, dt);
1181 return;
1182 }
1183 }
1184 for (i, v) in buf.iter_mut().enumerate() {
1185 let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
1186 *v *= (t * std::f32::consts::FRAC_PI_2).sin();
1187 }
1188}
1189
1190pub fn apply_fade_out_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
1193 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1194 unsafe {
1195 if is_x86_feature_detected!("avx") {
1196 apply_fade_out_inplace_avx(buf, start_t, dt);
1197 return;
1198 }
1199 if is_x86_feature_detected!("sse") {
1200 apply_fade_out_inplace_sse(buf, start_t, dt);
1201 return;
1202 }
1203 }
1204 for (i, v) in buf.iter_mut().enumerate() {
1205 let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
1206 *v *= (t * std::f32::consts::FRAC_PI_2).cos();
1207 }
1208}
1209
/// AVX fade-in kernel: multiplies sample `i` of `buf` by `sin(t * PI / 2)`
/// with `t = clamp(start_t + i * dt, 0, 1)`.
///
/// The per-sample gains are computed with scalar math into a small array;
/// only the multiply is vectorised, eight samples at a time. Leftover samples
/// are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_in_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    const LANES: usize = 8;
    // Gain for the sample at absolute position `index`.
    let ramp = |index: usize| -> f32 {
        let phase = (start_t + index as f32 * dt).clamp(0.0, 1.0);
        (phase * std::f32::consts::FRAC_PI_2).sin()
    };

    let mut base = 0usize;
    let mut blocks = buf.chunks_exact_mut(LANES);
    for block in &mut blocks {
        let mut weights = [0.0f32; LANES];
        for (offset, weight) in weights.iter_mut().enumerate() {
            *weight = ramp(base + offset);
        }
        let samples = _mm256_loadu_ps(block.as_ptr());
        let scaled = _mm256_mul_ps(samples, _mm256_loadu_ps(weights.as_ptr()));
        _mm256_storeu_ps(block.as_mut_ptr(), scaled);
        base += LANES;
    }
    for (offset, sample) in blocks.into_remainder().iter_mut().enumerate() {
        *sample *= ramp(base + offset);
    }
}
1231
/// SSE fade-in kernel: multiplies sample `i` of `buf` by `sin(t * PI / 2)`
/// with `t = clamp(start_t + i * dt, 0, 1)`.
///
/// The per-sample gains are computed with scalar math into a small array;
/// only the multiply is vectorised, four samples at a time. Leftover samples
/// are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_in_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    const LANES: usize = 4;
    // Gain for the sample at absolute position `index`.
    let ramp = |index: usize| -> f32 {
        let phase = (start_t + index as f32 * dt).clamp(0.0, 1.0);
        (phase * std::f32::consts::FRAC_PI_2).sin()
    };

    let mut base = 0usize;
    let mut blocks = buf.chunks_exact_mut(LANES);
    for block in &mut blocks {
        let mut weights = [0.0f32; LANES];
        for (offset, weight) in weights.iter_mut().enumerate() {
            *weight = ramp(base + offset);
        }
        let samples = _mm_loadu_ps(block.as_ptr());
        let scaled = _mm_mul_ps(samples, _mm_loadu_ps(weights.as_ptr()));
        _mm_storeu_ps(block.as_mut_ptr(), scaled);
        base += LANES;
    }
    for (offset, sample) in blocks.into_remainder().iter_mut().enumerate() {
        *sample *= ramp(base + offset);
    }
}
1253
/// AVX fade-out kernel: multiplies sample `i` of `buf` by `cos(t * PI / 2)`
/// with `t = clamp(start_t + i * dt, 0, 1)`.
///
/// The per-sample gains are computed with scalar math into a small array;
/// only the multiply is vectorised, eight samples at a time. Leftover samples
/// are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_out_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    const LANES: usize = 8;
    // Gain for the sample at absolute position `index`.
    let ramp = |index: usize| -> f32 {
        let phase = (start_t + index as f32 * dt).clamp(0.0, 1.0);
        (phase * std::f32::consts::FRAC_PI_2).cos()
    };

    let mut base = 0usize;
    let mut blocks = buf.chunks_exact_mut(LANES);
    for block in &mut blocks {
        let mut weights = [0.0f32; LANES];
        for (offset, weight) in weights.iter_mut().enumerate() {
            *weight = ramp(base + offset);
        }
        let samples = _mm256_loadu_ps(block.as_ptr());
        let scaled = _mm256_mul_ps(samples, _mm256_loadu_ps(weights.as_ptr()));
        _mm256_storeu_ps(block.as_mut_ptr(), scaled);
        base += LANES;
    }
    for (offset, sample) in blocks.into_remainder().iter_mut().enumerate() {
        *sample *= ramp(base + offset);
    }
}
1275
/// SSE fade-out kernel: multiplies sample `i` of `buf` by `cos(t * PI / 2)`
/// with `t = clamp(start_t + i * dt, 0, 1)`.
///
/// The per-sample gains are computed with scalar math into a small array;
/// only the multiply is vectorised, four samples at a time. Leftover samples
/// are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_out_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    const LANES: usize = 4;
    // Gain for the sample at absolute position `index`.
    let ramp = |index: usize| -> f32 {
        let phase = (start_t + index as f32 * dt).clamp(0.0, 1.0);
        (phase * std::f32::consts::FRAC_PI_2).cos()
    };

    let mut base = 0usize;
    let mut blocks = buf.chunks_exact_mut(LANES);
    for block in &mut blocks {
        let mut weights = [0.0f32; LANES];
        for (offset, weight) in weights.iter_mut().enumerate() {
            *weight = ramp(base + offset);
        }
        let samples = _mm_loadu_ps(block.as_ptr());
        let scaled = _mm_mul_ps(samples, _mm_loadu_ps(weights.as_ptr()));
        _mm_storeu_ps(block.as_mut_ptr(), scaled);
        base += LANES;
    }
    for (offset, sample) in blocks.into_remainder().iter_mut().enumerate() {
        *sample *= ramp(base + offset);
    }
}
1297
/// Smoke tests for the public in-place helpers. Each test goes through the
/// public entry point, so it exercises whichever kernel the dispatcher picks
/// on the machine running the tests.
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every element of `actual` is finite and equal to the
    /// element at the same position in `expected`.
    fn assert_finite_eq(actual: &[f32], expected: &[f32]) {
        assert_eq!(actual.len(), expected.len());
        for (got, want) in actual.iter().zip(expected.iter()) {
            assert!(got.is_finite() && got == want);
        }
    }

    #[test]
    fn add_inplace_basic() {
        let addend = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        let mut acc = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        add_inplace(&mut acc, &addend);
        assert_eq!(acc, [11.0, 22.0, 33.0, 44.0, 55.0]);
    }

    #[test]
    fn mul_inplace_basic() {
        let mut samples = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        mul_inplace(&mut samples, 2.0);
        assert_eq!(samples, [2.0, 4.0, 6.0, 8.0, 10.0]);
    }

    #[test]
    fn add_scaled_inplace_basic() {
        let addend = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        let mut acc = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        add_scaled_inplace(&mut acc, &addend, 0.5);
        assert_eq!(acc, [6.0, 12.0, 18.0, 24.0, 30.0]);
    }

    #[test]
    fn copy_scaled_inplace_basic() {
        let input = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        let mut output = [0.0f32; 5];
        copy_scaled_inplace(&mut output, &input, 0.5);
        assert_eq!(output, [5.0, 10.0, 15.0, 20.0, 25.0]);
    }

    #[test]
    fn add_sanitized_inplace_basic() {
        let addend = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        let mut acc = [1.0f32, 2.0, 3.0, 4.0];
        add_sanitized_inplace(&mut acc, &addend);
        // Non-finite addends contribute nothing.
        assert_finite_eq(&acc, &[1.5, 2.0, 3.0, 5.0]);
    }

    #[test]
    fn copy_sanitized_inplace_basic() {
        let input = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        let mut output = [0.0f32; 4];
        copy_sanitized_inplace(&mut output, &input);
        // Non-finite inputs are written as zero.
        assert_finite_eq(&output, &[0.5, 0.0, 0.0, 1.0]);
    }

    #[test]
    fn sanitize_finite_inplace_basic() {
        let mut samples = [1.0f32, f32::NAN, f32::INFINITY, 4.0, f32::NEG_INFINITY];
        sanitize_finite_inplace(&mut samples);
        // Comparing against finite constants also proves each element is finite.
        assert_eq!(samples, [1.0, 0.0, 0.0, 4.0, 0.0]);
    }

    #[test]
    fn peak_abs_basic() {
        let samples = [1.0f32, -3.0, 2.0, 0.5];
        let peak = peak_abs(&samples);
        assert_eq!(peak, 3.0);
    }

    #[test]
    fn clamp_inplace_basic() {
        let mut samples = [-2.0f32, -0.5, 0.0, 0.5, 2.0];
        clamp_inplace(&mut samples, -1.0, 1.0);
        assert_eq!(samples, [-1.0, -0.5, 0.0, 0.5, 1.0]);
    }
}