1#![allow(unsafe_op_in_unsafe_fn)]
9
10#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
11mod x86 {
12 pub use std::arch::x86_64::*;
13 pub use wide::f32x4;
14}
15
16#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
17use x86::*;
18
19pub fn add_inplace(dst: &mut [f32], src: &[f32]) {
25 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
26 unsafe {
27 if is_x86_feature_detected!("avx") {
28 add_inplace_avx(dst, src);
29 return;
30 }
31 if is_x86_feature_detected!("sse") {
32 add_inplace_sse(dst, src);
33 return;
34 }
35 }
36 add_inplace_scalar(dst, src);
37}
38
39pub fn mul_inplace(dst: &mut [f32], gain: f32) {
41 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
42 unsafe {
43 if is_x86_feature_detected!("avx") {
44 mul_inplace_avx(dst, gain);
45 return;
46 }
47 if is_x86_feature_detected!("sse") {
48 mul_inplace_sse(dst, gain);
49 return;
50 }
51 }
52 mul_inplace_scalar(dst, gain);
53}
54
55pub fn add_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
57 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
58 unsafe {
59 if is_x86_feature_detected!("avx") {
60 add_scaled_inplace_avx(dst, src, gain);
61 return;
62 }
63 if is_x86_feature_detected!("sse") {
64 add_scaled_inplace_sse(dst, src, gain);
65 return;
66 }
67 }
68 add_scaled_inplace_scalar(dst, src, gain);
69}
70
71pub fn copy_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
73 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
74 unsafe {
75 if is_x86_feature_detected!("avx") {
76 copy_scaled_inplace_avx(dst, src, gain);
77 return;
78 }
79 if is_x86_feature_detected!("sse") {
80 copy_scaled_inplace_sse(dst, src, gain);
81 return;
82 }
83 }
84 copy_scaled_inplace_scalar(dst, src, gain);
85}
86
87pub fn add_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
89 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
90 unsafe {
91 if is_x86_feature_detected!("avx") {
92 add_sanitized_inplace_avx(dst, src);
93 return;
94 }
95 if is_x86_feature_detected!("sse") {
96 add_sanitized_inplace_sse(dst, src);
97 return;
98 }
99 }
100 add_sanitized_inplace_scalar(dst, src);
101}
102
103pub fn copy_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
105 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
106 unsafe {
107 if is_x86_feature_detected!("avx") {
108 copy_sanitized_inplace_avx(dst, src);
109 return;
110 }
111 if is_x86_feature_detected!("sse") {
112 copy_sanitized_inplace_sse(dst, src);
113 return;
114 }
115 }
116 copy_sanitized_inplace_scalar(dst, src);
117}
118
119pub fn sanitize_finite_inplace(buf: &mut [f32]) {
121 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
122 unsafe {
123 if is_x86_feature_detected!("avx") {
124 sanitize_finite_inplace_avx(buf);
125 return;
126 }
127 if is_x86_feature_detected!("sse") {
128 sanitize_finite_inplace_sse(buf);
129 return;
130 }
131 }
132 sanitize_finite_inplace_scalar(buf);
133}
134
135pub fn peak_abs(buf: &[f32]) -> f32 {
137 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
138 unsafe {
139 if is_x86_feature_detected!("avx") {
140 return peak_abs_avx(buf);
141 }
142 if is_x86_feature_detected!("sse") {
143 return peak_abs_sse(buf);
144 }
145 }
146 peak_abs_scalar(buf)
147}
148
149pub fn clamp_inplace(buf: &mut [f32], min: f32, max: f32) {
151 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
152 unsafe {
153 if is_x86_feature_detected!("avx") {
154 clamp_inplace_avx(buf, min, max);
155 return;
156 }
157 if is_x86_feature_detected!("sse") {
158 clamp_inplace_sse(buf, min, max);
159 return;
160 }
161 }
162 clamp_inplace_scalar(buf, min, max);
163}
164
/// Scalar fallback for `add_inplace`: dst[i] += src[i] over the common prefix.
fn add_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    dst.iter_mut().zip(src).for_each(|(d, s)| *d += s);
}
174
/// Scalar fallback for `mul_inplace`: multiplies every sample by `gain`.
fn mul_inplace_scalar(dst: &mut [f32], gain: f32) {
    for sample in dst {
        *sample *= gain;
    }
}
180
/// Scalar fallback for `add_scaled_inplace`: dst[i] += src[i] * gain.
fn add_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    dst.iter_mut().zip(src).for_each(|(d, &s)| *d += s * gain);
}
186
/// Scalar fallback for `copy_scaled_inplace`: dst[i] = src[i] * gain.
fn copy_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    dst.iter_mut().zip(src).for_each(|(d, &s)| *d = s * gain);
}
192
/// Scalar fallback for `add_sanitized_inplace`: non-finite source samples
/// contribute 0.0 instead of poisoning the accumulator.
fn add_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(d, &s)| *d += if s.is_finite() { s } else { 0.0 });
}
198
/// Scalar fallback for `copy_sanitized_inplace`: non-finite source samples
/// are written as 0.0.
fn copy_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(d, &s)| *d = if s.is_finite() { s } else { 0.0 });
}
204
/// Scalar fallback for `sanitize_finite_inplace`: zeroes NaN/±Inf samples.
fn sanitize_finite_inplace_scalar(buf: &mut [f32]) {
    buf.iter_mut()
        .filter(|v| !v.is_finite())
        .for_each(|v| *v = 0.0);
}
212
/// Scalar fallback for `peak_abs`: largest |sample|, 0.0 for an empty slice.
/// `f32::max` ignores a NaN operand, so NaN samples do not dominate.
fn peak_abs_scalar(buf: &[f32]) -> f32 {
    let mut peak = 0.0f32;
    for s in buf {
        peak = peak.max(s.abs());
    }
    peak
}
216
/// Scalar fallback for `clamp_inplace`: clamps each sample to [min, max].
fn clamp_inplace_scalar(buf: &mut [f32], min: f32, max: f32) {
    buf.iter_mut().for_each(|v| *v = v.clamp(min, max));
}
222
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
/// SSE-level path for `add_inplace`: 4 lanes per iteration via the portable
/// `wide::f32x4` type, scalar loop for the 0..3 leftover samples.
unsafe fn add_inplace_sse(dst: &mut [f32], src: &[f32]) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        let r = d + s;
        d_chunk.copy_from_slice(&r.to_array());
    }
    // Tail: fewer than 4 samples remain.
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d += *s;
    }
}
245
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
/// SSE-level path for `mul_inplace`: 4 lanes per iteration with a splatted
/// gain vector, scalar loop for the leftover samples.
unsafe fn mul_inplace_sse(dst: &mut [f32], gain: f32) {
    let n = dst.len() / 4;
    // Broadcast the gain into all four lanes once, outside the loop.
    let g: f32x4 = [gain; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst[i * 4..(i + 1) * 4];
        let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
        let r = d * g;
        d_chunk.copy_from_slice(&r.to_array());
    }
    // Tail: fewer than 4 samples remain.
    for d in &mut dst[n * 4..] {
        *d *= gain;
    }
}
260
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
/// SSE-level path for `add_scaled_inplace`: dst += src * gain, 4 lanes per
/// iteration, scalar tail.
unsafe fn add_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    let g: f32x4 = [gain; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        let r = d + s * g;
        d_chunk.copy_from_slice(&r.to_array());
    }
    // Tail: fewer than 4 samples remain.
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d += *s * gain;
    }
}
280
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
/// SSE-level path for `copy_scaled_inplace`: dst = src * gain, 4 lanes per
/// iteration, scalar tail.
unsafe fn copy_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    let g: f32x4 = [gain; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        let r = s * g;
        d_chunk.copy_from_slice(&r.to_array());
    }
    // Tail: fewer than 4 samples remain.
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d = *s * gain;
    }
}
299
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
/// SSE-level path for `add_sanitized_inplace`: non-finite source lanes are
/// replaced with 0.0 before accumulating.
unsafe fn add_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    let zero: f32x4 = [0.0; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let d: f32x4 = [d_chunk[0], d_chunk[1], d_chunk[2], d_chunk[3]].into();
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        // Lane mask of finite values (per `wide`'s is_finite); blend keeps
        // finite lanes of `s` and substitutes 0.0 elsewhere.
        let mask = s.is_finite();
        let sanitized = mask.blend(s, zero);
        let r = d + sanitized;
        d_chunk.copy_from_slice(&r.to_array());
    }
    // Tail: fewer than 4 samples remain.
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}
321
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
/// SSE-level path for `copy_sanitized_inplace`: copies `src` into `dst`,
/// writing 0.0 for non-finite source lanes.
unsafe fn copy_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let n = dst_head.len() / 4;
    let zero: f32x4 = [0.0; 4].into();
    for i in 0..n {
        let d_chunk = &mut dst_head[i * 4..(i + 1) * 4];
        let s_chunk = &src_head[i * 4..(i + 1) * 4];
        let s: f32x4 = [s_chunk[0], s_chunk[1], s_chunk[2], s_chunk[3]].into();
        // Keep finite lanes of `s`; substitute 0.0 for NaN/±Inf lanes.
        let mask = s.is_finite();
        let r = mask.blend(s, zero);
        d_chunk.copy_from_slice(&r.to_array());
    }
    // Tail: fewer than 4 samples remain.
    for (d, s) in dst_head[n * 4..].iter_mut().zip(src_head[n * 4..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}
341
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
/// SSE-level path for `sanitize_finite_inplace`: zeroes NaN/±Inf lanes,
/// 4 samples per iteration, scalar tail.
unsafe fn sanitize_finite_inplace_sse(buf: &mut [f32]) {
    let n = buf.len() / 4;
    let zero: f32x4 = [0.0; 4].into();
    for i in 0..n {
        let chunk = &mut buf[i * 4..(i + 1) * 4];
        let v: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        // Keep finite lanes; substitute 0.0 for NaN/±Inf lanes.
        let mask = v.is_finite();
        let r = mask.blend(v, zero);
        chunk.copy_from_slice(&r.to_array());
    }
    // Tail: fewer than 4 samples remain.
    for s in &mut buf[n * 4..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}
359
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
/// SSE-level path for `peak_abs`: keeps a running 4-lane max of |sample|,
/// then reduces the lanes and folds in the scalar tail.
unsafe fn peak_abs_sse(buf: &[f32]) -> f32 {
    let n = buf.len() / 4;
    let mut peak: f32x4 = [0.0; 4].into();
    for i in 0..n {
        let chunk = &buf[i * 4..(i + 1) * 4];
        let v: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        peak = peak.max(v.abs());
    }
    // Horizontal reduction of the 4 lanes into one scalar.
    let mut max_scalar = peak.to_array().into_iter().fold(0.0f32, |a, b| a.max(b));
    for s in &buf[n * 4..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}
375
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
/// SSE-level path for `clamp_inplace`: clamps 4 samples per iteration to
/// [min, max], scalar tail for the remainder.
unsafe fn clamp_inplace_sse(buf: &mut [f32], min: f32, max: f32) {
    let n = buf.len() / 4;
    let vmin: f32x4 = [min; 4].into();
    let vmax: f32x4 = [max; 4].into();
    for i in 0..n {
        let chunk = &mut buf[i * 4..(i + 1) * 4];
        let v: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        let r = v.clamp(vmin, vmax);
        chunk.copy_from_slice(&r.to_array());
    }
    // Tail: fewer than 4 samples remain.
    for s in &mut buf[n * 4..] {
        *s = s.clamp(min, max);
    }
}
391
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path for `add_inplace`: 8 lanes per iteration with unaligned
/// loads/stores, scalar loop for the 0..7 leftover samples.
///
/// # Safety
/// The caller must ensure the CPU supports AVX (the dispatcher checks this).
unsafe fn add_inplace_avx(dst: &mut [f32], src: &[f32]) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, s);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s;
    }
}
414
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path for `mul_inplace`: 8 lanes per iteration with a broadcast gain,
/// scalar tail.
///
/// # Safety
/// The caller must ensure the CPU supports AVX (the dispatcher checks this).
unsafe fn mul_inplace_avx(dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst.len() {
        let d = _mm256_loadu_ps(dst.as_ptr().add(i));
        let r = _mm256_mul_ps(d, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for d in &mut dst[i..] {
        *d *= gain;
    }
}
430
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path for `add_scaled_inplace`: dst += src * gain, 8 lanes per
/// iteration, scalar tail.
///
/// # Safety
/// The caller must ensure the CPU supports AVX (the dispatcher checks this).
unsafe fn add_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_add_ps(d, _mm256_mul_ps(s, g));
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += *s * gain;
    }
}
450
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path for `copy_scaled_inplace`: dst = src * gain, 8 lanes per
/// iteration, scalar tail.
///
/// # Safety
/// The caller must ensure the CPU supports AVX (the dispatcher checks this).
unsafe fn copy_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        let r = _mm256_mul_ps(s, g);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = *s * gain;
    }
}
469
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path for `add_sanitized_inplace`: non-finite source lanes contribute
/// 0.0. Finiteness test: |x| <= f32::MAX with an ordered compare is true
/// only for finite x (NaN fails the ordered compare; ±Inf exceeds MAX).
///
/// # Safety
/// The caller must ensure the CPU supports AVX (the dispatcher checks this).
unsafe fn add_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let d = _mm256_loadu_ps(dst_head.as_ptr().add(i));
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        // andnot with -0.0 clears the sign bit, i.e. |s|.
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps(abs_s, max_val, _CMP_LE_OQ);
        // blendv keeps `s` where the mask is set, 0.0 elsewhere.
        let sanitized = _mm256_blendv_ps(zero, s, mask);
        let r = _mm256_add_ps(d, sanitized);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d += if s.is_finite() { *s } else { 0.0 };
    }
}
493
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path for `copy_sanitized_inplace`: copies `src` into `dst`, writing
/// 0.0 for non-finite lanes (same |x| <= MAX ordered-compare trick as
/// `add_sanitized_inplace_avx`).
///
/// # Safety
/// The caller must ensure the CPU supports AVX (the dispatcher checks this).
unsafe fn copy_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    // Only the common prefix of the two slices is processed.
    let len = dst.len().min(src.len());
    let dst_head = &mut dst[..len];
    let src_head = &src[..len];
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= dst_head.len() {
        let s = _mm256_loadu_ps(src_head.as_ptr().add(i));
        // andnot with -0.0 clears the sign bit, i.e. |s|.
        let abs_s = _mm256_andnot_ps(_mm256_set1_ps(-0.0), s);
        let mask = _mm256_cmp_ps(abs_s, max_val, _CMP_LE_OQ);
        let r = _mm256_blendv_ps(zero, s, mask);
        _mm256_storeu_ps(dst_head.as_mut_ptr().add(i), r);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for (d, s) in dst_head[i..].iter_mut().zip(src_head[i..].iter()) {
        *d = if s.is_finite() { *s } else { 0.0 };
    }
}
515
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path for `sanitize_finite_inplace`: zeroes NaN/±Inf lanes in place
/// (|x| <= MAX ordered-compare selects only finite lanes).
///
/// # Safety
/// The caller must ensure the CPU supports AVX (the dispatcher checks this).
unsafe fn sanitize_finite_inplace_avx(buf: &mut [f32]) {
    let zero = _mm256_setzero_ps();
    let max_val = _mm256_set1_ps(f32::MAX);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        // andnot with -0.0 clears the sign bit, i.e. |v|.
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        let mask = _mm256_cmp_ps(abs_v, max_val, _CMP_LE_OQ);
        let r = _mm256_blendv_ps(zero, v, mask);
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for s in &mut buf[i..] {
        if !s.is_finite() {
            *s = 0.0;
        }
    }
}
536
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path for `peak_abs`: running 8-lane max of |sample|, then a scalar
/// horizontal reduction plus the leftover tail.
///
/// # Safety
/// The caller must ensure the CPU supports AVX (the dispatcher checks this).
unsafe fn peak_abs_avx(buf: &[f32]) -> f32 {
    let mut peak = _mm256_setzero_ps();
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        // andnot with -0.0 clears the sign bit, i.e. |v|.
        let abs_v = _mm256_andnot_ps(_mm256_set1_ps(-0.0), v);
        peak = _mm256_max_ps(peak, abs_v);
        i += 8;
    }
    // Spill the 8 lanes and reduce them on the scalar side.
    let mut arr = [0.0f32; 8];
    _mm256_storeu_ps(arr.as_mut_ptr(), peak);
    let mut max_scalar = arr.iter().fold(0.0f32, |a, &b| a.max(b));
    for s in &buf[i..] {
        max_scalar = max_scalar.max(s.abs());
    }
    max_scalar
}
556
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path for `clamp_inplace`: clamps 8 lanes per iteration via
/// max(min(v, vmax), vmin), scalar tail for the remainder.
///
/// # Safety
/// The caller must ensure the CPU supports AVX (the dispatcher checks this).
unsafe fn clamp_inplace_avx(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm256_set1_ps(min);
    let vmax = _mm256_set1_ps(max);
    let mut i = 0;
    while i + 8 <= buf.len() {
        let v = _mm256_loadu_ps(buf.as_ptr().add(i));
        let r = _mm256_max_ps(vmin, _mm256_min_ps(vmax, v));
        _mm256_storeu_ps(buf.as_mut_ptr().add(i), r);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for s in &mut buf[i..] {
        *s = s.clamp(min, max);
    }
}
573
#[cfg(test)]
mod tests {
    // These tests call the public dispatchers, so they exercise whichever
    // SIMD path (AVX / SSE / scalar) the host CPU selects at runtime.
    use super::*;

    #[test]
    fn add_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_inplace(&mut a, &b);
        assert_eq!(a, [11.0, 22.0, 33.0, 44.0, 55.0]);
    }

    #[test]
    fn mul_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        mul_inplace(&mut a, 2.0);
        assert_eq!(a, [2.0, 4.0, 6.0, 8.0, 10.0]);
    }

    #[test]
    fn add_scaled_inplace_basic() {
        let mut a = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        add_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [6.0, 12.0, 18.0, 24.0, 30.0]);
    }

    #[test]
    fn copy_scaled_inplace_basic() {
        let mut a = [0.0f32; 5];
        let b = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        copy_scaled_inplace(&mut a, &b, 0.5);
        assert_eq!(a, [5.0, 10.0, 15.0, 20.0, 25.0]);
    }

    #[test]
    fn add_sanitized_inplace_basic() {
        // NaN and Inf sources must contribute 0.0, leaving dst unchanged.
        let mut a = [1.0f32, 2.0, 3.0, 4.0];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        add_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 1.5);
        assert!(a[1].is_finite() && a[1] == 2.0);
        assert!(a[2].is_finite() && a[2] == 3.0);
        assert!(a[3].is_finite() && a[3] == 5.0);
    }

    #[test]
    fn copy_sanitized_inplace_basic() {
        // NaN and Inf sources must be written as 0.0.
        let mut a = [0.0f32; 4];
        let b = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        copy_sanitized_inplace(&mut a, &b);
        assert!(a[0].is_finite() && a[0] == 0.5);
        assert!(a[1].is_finite() && a[1] == 0.0);
        assert!(a[2].is_finite() && a[2] == 0.0);
        assert!(a[3].is_finite() && a[3] == 1.0);
    }

    #[test]
    fn sanitize_finite_inplace_basic() {
        let mut a = [1.0f32, f32::NAN, f32::INFINITY, 4.0, f32::NEG_INFINITY];
        sanitize_finite_inplace(&mut a);
        assert!(a[0].is_finite() && a[0] == 1.0);
        assert_eq!(a[1], 0.0);
        assert_eq!(a[2], 0.0);
        assert!(a[3].is_finite() && a[3] == 4.0);
        assert_eq!(a[4], 0.0);
    }

    #[test]
    fn peak_abs_basic() {
        let a = [1.0f32, -3.0, 2.0, 0.5];
        assert_eq!(peak_abs(&a), 3.0);
    }

    #[test]
    fn clamp_inplace_basic() {
        let mut a = [-2.0f32, -0.5, 0.0, 0.5, 2.0];
        clamp_inplace(&mut a, -1.0, 1.0);
        assert_eq!(a, [-1.0, -0.5, 0.0, 0.5, 1.0]);
    }
}
655
656pub fn convert_i32_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
663 let n = src.len().min(dst.len());
664 if n == 0 {
665 return;
666 }
667 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
668 unsafe {
669 if is_x86_feature_detected!("avx") {
670 convert_i32_to_f32_avx(&src[..n], &mut dst[..n], gain);
671 return;
672 }
673 if is_x86_feature_detected!("sse") {
674 convert_i32_to_f32_sse(&src[..n], &mut dst[..n], gain);
675 return;
676 }
677 }
678 convert_i32_to_f32_scalar(&src[..n], &mut dst[..n], gain);
679}
680
/// Scalar fallback: dst[i] = src[i] as f32 * gain.
fn convert_i32_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    src.iter()
        .zip(dst.iter_mut())
        .for_each(|(&s, d)| *d = s as f32 * gain);
}
686
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
// The body uses SSE2 intrinsics (_mm_loadu_si128, _mm_cvtepi32_ps); declare
// that so the compiler may generate/inline SSE2 code for this function.
#[target_feature(enable = "sse2")]
/// SSE2 path: converts 4 `i32` samples per iteration to `f32` and scales by
/// `gain`; scalar loop for the 0..3 leftover samples.
///
/// # Safety
/// The caller must verify SSE2 support before calling (the dispatcher does).
/// Unaligned slices are fine: loads/stores use the unaligned variants.
unsafe fn convert_i32_to_f32_sse(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        let s = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let f = _mm_cvtepi32_ps(s);
        let r = _mm_mul_ps(f, g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 4;
    }
    // Tail: fewer than 4 samples remain.
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
702
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
/// AVX path: converts 8 `i32` samples per iteration to `f32` and scales by
/// `gain`; scalar loop for the 0..7 leftover samples.
///
/// # Safety
/// The caller must verify AVX support before calling (the dispatcher does).
/// Unaligned slices are fine: loads/stores use the unaligned variants.
unsafe fn convert_i32_to_f32_avx(src: &[i32], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        let s = _mm256_loadu_si256(src.as_ptr().add(i) as *const __m256i);
        let f = _mm256_cvtepi32_ps(s);
        let r = _mm256_mul_ps(f, g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), r);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
719
720pub fn convert_i16_to_f32(src: &[i16], dst: &mut [f32], gain: f32) {
723 let n = src.len().min(dst.len());
724 if n == 0 {
725 return;
726 }
727 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
728 unsafe {
729 if is_x86_feature_detected!("avx2") {
730 convert_i16_to_f32_avx2(&src[..n], &mut dst[..n], gain);
731 return;
732 }
733 if is_x86_feature_detected!("sse4.1") {
734 convert_i16_to_f32_sse41(&src[..n], &mut dst[..n], gain);
735 return;
736 }
737 }
738 convert_i16_to_f32_scalar(&src[..n], &mut dst[..n], gain);
739}
740
/// Scalar fallback: dst[i] = src[i] as f32 * gain.
fn convert_i16_to_f32_scalar(src: &[i16], dst: &mut [f32], gain: f32) {
    src.iter()
        .zip(dst.iter_mut())
        .for_each(|(&s, d)| *d = s as f32 * gain);
}
746
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
// _mm_cvtepi16_epi32 is an SSE4.1 intrinsic; declare the feature so the
// compiler may generate/inline SSE4.1 code for this function.
#[target_feature(enable = "sse4.1")]
/// SSE4.1 path: converts 8 `i16` samples per iteration (two sign-extended
/// halves of one 128-bit load) to `f32` and scales; scalar tail.
///
/// # Safety
/// The caller must verify SSE4.1 support before calling (the dispatcher
/// does). Unaligned slices are fine: loads/stores use unaligned variants.
unsafe fn convert_i16_to_f32_sse41(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        // One 128-bit load holds 8 i16; sign-extend the low 4 directly and
        // the high 4 after shifting them down by 8 bytes.
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128(bytes, 8));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), g);
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
765
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
/// AVX2 path: converts 16 `i16` samples per iteration to `f32` and scales,
/// then an 8-sample SSE-width step and a scalar tail.
///
/// Bug fix: `_mm256_cvtepi16_epi32` consumes a full 128-bit register
/// (8 i16), so converting 16 samples requires TWO 128-bit loads. The old
/// code loaded once and reused the byte-shifted upper half, which wrote
/// src[i+4..i+8] again (and zeros) into dst[i+8..i+16].
///
/// # Safety
/// The caller must verify AVX2 support before calling (the dispatcher
/// does). Unaligned slices are fine: loads/stores use unaligned variants.
unsafe fn convert_i16_to_f32_avx2(src: &[i16], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 16 <= src.len() {
        // Two independent 8-sample loads: src[i..i+8] and src[i+8..i+16].
        let lo16 = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let hi16 = _mm_loadu_si128(src.as_ptr().add(i + 8) as *const __m128i);
        let low_f = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(lo16)), g);
        let high_f = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(hi16)), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i + 8), high_f);
        i += 16;
    }
    if i + 8 <= src.len() {
        // One SSE-width step: 8 i16 -> two sign-extended halves of 4.
        let bytes = _mm_loadu_si128(src.as_ptr().add(i) as *const __m128i);
        let low = _mm_cvtepi16_epi32(bytes);
        let high = _mm_cvtepi16_epi32(_mm_srli_si128(bytes, 8));
        let low_f = _mm_mul_ps(_mm_cvtepi32_ps(low), _mm_set1_ps(gain));
        let high_f = _mm_mul_ps(_mm_cvtepi32_ps(high), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), low_f);
        _mm_storeu_ps(dst.as_mut_ptr().add(i + 4), high_f);
        i += 8;
    }
    // Tail: fewer than 8 samples remain.
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
795
796pub fn convert_i8_to_f32(src: &[i8], dst: &mut [f32], gain: f32) {
799 let n = src.len().min(dst.len());
800 if n == 0 {
801 return;
802 }
803 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
804 unsafe {
805 if is_x86_feature_detected!("avx2") {
806 convert_i8_to_f32_avx2(&src[..n], &mut dst[..n], gain);
807 return;
808 }
809 if is_x86_feature_detected!("sse4.1") {
810 convert_i8_to_f32_sse41(&src[..n], &mut dst[..n], gain);
811 return;
812 }
813 }
814 convert_i8_to_f32_scalar(&src[..n], &mut dst[..n], gain);
815}
816
/// Scalar fallback: dst[i] = src[i] as f32 * gain.
fn convert_i8_to_f32_scalar(src: &[i8], dst: &mut [f32], gain: f32) {
    src.iter()
        .zip(dst.iter_mut())
        .for_each(|(&s, d)| *d = s as f32 * gain);
}
822
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
// _mm_cvtepi8_epi32 is an SSE4.1 intrinsic; declare the feature so the
// compiler may generate/inline SSE4.1 code for this function.
#[target_feature(enable = "sse4.1")]
/// SSE4.1 path: converts 4 `i8` samples per iteration to `f32` and scales;
/// scalar loop for the 0..3 leftover samples.
///
/// # Safety
/// The caller must verify SSE4.1 support before calling (the dispatcher
/// does). Slices may be arbitrarily aligned.
unsafe fn convert_i8_to_f32_sse41(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm_set1_ps(gain);
    let mut i = 0;
    while i + 4 <= src.len() {
        // `src` is only 1-byte aligned, so a plain `*(ptr as *const i32)`
        // dereference would be UB; read_unaligned is the sound equivalent.
        let packed = (src.as_ptr().add(i) as *const i32).read_unaligned();
        let bytes = _mm_cvtsi32_si128(packed);
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), g);
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    // Tail: fewer than 4 samples remain.
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
838
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
/// AVX2 path: converts 8 `i8` samples per iteration to `f32` and scales,
/// then a 4-sample SSE step and a scalar tail.
///
/// Fixes two defects in the previous version: `_mm_cvtsi64_si128` only
/// exists on x86_64 (breaking the 32-bit `x86` target this cfg allows), and
/// `*(ptr as *const i64)` dereferenced a misaligned pointer, which is UB.
/// `_mm_loadl_epi64` performs an unaligned 8-byte load on both targets.
///
/// # Safety
/// The caller must verify AVX2 support before calling (the dispatcher
/// does). Slices may be arbitrarily aligned.
unsafe fn convert_i8_to_f32_avx2(src: &[i8], dst: &mut [f32], gain: f32) {
    let g = _mm256_set1_ps(gain);
    let mut i = 0;
    while i + 8 <= src.len() {
        // Unaligned 64-bit load of 8 packed i8 into the low half.
        let bytes = _mm_loadl_epi64(src.as_ptr().add(i) as *const __m128i);
        let i32s = _mm256_cvtepi8_epi32(bytes);
        let f32s = _mm256_mul_ps(_mm256_cvtepi32_ps(i32s), g);
        _mm256_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 8;
    }
    if i + 4 <= src.len() {
        // 4-sample SSE step; read_unaligned because `src` has no i32
        // alignment guarantee.
        let packed = (src.as_ptr().add(i) as *const i32).read_unaligned();
        let bytes = _mm_cvtsi32_si128(packed);
        let i32s = _mm_cvtepi8_epi32(bytes);
        let f32s = _mm_mul_ps(_mm_cvtepi32_ps(i32s), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(i), f32s);
        i += 4;
    }
    // Tail: fewer than 4 samples remain.
    for (s, d) in src[i..].iter().zip(dst[i..].iter_mut()) {
        *d = *s as f32 * gain;
    }
}
862
/// Applies an equal-power (quarter-sine) fade-in: sample k is scaled by
/// sin(clamp(start_t + k*dt, 0, 1) * PI/2). Processes 4 samples per
/// iteration via `wide::f32x4`, with a scalar tail.
pub fn apply_fade_in_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    use wide::f32x4;
    let pi_2 = f32x4::splat(std::f32::consts::FRAC_PI_2);
    // Per-lane time offsets within one 4-sample chunk.
    let dt_vec = f32x4::from([0.0, dt, 2.0 * dt, 3.0 * dt]);
    let mut i = 0;
    while i + 4 <= buf.len() {
        let base_t = f32x4::splat(start_t + i as f32 * dt);
        // Clamp t to [0, 1] so the ramp holds at unity once complete.
        let t = (base_t + dt_vec).clamp(f32x4::splat(0.0), f32x4::splat(1.0));
        let gain = (t * pi_2).sin();
        let chunk = &mut buf[i..i + 4];
        let s: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        let r = s * gain;
        chunk.copy_from_slice(&r.to_array());
    }
    // Tail: same ramp computed scalar-wise for the last 0..3 samples.
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (((i + j) as f32 * dt) + start_t).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).sin();
    }
}
889
/// Applies an equal-power (quarter-cosine) fade-out: sample k is scaled by
/// cos(clamp(start_t + k*dt, 0, 1) * PI/2). Processes 4 samples per
/// iteration via `wide::f32x4`, with a scalar tail.
pub fn apply_fade_out_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
    use wide::f32x4;
    let pi_2 = f32x4::splat(std::f32::consts::FRAC_PI_2);
    // Per-lane time offsets within one 4-sample chunk.
    let dt_vec = f32x4::from([0.0, dt, 2.0 * dt, 3.0 * dt]);
    let mut i = 0;
    while i + 4 <= buf.len() {
        let base_t = f32x4::splat(start_t + i as f32 * dt);
        // Clamp t to [0, 1] so the ramp holds at silence once complete.
        let t = (base_t + dt_vec).clamp(f32x4::splat(0.0), f32x4::splat(1.0));
        let gain = (t * pi_2).cos();
        let chunk = &mut buf[i..i + 4];
        let s: f32x4 = [chunk[0], chunk[1], chunk[2], chunk[3]].into();
        let r = s * gain;
        chunk.copy_from_slice(&r.to_array());
    }
    // Tail: same ramp computed scalar-wise for the last 0..3 samples.
    for (j, v) in buf[i..].iter_mut().enumerate() {
        let t = (((i + j) as f32 * dt) + start_t).clamp(0.0, 1.0);
        *v *= (t * std::f32::consts::FRAC_PI_2).cos();
    }
}