1#![allow(unsafe_op_in_unsafe_fn)]
2
/// Re-exports the SIMD intrinsics of whichever x86 flavour is being compiled.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
mod x86 {
    // `std::arch::x86_64` only exists on 64-bit targets, so each architecture
    // has to pull the intrinsics from its own module.
    #[cfg(target_arch = "x86")]
    pub use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    pub use std::arch::x86_64::*;
}
7
8#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
9use x86::*;
10
11pub fn add_inplace(dst: &mut [f32], src: &[f32]) {
12 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
13 unsafe {
14 if is_x86_feature_detected!("avx") {
15 add_inplace_avx(dst, src);
16 return;
17 }
18 if is_x86_feature_detected!("sse") {
19 add_inplace_sse(dst, src);
20 return;
21 }
22 }
23 add_inplace_scalar(dst, src);
24}
25
26pub fn mul_inplace(dst: &mut [f32], gain: f32) {
27 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
28 unsafe {
29 if is_x86_feature_detected!("avx") {
30 mul_inplace_avx(dst, gain);
31 return;
32 }
33 if is_x86_feature_detected!("sse") {
34 mul_inplace_sse(dst, gain);
35 return;
36 }
37 }
38 mul_inplace_scalar(dst, gain);
39}
40
41pub fn add_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
42 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
43 unsafe {
44 if is_x86_feature_detected!("avx") && is_x86_feature_detected!("fma") {
45 add_scaled_inplace_avx_fma(dst, src, gain);
46 return;
47 }
48 if is_x86_feature_detected!("avx") {
49 add_scaled_inplace_avx(dst, src, gain);
50 return;
51 }
52 if is_x86_feature_detected!("sse") {
53 add_scaled_inplace_sse(dst, src, gain);
54 return;
55 }
56 }
57 add_scaled_inplace_scalar(dst, src, gain);
58}
59
60pub fn copy_scaled_inplace(dst: &mut [f32], src: &[f32], gain: f32) {
61 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
62 unsafe {
63 if is_x86_feature_detected!("avx") {
64 copy_scaled_inplace_avx(dst, src, gain);
65 return;
66 }
67 if is_x86_feature_detected!("sse") {
68 copy_scaled_inplace_sse(dst, src, gain);
69 return;
70 }
71 }
72 copy_scaled_inplace_scalar(dst, src, gain);
73}
74
75pub fn add_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
76 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
77 unsafe {
78 if is_x86_feature_detected!("avx") {
79 add_sanitized_inplace_avx(dst, src);
80 return;
81 }
82 if is_x86_feature_detected!("sse") {
83 add_sanitized_inplace_sse(dst, src);
84 return;
85 }
86 }
87 add_sanitized_inplace_scalar(dst, src);
88}
89
90pub fn copy_sanitized_inplace(dst: &mut [f32], src: &[f32]) {
91 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
92 unsafe {
93 if is_x86_feature_detected!("avx") {
94 copy_sanitized_inplace_avx(dst, src);
95 return;
96 }
97 if is_x86_feature_detected!("sse") {
98 copy_sanitized_inplace_sse(dst, src);
99 return;
100 }
101 }
102 copy_sanitized_inplace_scalar(dst, src);
103}
104
105pub fn sanitize_finite_inplace(buf: &mut [f32]) {
106 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
107 unsafe {
108 if is_x86_feature_detected!("avx") {
109 sanitize_finite_inplace_avx(buf);
110 return;
111 }
112 if is_x86_feature_detected!("sse") {
113 sanitize_finite_inplace_sse(buf);
114 return;
115 }
116 }
117 sanitize_finite_inplace_scalar(buf);
118}
119
120pub fn peak_abs(buf: &[f32]) -> f32 {
121 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
122 unsafe {
123 if is_x86_feature_detected!("avx") {
124 return peak_abs_avx(buf);
125 }
126 if is_x86_feature_detected!("sse") {
127 return peak_abs_sse(buf);
128 }
129 }
130 peak_abs_scalar(buf)
131}
132
133pub fn clamp_inplace(buf: &mut [f32], min: f32, max: f32) {
134 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
135 unsafe {
136 if is_x86_feature_detected!("avx") {
137 clamp_inplace_avx(buf, min, max);
138 return;
139 }
140 if is_x86_feature_detected!("sse") {
141 clamp_inplace_sse(buf, min, max);
142 return;
143 }
144 }
145 clamp_inplace_scalar(buf, min, max);
146}
147
/// Portable fallback: `dst[i] += src[i]` over the common prefix of both slices.
fn add_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out += sample);
}
153
/// Portable fallback: scales every element of `dst` by `gain`.
fn mul_inplace_scalar(dst: &mut [f32], gain: f32) {
    dst.iter_mut().for_each(|sample| *sample *= gain);
}
159
/// Portable fallback: `dst[i] += src[i] * gain` over the common prefix.
fn add_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out += sample * gain);
}
165
/// Portable fallback: `dst[i] = src[i] * gain` over the common prefix.
fn copy_scaled_inplace_scalar(dst: &mut [f32], src: &[f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = sample * gain);
}
171
/// Portable fallback: adds `src` into `dst`, counting NaN/infinite source
/// samples as `0.0`.
fn add_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    let count = dst.len().min(src.len());
    for idx in 0..count {
        let sample = src[idx];
        let cleaned = if sample.is_finite() { sample } else { 0.0 };
        dst[idx] += cleaned;
    }
}
177
/// Portable fallback: copies `src` into `dst`, writing `0.0` for NaN/infinite
/// source samples.
fn copy_sanitized_inplace_scalar(dst: &mut [f32], src: &[f32]) {
    let count = dst.len().min(src.len());
    for idx in 0..count {
        let sample = src[idx];
        dst[idx] = if sample.is_finite() { sample } else { 0.0 };
    }
}
183
/// Portable fallback: overwrites every NaN or infinite sample with `0.0`.
fn sanitize_finite_inplace_scalar(buf: &mut [f32]) {
    for bad in buf.iter_mut().filter(|sample| !sample.is_finite()) {
        *bad = 0.0;
    }
}
191
/// Portable fallback: largest absolute value in `buf`, starting from `0.0`.
///
/// `f32::max` returns the non-NaN operand, so NaN samples are ignored.
fn peak_abs_scalar(buf: &[f32]) -> f32 {
    let mut peak = 0.0f32;
    for sample in buf {
        peak = peak.max(sample.abs());
    }
    peak
}
195
/// Portable fallback: limits each sample to `min..=max` via `f32::clamp`.
fn clamp_inplace_scalar(buf: &mut [f32], min: f32, max: f32) {
    buf.iter_mut()
        .for_each(|sample| *sample = sample.clamp(min, max));
}
201
/// SSE kernel: `dst[i] += src[i]` over the common prefix, four lanes at a time.
///
/// # Safety
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 4;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    for (out, inp) in dst_vec.chunks_exact_mut(4).zip(src_vec.chunks_exact(4)) {
        let sum = _mm_add_ps(_mm_loadu_ps(out.as_ptr()), _mm_loadu_ps(inp.as_ptr()));
        _mm_storeu_ps(out.as_mut_ptr(), sum);
    }
    // Fewer than four samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        *out += *inp;
    }
}
220
/// SSE kernel: scales every element of `dst` by `gain`, four lanes at a time.
///
/// # Safety
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn mul_inplace_sse(dst: &mut [f32], gain: f32) {
    let factor = _mm_set1_ps(gain);
    let mut quads = dst.chunks_exact_mut(4);
    for quad in quads.by_ref() {
        let scaled = _mm_mul_ps(_mm_loadu_ps(quad.as_ptr()), factor);
        _mm_storeu_ps(quad.as_mut_ptr(), scaled);
    }
    // Fewer than four samples remain; finish them one by one.
    for sample in quads.into_remainder() {
        *sample *= gain;
    }
}
236
/// SSE kernel: `dst[i] += src[i] * gain` over the common prefix.
///
/// # Safety
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 4;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let factor = _mm_set1_ps(gain);
    for (out, inp) in dst_vec.chunks_exact_mut(4).zip(src_vec.chunks_exact(4)) {
        let scaled = _mm_mul_ps(_mm_loadu_ps(inp.as_ptr()), factor);
        let sum = _mm_add_ps(_mm_loadu_ps(out.as_ptr()), scaled);
        _mm_storeu_ps(out.as_mut_ptr(), sum);
    }
    // Fewer than four samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        *out += *inp * gain;
    }
}
256
/// SSE kernel: `dst[i] = src[i] * gain` over the common prefix.
///
/// # Safety
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_scaled_inplace_sse(dst: &mut [f32], src: &[f32], gain: f32) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 4;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let factor = _mm_set1_ps(gain);
    for (out, inp) in dst_vec.chunks_exact_mut(4).zip(src_vec.chunks_exact(4)) {
        let scaled = _mm_mul_ps(_mm_loadu_ps(inp.as_ptr()), factor);
        _mm_storeu_ps(out.as_mut_ptr(), scaled);
    }
    // Fewer than four samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        *out = *inp * gain;
    }
}
275
/// SSE kernel: adds `src` into `dst`, counting NaN/infinite source samples as zero.
///
/// # Safety
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn add_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 4;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let sign_bit = _mm_set1_ps(-0.0);
    let largest = _mm_set1_ps(f32::MAX);
    for (out, inp) in dst_vec.chunks_exact_mut(4).zip(src_vec.chunks_exact(4)) {
        let input = _mm_loadu_ps(inp.as_ptr());
        // |x| <= f32::MAX is false for both infinities and (ordered compare) NaN.
        let magnitude = _mm_andnot_ps(sign_bit, input);
        let keep = _mm_cmple_ps(magnitude, largest);
        let cleaned = _mm_and_ps(input, keep);
        let sum = _mm_add_ps(_mm_loadu_ps(out.as_ptr()), cleaned);
        _mm_storeu_ps(out.as_mut_ptr(), sum);
    }
    // Fewer than four samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        let cleaned = if inp.is_finite() { *inp } else { 0.0 };
        *out += cleaned;
    }
}
299
/// SSE kernel: copies `src` into `dst`, writing zero for NaN/infinite samples.
///
/// # Safety
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn copy_sanitized_inplace_sse(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 4;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let sign_bit = _mm_set1_ps(-0.0);
    let largest = _mm_set1_ps(f32::MAX);
    for (out, inp) in dst_vec.chunks_exact_mut(4).zip(src_vec.chunks_exact(4)) {
        let input = _mm_loadu_ps(inp.as_ptr());
        // |x| <= f32::MAX is false for both infinities and (ordered compare) NaN.
        let magnitude = _mm_andnot_ps(sign_bit, input);
        let keep = _mm_cmple_ps(magnitude, largest);
        _mm_storeu_ps(out.as_mut_ptr(), _mm_and_ps(input, keep));
    }
    // Fewer than four samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        *out = if inp.is_finite() { *inp } else { 0.0 };
    }
}
321
/// SSE kernel: overwrites every NaN or infinite sample of `buf` with zero.
///
/// # Safety
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn sanitize_finite_inplace_sse(buf: &mut [f32]) {
    let sign_bit = _mm_set1_ps(-0.0);
    let largest = _mm_set1_ps(f32::MAX);
    let mut quads = buf.chunks_exact_mut(4);
    for quad in quads.by_ref() {
        let input = _mm_loadu_ps(quad.as_ptr());
        // |x| <= f32::MAX is false for both infinities and (ordered compare) NaN.
        let magnitude = _mm_andnot_ps(sign_bit, input);
        let keep = _mm_cmple_ps(magnitude, largest);
        _mm_storeu_ps(quad.as_mut_ptr(), _mm_and_ps(input, keep));
    }
    // Fewer than four samples remain; finish them one by one.
    for sample in quads.into_remainder() {
        if !sample.is_finite() {
            *sample = 0.0;
        }
    }
}
342
/// SSE kernel: largest absolute value in `buf` (0.0 for an empty slice).
///
/// NaN samples are ignored, matching the scalar implementation.
///
/// # Safety
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn peak_abs_sse(buf: &[f32]) -> f32 {
    let sign_mask = _mm_set1_ps(-0.0);
    let mut peak = _mm_setzero_ps();
    let mut quads = buf.chunks_exact(4);
    for quad in quads.by_ref() {
        let abs_v = _mm_andnot_ps(sign_mask, _mm_loadu_ps(quad.as_ptr()));
        // MAXPS yields its *second* operand whenever either input is NaN.
        // Keeping the running peak second means a NaN sample leaves the peak
        // untouched instead of replacing (and thereby losing) it.
        peak = _mm_max_ps(abs_v, peak);
    }
    let mut lanes = [0.0f32; 4];
    _mm_storeu_ps(lanes.as_mut_ptr(), peak);
    let mut max_scalar = 0.0f32;
    for lane in lanes.iter() {
        max_scalar = max_scalar.max(*lane);
    }
    // Fewer than four samples remain; fold them in one by one.
    for sample in quads.remainder() {
        max_scalar = max_scalar.max(sample.abs());
    }
    max_scalar
}
363
/// SSE kernel: limits every sample of `buf` to `min..=max`.
///
/// NaN samples stay NaN, as they do in the scalar tail (`f32::clamp`) and in
/// the AVX kernel.
///
/// # Safety
/// The running CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn clamp_inplace_sse(buf: &mut [f32], min: f32, max: f32) {
    let vmin = _mm_set1_ps(min);
    let vmax = _mm_set1_ps(max);
    let mut quads = buf.chunks_exact_mut(4);
    for quad in quads.by_ref() {
        let v = _mm_loadu_ps(quad.as_ptr());
        // MINPS/MAXPS return their second operand when an input is NaN, so the
        // sample must be (and stay) the second operand for NaN to propagate.
        let r = _mm_max_ps(vmin, _mm_min_ps(vmax, v));
        _mm_storeu_ps(quad.as_mut_ptr(), r);
    }
    // Fewer than four samples remain; finish them one by one.
    for sample in quads.into_remainder() {
        *sample = sample.clamp(min, max);
    }
}
380
/// AVX kernel: `dst[i] += src[i]` over the common prefix, eight lanes at a time.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 8;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    for (out, inp) in dst_vec.chunks_exact_mut(8).zip(src_vec.chunks_exact(8)) {
        let sum = _mm256_add_ps(_mm256_loadu_ps(out.as_ptr()), _mm256_loadu_ps(inp.as_ptr()));
        _mm256_storeu_ps(out.as_mut_ptr(), sum);
    }
    // Fewer than eight samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        *out += *inp;
    }
}
399
/// AVX kernel: scales every element of `dst` by `gain`, eight lanes at a time.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn mul_inplace_avx(dst: &mut [f32], gain: f32) {
    let factor = _mm256_set1_ps(gain);
    let mut octets = dst.chunks_exact_mut(8);
    for octet in octets.by_ref() {
        let scaled = _mm256_mul_ps(_mm256_loadu_ps(octet.as_ptr()), factor);
        _mm256_storeu_ps(octet.as_mut_ptr(), scaled);
    }
    // Fewer than eight samples remain; finish them one by one.
    for sample in octets.into_remainder() {
        *sample *= gain;
    }
}
415
/// AVX kernel: `dst[i] += src[i] * gain` over the common prefix (separate
/// multiply and add, no fusing).
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 8;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let factor = _mm256_set1_ps(gain);
    for (out, inp) in dst_vec.chunks_exact_mut(8).zip(src_vec.chunks_exact(8)) {
        let scaled = _mm256_mul_ps(_mm256_loadu_ps(inp.as_ptr()), factor);
        let sum = _mm256_add_ps(_mm256_loadu_ps(out.as_ptr()), scaled);
        _mm256_storeu_ps(out.as_mut_ptr(), sum);
    }
    // Fewer than eight samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        *out += *inp * gain;
    }
}
435
/// AVX+FMA kernel: `dst[i] += src[i] * gain` over the common prefix.
///
/// The vector body uses a fused multiply-add; the scalar tail uses a separate
/// multiply and add.
///
/// # Safety
/// The running CPU must support both AVX and FMA.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx,fma")]
unsafe fn add_scaled_inplace_avx_fma(dst: &mut [f32], src: &[f32], gain: f32) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 8;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let factor = _mm256_set1_ps(gain);
    for (out, inp) in dst_vec.chunks_exact_mut(8).zip(src_vec.chunks_exact(8)) {
        let acc = _mm256_loadu_ps(out.as_ptr());
        let fused = _mm256_fmadd_ps(_mm256_loadu_ps(inp.as_ptr()), factor, acc);
        _mm256_storeu_ps(out.as_mut_ptr(), fused);
    }
    // Fewer than eight samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        *out += *inp * gain;
    }
}
455
/// AVX kernel: `dst[i] = src[i] * gain` over the common prefix.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_scaled_inplace_avx(dst: &mut [f32], src: &[f32], gain: f32) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 8;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let factor = _mm256_set1_ps(gain);
    for (out, inp) in dst_vec.chunks_exact_mut(8).zip(src_vec.chunks_exact(8)) {
        let scaled = _mm256_mul_ps(_mm256_loadu_ps(inp.as_ptr()), factor);
        _mm256_storeu_ps(out.as_mut_ptr(), scaled);
    }
    // Fewer than eight samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        *out = *inp * gain;
    }
}
474
/// AVX kernel: adds `src` into `dst`, counting NaN/infinite source samples as zero.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn add_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 8;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let zeros = _mm256_setzero_ps();
    let sign_bit = _mm256_set1_ps(-0.0);
    let largest = _mm256_set1_ps(f32::MAX);
    for (out, inp) in dst_vec.chunks_exact_mut(8).zip(src_vec.chunks_exact(8)) {
        let input = _mm256_loadu_ps(inp.as_ptr());
        // |x| <= f32::MAX is false for both infinities and (ordered compare) NaN.
        let magnitude = _mm256_andnot_ps(sign_bit, input);
        let finite = _mm256_cmp_ps(magnitude, largest, _CMP_LE_OQ);
        let cleaned = _mm256_blendv_ps(zeros, input, finite);
        let sum = _mm256_add_ps(_mm256_loadu_ps(out.as_ptr()), cleaned);
        _mm256_storeu_ps(out.as_mut_ptr(), sum);
    }
    // Fewer than eight samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        let cleaned = if inp.is_finite() { *inp } else { 0.0 };
        *out += cleaned;
    }
}
498
/// AVX kernel: copies `src` into `dst`, writing zero for NaN/infinite samples.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn copy_sanitized_inplace_avx(dst: &mut [f32], src: &[f32]) {
    let n = dst.len().min(src.len());
    let vec_len = n - n % 8;
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let zeros = _mm256_setzero_ps();
    let sign_bit = _mm256_set1_ps(-0.0);
    let largest = _mm256_set1_ps(f32::MAX);
    for (out, inp) in dst_vec.chunks_exact_mut(8).zip(src_vec.chunks_exact(8)) {
        let input = _mm256_loadu_ps(inp.as_ptr());
        // |x| <= f32::MAX is false for both infinities and (ordered compare) NaN.
        let magnitude = _mm256_andnot_ps(sign_bit, input);
        let finite = _mm256_cmp_ps(magnitude, largest, _CMP_LE_OQ);
        _mm256_storeu_ps(out.as_mut_ptr(), _mm256_blendv_ps(zeros, input, finite));
    }
    // Fewer than eight samples remain; finish them one by one.
    for (out, inp) in dst_tail.iter_mut().zip(src_tail) {
        *out = if inp.is_finite() { *inp } else { 0.0 };
    }
}
520
/// AVX kernel: overwrites every NaN or infinite sample of `buf` with zero.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn sanitize_finite_inplace_avx(buf: &mut [f32]) {
    let zeros = _mm256_setzero_ps();
    let sign_bit = _mm256_set1_ps(-0.0);
    let largest = _mm256_set1_ps(f32::MAX);
    let mut octets = buf.chunks_exact_mut(8);
    for octet in octets.by_ref() {
        let input = _mm256_loadu_ps(octet.as_ptr());
        // |x| <= f32::MAX is false for both infinities and (ordered compare) NaN.
        let magnitude = _mm256_andnot_ps(sign_bit, input);
        let finite = _mm256_cmp_ps(magnitude, largest, _CMP_LE_OQ);
        _mm256_storeu_ps(octet.as_mut_ptr(), _mm256_blendv_ps(zeros, input, finite));
    }
    // Fewer than eight samples remain; finish them one by one.
    for sample in octets.into_remainder() {
        if !sample.is_finite() {
            *sample = 0.0;
        }
    }
}
541
/// AVX kernel: largest absolute value in `buf` (0.0 for an empty slice).
///
/// NaN samples are ignored, matching the scalar implementation.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn peak_abs_avx(buf: &[f32]) -> f32 {
    let sign_mask = _mm256_set1_ps(-0.0);
    let mut peak = _mm256_setzero_ps();
    let mut octets = buf.chunks_exact(8);
    for octet in octets.by_ref() {
        let abs_v = _mm256_andnot_ps(sign_mask, _mm256_loadu_ps(octet.as_ptr()));
        // VMAXPS yields its *second* operand whenever either input is NaN.
        // Keeping the running peak second means a NaN sample leaves the peak
        // untouched instead of replacing (and thereby losing) it.
        peak = _mm256_max_ps(abs_v, peak);
    }
    let mut lanes = [0.0f32; 8];
    _mm256_storeu_ps(lanes.as_mut_ptr(), peak);
    let mut max_scalar = 0.0f32;
    for lane in lanes.iter() {
        max_scalar = max_scalar.max(*lane);
    }
    // Fewer than eight samples remain; fold them in one by one.
    for sample in octets.remainder() {
        max_scalar = max_scalar.max(sample.abs());
    }
    max_scalar
}
561
/// AVX kernel: limits every sample of `buf` to `min..=max`.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn clamp_inplace_avx(buf: &mut [f32], min: f32, max: f32) {
    let floor = _mm256_set1_ps(min);
    let ceiling = _mm256_set1_ps(max);
    let mut octets = buf.chunks_exact_mut(8);
    for octet in octets.by_ref() {
        let capped = _mm256_min_ps(ceiling, _mm256_loadu_ps(octet.as_ptr()));
        _mm256_storeu_ps(octet.as_mut_ptr(), _mm256_max_ps(floor, capped));
    }
    // Fewer than eight samples remain; finish them one by one.
    for sample in octets.into_remainder() {
        *sample = sample.clamp(min, max);
    }
}
578
579pub fn convert_i32_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
580 let n = src.len().min(dst.len());
581 if n == 0 {
582 return;
583 }
584 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
585 unsafe {
586 if is_x86_feature_detected!("avx") {
587 convert_i32_to_f32_avx(&src[..n], &mut dst[..n], gain);
588 return;
589 }
590 if is_x86_feature_detected!("sse") {
591 convert_i32_to_f32_sse(&src[..n], &mut dst[..n], gain);
592 return;
593 }
594 }
595 convert_i32_to_f32_scalar(&src[..n], &mut dst[..n], gain);
596}
597
/// Portable fallback: `dst[i] = src[i] as f32 * gain` over the common prefix.
fn convert_i32_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = sample as f32 * gain);
}
603
/// 128-bit kernel: `dst[i] = src[i] as f32 * gain`, four lanes at a time.
///
/// The integer load and int->float conversion are SSE2 instructions, so the
/// function is compiled with SSE2 enabled (which also lets the intrinsics
/// inline).
///
/// # Safety
/// The running CPU must support SSE2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_i32_to_f32_sse(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 4;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm_set1_ps(gain);
    for (s, d) in src_vec.chunks_exact(4).zip(dst_vec.chunks_exact_mut(4)) {
        let ints = _mm_loadu_si128(s.as_ptr() as *const __m128i);
        _mm_storeu_ps(d.as_mut_ptr(), _mm_mul_ps(_mm_cvtepi32_ps(ints), g));
    }
    // Fewer than four samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = *s as f32 * gain;
    }
}
619
/// AVX kernel: `dst[i] = src[i] as f32 * gain`, eight lanes at a time.
///
/// # Safety
/// The running CPU must support AVX, and `dst` must be at least as long as `src`.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_i32_to_f32_avx(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 8;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let factor = _mm256_set1_ps(gain);
    for (s, d) in src_vec.chunks_exact(8).zip(dst_vec.chunks_exact_mut(8)) {
        let ints = _mm256_loadu_si256(s.as_ptr() as *const __m256i);
        _mm256_storeu_ps(d.as_mut_ptr(), _mm256_mul_ps(_mm256_cvtepi32_ps(ints), factor));
    }
    // Fewer than eight samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = *s as f32 * gain;
    }
}
636
637pub fn convert_i16_to_f32(src: &[i16], dst: &mut [f32], gain: f32) {
638 let n = src.len().min(dst.len());
639 if n == 0 {
640 return;
641 }
642 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
643 unsafe {
644 if is_x86_feature_detected!("avx2") {
645 convert_i16_to_f32_avx2(&src[..n], &mut dst[..n], gain);
646 return;
647 }
648 if is_x86_feature_detected!("sse4.1") {
649 convert_i16_to_f32_sse41(&src[..n], &mut dst[..n], gain);
650 return;
651 }
652 }
653 convert_i16_to_f32_scalar(&src[..n], &mut dst[..n], gain);
654}
655
/// Portable fallback: `dst[i] = src[i] as f32 * gain` over the common prefix.
fn convert_i16_to_f32_scalar(src: &[i16], dst: &mut [f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = sample as f32 * gain);
}
661
/// SSE4.1 kernel: `dst[i] = src[i] as f32 * gain`, eight samples per step.
///
/// Compiled with SSE4.1 enabled so the sign-extension intrinsic
/// (`_mm_cvtepi16_epi32`) can be inlined into the loop.
///
/// # Safety
/// The running CPU must support SSE4.1.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i16_to_f32_sse41(src: &[i16], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 8;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm_set1_ps(gain);
    for (s, d) in src_vec.chunks_exact(8).zip(dst_vec.chunks_exact_mut(8)) {
        let raw = _mm_loadu_si128(s.as_ptr() as *const __m128i);
        // Low four samples, then the high four shifted down into the low half.
        let lo = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(raw));
        let hi = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(raw, 8)));
        _mm_storeu_ps(d.as_mut_ptr(), _mm_mul_ps(lo, g));
        _mm_storeu_ps(d.as_mut_ptr().add(4), _mm_mul_ps(hi, g));
    }
    // Fewer than eight samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = *s as f32 * gain;
    }
}
680
/// AVX2 kernel: `dst[i] = src[i] as f32 * gain`, eight samples per step.
///
/// Each step loads exactly eight `i16` samples (128 bits), sign-extends them
/// to eight `i32` lanes and converts those to `f32`. Anything left over is
/// handled by the scalar loop, which produces the same values.
///
/// # Safety
/// The running CPU must support AVX2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i16_to_f32_avx2(src: &[i16], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 8;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm256_set1_ps(gain);
    for (s, d) in src_vec.chunks_exact(8).zip(dst_vec.chunks_exact_mut(8)) {
        // One 128-bit load holds all eight samples for this step; a single
        // widening conversion therefore covers the whole group.
        let raw = _mm_loadu_si128(s.as_ptr() as *const __m128i);
        let widened = _mm256_cvtepi16_epi32(raw);
        _mm256_storeu_ps(d.as_mut_ptr(), _mm256_mul_ps(_mm256_cvtepi32_ps(widened), g));
    }
    // Fewer than eight samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = *s as f32 * gain;
    }
}
710
711pub fn convert_i8_to_f32(src: &[i8], dst: &mut [f32], gain: f32) {
712 let n = src.len().min(dst.len());
713 if n == 0 {
714 return;
715 }
716 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
717 unsafe {
718 if is_x86_feature_detected!("avx2") {
719 convert_i8_to_f32_avx2(&src[..n], &mut dst[..n], gain);
720 return;
721 }
722 if is_x86_feature_detected!("sse4.1") {
723 convert_i8_to_f32_sse41(&src[..n], &mut dst[..n], gain);
724 return;
725 }
726 }
727 convert_i8_to_f32_scalar(&src[..n], &mut dst[..n], gain);
728}
729
/// Portable fallback: `dst[i] = src[i] as f32 * gain` over the common prefix.
fn convert_i8_to_f32_scalar(src: &[i8], dst: &mut [f32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = sample as f32 * gain);
}
735
/// SSE4.1 kernel: `dst[i] = src[i] as f32 * gain`, four samples per step.
///
/// # Safety
/// The running CPU must support SSE4.1.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i8_to_f32_sse41(src: &[i8], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 4;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm_set1_ps(gain);
    for (s, d) in src_vec.chunks_exact(4).zip(dst_vec.chunks_exact_mut(4)) {
        // `i8` data is only byte-aligned, so the four-byte group has to be
        // fetched with an unaligned read; a plain `*const i32` dereference
        // would be undefined behaviour.
        let packed = (s.as_ptr() as *const i32).read_unaligned();
        let widened = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(packed));
        _mm_storeu_ps(d.as_mut_ptr(), _mm_mul_ps(_mm_cvtepi32_ps(widened), g));
    }
    // Fewer than four samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = *s as f32 * gain;
    }
}
751
/// AVX2 kernel: `dst[i] = src[i] as f32 * gain`, eight samples per step.
///
/// Anything left over after the eight-wide steps is handled by the scalar
/// loop, which produces the same values.
///
/// # Safety
/// The running CPU must support AVX2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i8_to_f32_avx2(src: &[i8], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 8;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm256_set1_ps(gain);
    for (s, d) in src_vec.chunks_exact(8).zip(dst_vec.chunks_exact_mut(8)) {
        // `_mm_loadl_epi64` reads eight bytes with no alignment requirement
        // and, unlike `_mm_cvtsi64_si128`, is also available on 32-bit x86.
        let bytes = _mm_loadl_epi64(s.as_ptr() as *const __m128i);
        let widened = _mm256_cvtepi8_epi32(bytes);
        _mm256_storeu_ps(d.as_mut_ptr(), _mm256_mul_ps(_mm256_cvtepi32_ps(widened), g));
    }
    // Fewer than eight samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = *s as f32 * gain;
    }
}
775
776pub fn convert_i24_to_f32(src: &[i32], dst: &mut [f32], gain: f32) {
777 let n = src.len().min(dst.len());
778 if n == 0 {
779 return;
780 }
781 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
782 unsafe {
783 if is_x86_feature_detected!("avx2") {
784 convert_i24_to_f32_avx2(&src[..n], &mut dst[..n], gain);
785 return;
786 }
787 if is_x86_feature_detected!("sse4.1") {
788 convert_i24_to_f32_sse41(&src[..n], &mut dst[..n], gain);
789 return;
790 }
791 }
792 convert_i24_to_f32_scalar(&src[..n], &mut dst[..n], gain);
793}
794
/// Portable fallback: sign-extends the low 24 bits of each source word and
/// writes `value as f32 * gain`. The top eight bits of the input are ignored.
fn convert_i24_to_f32_scalar(src: &[i32], dst: &mut [f32], gain: f32) {
    for (out, &word) in dst.iter_mut().zip(src) {
        // Shifting the 24-bit payload to the top and arithmetic-shifting it
        // back discards the upper byte and replicates bit 23 as the sign.
        let sample = (word << 8) >> 8;
        *out = sample as f32 * gain;
    }
}
804
/// 128-bit kernel: sign-extends the low 24 bits of each source word and
/// writes `value as f32 * gain`, four lanes at a time.
///
/// Compiled with SSE4.1 enabled (the feature its dispatcher checks) so the
/// intrinsics can be inlined into the loop.
///
/// # Safety
/// The running CPU must support SSE4.1.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse4.1")]
unsafe fn convert_i24_to_f32_sse41(src: &[i32], dst: &mut [f32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 4;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm_set1_ps(gain);
    for (s, d) in src_vec.chunks_exact(4).zip(dst_vec.chunks_exact_mut(4)) {
        let words = _mm_loadu_si128(s.as_ptr() as *const __m128i);
        // Shift the payload to the top, then arithmetic-shift back so bit 23
        // becomes the sign of the 32-bit lane.
        let extended = _mm_srai_epi32(_mm_slli_epi32(words, 8), 8);
        _mm_storeu_ps(d.as_mut_ptr(), _mm_mul_ps(_mm_cvtepi32_ps(extended), g));
    }
    // Fewer than four samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        let sample = (*s << 8) >> 8;
        *d = sample as f32 * gain;
    }
}
825
/// AVX2 kernel: sign-extends the low 24 bits of each source word and writes
/// `value as f32 * gain`.
///
/// Runs eight-wide steps, then at most one four-wide step, then a scalar loop
/// for the last few samples.
///
/// # Safety
/// The running CPU must support AVX2, and `dst` must be at least as long as `src`.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx2")]
unsafe fn convert_i24_to_f32_avx2(src: &[i32], dst: &mut [f32], gain: f32) {
    let total = src.len();
    let factor = _mm256_set1_ps(gain);
    let mut pos = 0usize;
    for _ in 0..total / 8 {
        let words = _mm256_loadu_si256(src.as_ptr().add(pos) as *const __m256i);
        // Shift the payload to the top, then arithmetic-shift back so bit 23
        // becomes the sign of the 32-bit lane.
        let extended = _mm256_srai_epi32(_mm256_slli_epi32(words, 8), 8);
        let scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(extended), factor);
        _mm256_storeu_ps(dst.as_mut_ptr().add(pos), scaled);
        pos += 8;
    }
    if total - pos >= 4 {
        let words = _mm_loadu_si128(src.as_ptr().add(pos) as *const __m128i);
        let extended = _mm_srai_epi32(_mm_slli_epi32(words, 8), 8);
        let scaled = _mm_mul_ps(_mm_cvtepi32_ps(extended), _mm_set1_ps(gain));
        _mm_storeu_ps(dst.as_mut_ptr().add(pos), scaled);
        pos += 4;
    }
    for (word, out) in src[pos..].iter().zip(dst[pos..].iter_mut()) {
        let sample = (*word << 8) >> 8;
        *out = sample as f32 * gain;
    }
}
855
856pub fn convert_f32_to_i24(src: &[f32], dst: &mut [i32], gain: f32) {
857 let n = src.len().min(dst.len());
858 if n == 0 {
859 return;
860 }
861 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
862 unsafe {
863 if is_x86_feature_detected!("avx") {
864 convert_f32_to_i24_avx(&src[..n], &mut dst[..n], gain);
865 return;
866 }
867 if is_x86_feature_detected!("sse2") {
868 convert_f32_to_i24_sse2(&src[..n], &mut dst[..n], gain);
869 return;
870 }
871 }
872 convert_f32_to_i24_scalar(&src[..n], &mut dst[..n], gain);
873}
874
/// Portable fallback: `dst[i] = ((src[i] * gain) as i32) & 0x00FF_FFFF`.
fn convert_f32_to_i24_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    for (out, &sample) in dst.iter_mut().zip(src) {
        let whole = (sample * gain) as i32;
        *out = whole & 0x00FF_FFFF;
    }
}
880
/// SSE2 kernel: `dst[i] = ((src[i] * gain) as i32) & 0x00FF_FFFF`, four lanes
/// at a time, with the same saturating/NaN-to-zero conversion as the scalar
/// `as i32` cast.
///
/// # Safety
/// The running CPU must support SSE2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i24_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 4;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm_set1_ps(gain);
    let mask = _mm_set1_epi32(0x00FF_FFFF);
    let two_pow_31 = _mm_set1_ps(2_147_483_648.0);
    for (s, d) in src_vec.chunks_exact(4).zip(dst_vec.chunks_exact_mut(4)) {
        let x = _mm_mul_ps(_mm_loadu_ps(s.as_ptr()), g);
        // CVTTPS2DQ returns 0x8000_0000 for NaN and for any out-of-range
        // input. Flipping all bits where x >= 2^31 turns that into i32::MAX,
        // and clearing NaN lanes yields 0, matching Rust's `as i32`.
        let raw = _mm_cvttps_epi32(x);
        let too_big = _mm_castps_si128(_mm_cmpge_ps(x, two_pow_31));
        let not_nan = _mm_castps_si128(_mm_cmpord_ps(x, x));
        let whole = _mm_and_si128(_mm_xor_si128(raw, too_big), not_nan);
        _mm_storeu_si128(d.as_mut_ptr() as *mut __m128i, _mm_and_si128(whole, mask));
    }
    // Fewer than four samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}
897
/// AVX kernel: `dst[i] = ((src[i] * gain) as i32) & 0x00FF_FFFF`, eight lanes
/// at a time, with the same saturating/NaN-to-zero conversion as the scalar
/// `as i32` cast. Leftover samples go through the scalar loop.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i24_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 8;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm256_set1_ps(gain);
    // AVX1 has no 256-bit integer logic, so all bit operations are done with
    // the (purely bitwise) float-domain instructions on reinterpreted lanes.
    let mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x00FF_FFFF));
    let two_pow_31 = _mm256_set1_ps(2_147_483_648.0);
    for (s, d) in src_vec.chunks_exact(8).zip(dst_vec.chunks_exact_mut(8)) {
        let x = _mm256_mul_ps(_mm256_loadu_ps(s.as_ptr()), g);
        // VCVTTPS2DQ returns 0x8000_0000 for NaN and for any out-of-range
        // input. Flipping all bits where x >= 2^31 turns that into i32::MAX,
        // and clearing NaN lanes yields 0, matching Rust's `as i32`.
        let raw = _mm256_castsi256_ps(_mm256_cvttps_epi32(x));
        let too_big = _mm256_cmp_ps(x, two_pow_31, _CMP_GE_OQ);
        let not_nan = _mm256_cmp_ps(x, x, _CMP_ORD_Q);
        let whole = _mm256_and_ps(_mm256_xor_ps(raw, too_big), not_nan);
        let low24 = _mm256_castps_si256(_mm256_and_ps(whole, mask));
        _mm256_storeu_si256(d.as_mut_ptr() as *mut __m256i, low24);
    }
    // Fewer than eight samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = (*s * gain) as i32 & 0x00FF_FFFF;
    }
}
924
925pub fn convert_f32_to_i32(src: &[f32], dst: &mut [i32], gain: f32) {
926 let n = src.len().min(dst.len());
927 if n == 0 {
928 return;
929 }
930 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
931 unsafe {
932 if is_x86_feature_detected!("avx") {
933 convert_f32_to_i32_avx(&src[..n], &mut dst[..n], gain);
934 return;
935 }
936 if is_x86_feature_detected!("sse2") {
937 convert_f32_to_i32_sse2(&src[..n], &mut dst[..n], gain);
938 return;
939 }
940 }
941 convert_f32_to_i32_scalar(&src[..n], &mut dst[..n], gain);
942}
943
/// Portable fallback: `dst[i] = (src[i] * gain) as i32` over the common prefix.
fn convert_f32_to_i32_scalar(src: &[f32], dst: &mut [i32], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = (sample * gain) as i32);
}
949
/// SSE2 kernel: `dst[i] = (src[i] * gain) as i32`, four lanes at a time, with
/// the same saturating/NaN-to-zero behaviour as the scalar `as i32` cast.
///
/// # Safety
/// The running CPU must support SSE2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i32_sse2(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 4;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm_set1_ps(gain);
    let two_pow_31 = _mm_set1_ps(2_147_483_648.0);
    for (s, d) in src_vec.chunks_exact(4).zip(dst_vec.chunks_exact_mut(4)) {
        let x = _mm_mul_ps(_mm_loadu_ps(s.as_ptr()), g);
        // CVTTPS2DQ returns 0x8000_0000 (i32::MIN) for NaN and for any
        // out-of-range input. Flipping all bits where x >= 2^31 turns that
        // into i32::MAX, and clearing NaN lanes yields 0. Negative overflow
        // is already i32::MIN.
        let raw = _mm_cvttps_epi32(x);
        let too_big = _mm_castps_si128(_mm_cmpge_ps(x, two_pow_31));
        let not_nan = _mm_castps_si128(_mm_cmpord_ps(x, x));
        let fixed = _mm_and_si128(_mm_xor_si128(raw, too_big), not_nan);
        _mm_storeu_si128(d.as_mut_ptr() as *mut __m128i, fixed);
    }
    // Fewer than four samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = (*s * gain) as i32;
    }
}
964
/// AVX kernel: `dst[i] = (src[i] * gain) as i32`, eight lanes at a time, with
/// the same saturating/NaN-to-zero behaviour as the scalar `as i32` cast.
/// Leftover samples go through the scalar loop.
///
/// # Safety
/// The running CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i32_avx(src: &[f32], dst: &mut [i32], gain: f32) {
    let n = src.len().min(dst.len());
    let vec_len = n - n % 8;
    let (src_vec, src_tail) = src[..n].split_at(vec_len);
    let (dst_vec, dst_tail) = dst[..n].split_at_mut(vec_len);
    let g = _mm256_set1_ps(gain);
    let two_pow_31 = _mm256_set1_ps(2_147_483_648.0);
    for (s, d) in src_vec.chunks_exact(8).zip(dst_vec.chunks_exact_mut(8)) {
        let x = _mm256_mul_ps(_mm256_loadu_ps(s.as_ptr()), g);
        // VCVTTPS2DQ returns 0x8000_0000 (i32::MIN) for NaN and for any
        // out-of-range input. Flipping all bits where x >= 2^31 turns that
        // into i32::MAX, and clearing NaN lanes yields 0. AVX1 lacks 256-bit
        // integer logic, so the bitwise float-domain instructions are used.
        let raw = _mm256_castsi256_ps(_mm256_cvttps_epi32(x));
        let too_big = _mm256_cmp_ps(x, two_pow_31, _CMP_GE_OQ);
        let not_nan = _mm256_cmp_ps(x, x, _CMP_ORD_Q);
        let fixed = _mm256_and_ps(_mm256_xor_ps(raw, too_big), not_nan);
        _mm256_storeu_si256(d.as_mut_ptr() as *mut __m256i, _mm256_castps_si256(fixed));
    }
    // Fewer than eight samples remain; finish them one by one.
    for (s, d) in src_tail.iter().zip(dst_tail) {
        *d = (*s * gain) as i32;
    }
}
986
987pub fn convert_f32_to_i16(src: &[f32], dst: &mut [i16], gain: f32) {
988 let n = src.len().min(dst.len());
989 if n == 0 {
990 return;
991 }
992 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
993 unsafe {
994 if is_x86_feature_detected!("avx") {
995 convert_f32_to_i16_avx(&src[..n], &mut dst[..n], gain);
996 return;
997 }
998 if is_x86_feature_detected!("sse2") {
999 convert_f32_to_i16_sse2(&src[..n], &mut dst[..n], gain);
1000 return;
1001 }
1002 }
1003 convert_f32_to_i16_scalar(&src[..n], &mut dst[..n], gain);
1004}
1005
/// Portable fallback: `dst[i] = (src[i] * gain) as i16` over the common prefix.
fn convert_f32_to_i16_scalar(src: &[f32], dst: &mut [i16], gain: f32) {
    dst.iter_mut()
        .zip(src)
        .for_each(|(out, &sample)| *out = (sample * gain) as i16);
}
1011
/// Converts `src[k] * gain` to `i16`, eight lanes at a time, using SSE2.
///
/// A single four-lane step and a scalar loop handle what is left after the
/// eight-lane loop. Every lane follows the rules of the scalar `as i16`
/// cast: the fraction is truncated toward zero, values outside the `i16`
/// range saturate, and NaN becomes 0. Only the first
/// `min(src.len(), dst.len())` elements are read and written.
///
/// # Safety
///
/// The CPU must support SSE2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i16_sse2(src: &[f32], dst: &mut [i16], gain: f32) {
    let n = src.len().min(dst.len());
    let g = _mm_set1_ps(gain);
    let lo = _mm_set1_ps(-32768.0);
    let hi = _mm_set1_ps(32767.0);
    let mut i = 0usize;
    while i + 8 <= n {
        let x0 = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i)), g);
        let x1 = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i + 4)), g);
        // Zero NaN lanes first: `cvttps` would otherwise turn them (and any
        // value beyond the i32 range) into i32::MIN, which packs to i16::MIN.
        let x0 = _mm_and_ps(x0, _mm_cmpord_ps(x0, x0));
        let x1 = _mm_and_ps(x1, _mm_cmpord_ps(x1, x1));
        // Clamp in the float domain so the sign of an overflow is preserved.
        let v0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(x0, hi), lo));
        let v1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(x1, hi), lo));
        let packed = _mm_packs_epi32(v0, v1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= n {
        let x = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i)), g);
        let x = _mm_and_ps(x, _mm_cmpord_ps(x, x));
        let v = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(x, hi), lo));
        let packed = _mm_packs_epi32(v, _mm_setzero_si128());
        // Only the low 64 bits (four i16 values) are meaningful here.
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..n].iter().zip(dst[i..n].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}
1036
/// Converts `src[k] * gain` to `i16`, eight lanes at a time, using AVX.
///
/// A single four-lane step and a scalar loop handle what is left after the
/// eight-lane loop. Every lane follows the rules of the scalar `as i16`
/// cast: the fraction is truncated toward zero, values outside the `i16`
/// range saturate, and NaN becomes 0. Only the first
/// `min(src.len(), dst.len())` elements are read and written.
///
/// # Safety
///
/// The CPU must support AVX. No AVX2 instruction is used.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i16_avx(src: &[f32], dst: &mut [i16], gain: f32) {
    let n = src.len().min(dst.len());
    let g = _mm256_set1_ps(gain);
    let lo = _mm256_set1_ps(-32768.0);
    let hi = _mm256_set1_ps(32767.0);
    let mut i = 0usize;
    while i + 8 <= n {
        let x = _mm256_mul_ps(_mm256_loadu_ps(src.as_ptr().add(i)), g);
        // Zero NaN lanes, then clamp in the float domain: `cvttps` would
        // otherwise map NaN and huge positive values to i32::MIN.
        let x = _mm256_and_ps(x, _mm256_cmp_ps::<_CMP_ORD_Q>(x, x));
        let v = _mm256_cvttps_epi32(_mm256_max_ps(_mm256_min_ps(x, hi), lo));
        let vlo = _mm256_castsi256_si128(v);
        // `extractf128` is the AVX encoding; `_mm256_extracti128_si256`
        // needs AVX2, which this function does not enable.
        let vhi = _mm256_extractf128_si256::<1>(v);
        let packed = _mm_packs_epi32(vlo, vhi);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 8;
    }
    if i + 4 <= n {
        let x = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i)), _mm_set1_ps(gain));
        let x = _mm_and_ps(x, _mm_cmpord_ps(x, x));
        let x = _mm_max_ps(
            _mm_min_ps(x, _mm256_castps256_ps128(hi)),
            _mm256_castps256_ps128(lo),
        );
        let packed = _mm_packs_epi32(_mm_cvttps_epi32(x), _mm_setzero_si128());
        // Only the low 64 bits (four i16 values) are meaningful here.
        _mm_storel_epi64(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 4;
    }
    for (s, d) in src[i..n].iter().zip(dst[i..n].iter_mut()) {
        *d = (*s * gain) as i16;
    }
}
1062
1063pub fn convert_f32_to_i8(src: &[f32], dst: &mut [i8], gain: f32) {
1064 let n = src.len().min(dst.len());
1065 if n == 0 {
1066 return;
1067 }
1068 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1069 unsafe {
1070 if is_x86_feature_detected!("avx") {
1071 convert_f32_to_i8_avx(&src[..n], &mut dst[..n], gain);
1072 return;
1073 }
1074 if is_x86_feature_detected!("sse2") {
1075 convert_f32_to_i8_sse2(&src[..n], &mut dst[..n], gain);
1076 return;
1077 }
1078 }
1079 convert_f32_to_i8_scalar(&src[..n], &mut dst[..n], gain);
1080}
1081
/// Portable fallback: writes `(src[k] * gain) as i8` into `dst[k]` for every
/// index present in both slices.
fn convert_f32_to_i8_scalar(src: &[f32], dst: &mut [i8], gain: f32) {
    let count = src.len().min(dst.len());
    for (out, &sample) in dst[..count].iter_mut().zip(&src[..count]) {
        let scaled = sample * gain;
        *out = scaled as i8;
    }
}
1087
/// Converts `src[k] * gain` to `i8`, sixteen lanes at a time, using SSE2.
///
/// A scalar loop handles what is left after the sixteen-lane loop. Every
/// lane follows the rules of the scalar `as i8` cast: the fraction is
/// truncated toward zero, values outside the `i8` range saturate, and NaN
/// becomes 0. Only the first `min(src.len(), dst.len())` elements are read
/// and written.
///
/// # Safety
///
/// The CPU must support SSE2.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse2")]
unsafe fn convert_f32_to_i8_sse2(src: &[f32], dst: &mut [i8], gain: f32) {
    let n = src.len().min(dst.len());
    let g = _mm_set1_ps(gain);
    let lo = _mm_set1_ps(-128.0);
    let hi = _mm_set1_ps(127.0);
    let mut i = 0usize;
    while i + 16 <= n {
        let mut quads = [_mm_setzero_si128(); 4];
        for (k, quad) in quads.iter_mut().enumerate() {
            let x = _mm_mul_ps(_mm_loadu_ps(src.as_ptr().add(i + 4 * k)), g);
            // Zero NaN lanes, then clamp in the float domain: `cvttps` would
            // otherwise map NaN and huge positive values to i32::MIN, which
            // the saturating packs below turn into i8::MIN.
            let x = _mm_and_ps(x, _mm_cmpord_ps(x, x));
            *quad = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(x, hi), lo));
        }
        let p0 = _mm_packs_epi32(quads[0], quads[1]);
        let p1 = _mm_packs_epi32(quads[2], quads[3]);
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..n].iter().zip(dst[i..n].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}
1111
/// Converts `src[k] * gain` to `i8`, sixteen lanes at a time, using AVX.
///
/// A scalar loop handles what is left after the sixteen-lane loop. Every
/// lane follows the rules of the scalar `as i8` cast: the fraction is
/// truncated toward zero, values outside the `i8` range saturate, and NaN
/// becomes 0. Only the first `min(src.len(), dst.len())` elements are read
/// and written.
///
/// # Safety
///
/// The CPU must support AVX. No AVX2 instruction is used.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn convert_f32_to_i8_avx(src: &[f32], dst: &mut [i8], gain: f32) {
    let n = src.len().min(dst.len());
    let g = _mm256_set1_ps(gain);
    let lo = _mm256_set1_ps(-128.0);
    let hi = _mm256_set1_ps(127.0);
    let mut i = 0usize;
    while i + 16 <= n {
        let x0 = _mm256_mul_ps(_mm256_loadu_ps(src.as_ptr().add(i)), g);
        let x1 = _mm256_mul_ps(_mm256_loadu_ps(src.as_ptr().add(i + 8)), g);
        // Zero NaN lanes, then clamp in the float domain: `cvttps` would
        // otherwise map NaN and huge positive values to i32::MIN, which the
        // saturating packs below turn into i8::MIN.
        let x0 = _mm256_and_ps(x0, _mm256_cmp_ps::<_CMP_ORD_Q>(x0, x0));
        let x1 = _mm256_and_ps(x1, _mm256_cmp_ps::<_CMP_ORD_Q>(x1, x1));
        let v0 = _mm256_cvttps_epi32(_mm256_max_ps(_mm256_min_ps(x0, hi), lo));
        let v1 = _mm256_cvttps_epi32(_mm256_max_ps(_mm256_min_ps(x1, hi), lo));
        // `extractf128` is the AVX encoding; `_mm256_extracti128_si256`
        // needs AVX2, which this function does not enable.
        let p0 = _mm_packs_epi32(
            _mm256_castsi256_si128(v0),
            _mm256_extractf128_si256::<1>(v0),
        );
        let p1 = _mm_packs_epi32(
            _mm256_castsi256_si128(v1),
            _mm256_extractf128_si256::<1>(v1),
        );
        let packed = _mm_packs_epi16(p0, p1);
        _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut __m128i, packed);
        i += 16;
    }
    for (s, d) in src[i..n].iter().zip(dst[i..n].iter_mut()) {
        *d = (*s * gain) as i8;
    }
}
1132
1133pub fn apply_fade_in_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
1134 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1135 unsafe {
1136 if is_x86_feature_detected!("avx") {
1137 apply_fade_in_inplace_avx(buf, start_t, dt);
1138 return;
1139 }
1140 if is_x86_feature_detected!("sse") {
1141 apply_fade_in_inplace_sse(buf, start_t, dt);
1142 return;
1143 }
1144 }
1145 for (i, v) in buf.iter_mut().enumerate() {
1146 let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
1147 *v *= (t * std::f32::consts::FRAC_PI_2).sin();
1148 }
1149}
1150
1151pub fn apply_fade_out_inplace(buf: &mut [f32], start_t: f32, dt: f32) {
1152 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
1153 unsafe {
1154 if is_x86_feature_detected!("avx") {
1155 apply_fade_out_inplace_avx(buf, start_t, dt);
1156 return;
1157 }
1158 if is_x86_feature_detected!("sse") {
1159 apply_fade_out_inplace_sse(buf, start_t, dt);
1160 return;
1161 }
1162 }
1163 for (i, v) in buf.iter_mut().enumerate() {
1164 let t = (start_t + i as f32 * dt).clamp(0.0, 1.0);
1165 *v *= (t * std::f32::consts::FRAC_PI_2).cos();
1166 }
1167}
1168
/// AVX kernel for the fade-in: scales `buf[k]` by `sin(t * PI/2)` with
/// `t = clamp(start_t + k * dt, 0, 1)`.
///
/// The eight gains of each block are evaluated one by one in scalar code;
/// only the multiplication with the samples is vectorised. Samples beyond
/// the last full block of eight are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_in_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    const LANES: usize = 8;
    let mut blocks = buf.chunks_exact_mut(LANES);
    let mut base = 0usize;
    for block in &mut blocks {
        let mut curve = [0.0f32; LANES];
        for (offset, slot) in curve.iter_mut().enumerate() {
            let position = (start_t + (base + offset) as f32 * dt).clamp(0.0, 1.0);
            *slot = (position * std::f32::consts::FRAC_PI_2).sin();
        }
        let samples = _mm256_loadu_ps(block.as_ptr());
        let weights = _mm256_loadu_ps(curve.as_ptr());
        _mm256_storeu_ps(block.as_mut_ptr(), _mm256_mul_ps(samples, weights));
        base += LANES;
    }
    // `base` now equals the index of the first sample not covered above.
    for (offset, sample) in blocks.into_remainder().iter_mut().enumerate() {
        let position = (start_t + (base + offset) as f32 * dt).clamp(0.0, 1.0);
        *sample *= (position * std::f32::consts::FRAC_PI_2).sin();
    }
}
1190
/// SSE kernel for the fade-in: scales `buf[k]` by `sin(t * PI/2)` with
/// `t = clamp(start_t + k * dt, 0, 1)`.
///
/// The four gains of each block are evaluated one by one in scalar code;
/// only the multiplication with the samples is vectorised. Samples beyond
/// the last full block of four are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_in_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    let vector_len = buf.len() / 4 * 4;
    let (body, tail) = buf.split_at_mut(vector_len);
    for (block, quad) in body.chunks_exact_mut(4).enumerate() {
        let first = block * 4;
        let mut curve = [0.0f32; 4];
        for (offset, slot) in curve.iter_mut().enumerate() {
            let position = (start_t + (first + offset) as f32 * dt).clamp(0.0, 1.0);
            *slot = (position * std::f32::consts::FRAC_PI_2).sin();
        }
        let samples = _mm_loadu_ps(quad.as_ptr());
        let weights = _mm_loadu_ps(curve.as_ptr());
        _mm_storeu_ps(quad.as_mut_ptr(), _mm_mul_ps(samples, weights));
    }
    // The tail starts at absolute index `vector_len`.
    for (offset, sample) in tail.iter_mut().enumerate() {
        let position = (start_t + (vector_len + offset) as f32 * dt).clamp(0.0, 1.0);
        *sample *= (position * std::f32::consts::FRAC_PI_2).sin();
    }
}
1212
/// AVX kernel for the fade-out: scales `buf[k]` by `cos(t * PI/2)` with
/// `t = clamp(start_t + k * dt, 0, 1)`.
///
/// The eight gains of each block are evaluated one by one in scalar code;
/// only the multiplication with the samples is vectorised. Samples beyond
/// the last full block of eight are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support AVX.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "avx")]
unsafe fn apply_fade_out_inplace_avx(buf: &mut [f32], start_t: f32, dt: f32) {
    let half_pi = std::f32::consts::FRAC_PI_2;
    let total = buf.len();
    let vector_end = total - total % 8;
    let data = buf.as_mut_ptr();
    for first in (0..vector_end).step_by(8) {
        let mut curve = [0.0f32; 8];
        for (offset, slot) in curve.iter_mut().enumerate() {
            let position = (start_t + (first + offset) as f32 * dt).clamp(0.0, 1.0);
            *slot = (position * half_pi).cos();
        }
        // `first + 8 <= vector_end <= total`, so the block is in bounds.
        let at = data.add(first);
        let faded = _mm256_mul_ps(_mm256_loadu_ps(at), _mm256_loadu_ps(curve.as_ptr()));
        _mm256_storeu_ps(at, faded);
    }
    for (sample, index) in buf[vector_end..].iter_mut().zip(vector_end..) {
        let position = (start_t + index as f32 * dt).clamp(0.0, 1.0);
        *sample *= (position * half_pi).cos();
    }
}
1234
/// SSE kernel for the fade-out: scales `buf[k]` by `cos(t * PI/2)` with
/// `t = clamp(start_t + k * dt, 0, 1)`.
///
/// The four gains of each block are evaluated one by one in scalar code;
/// only the multiplication with the samples is vectorised. Samples beyond
/// the last full block of four are handled by a scalar loop.
///
/// # Safety
///
/// The CPU must support SSE.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[target_feature(enable = "sse")]
unsafe fn apply_fade_out_inplace_sse(buf: &mut [f32], start_t: f32, dt: f32) {
    const LANES: usize = 4;
    let mut quads = buf.chunks_exact_mut(LANES);
    let mut consumed = 0usize;
    for quad in quads.by_ref() {
        let mut curve = [0.0f32; LANES];
        for (offset, slot) in curve.iter_mut().enumerate() {
            let position = (start_t + (consumed + offset) as f32 * dt).clamp(0.0, 1.0);
            *slot = (position * std::f32::consts::FRAC_PI_2).cos();
        }
        let samples = _mm_loadu_ps(quad.as_ptr());
        let weights = _mm_loadu_ps(curve.as_ptr());
        _mm_storeu_ps(quad.as_mut_ptr(), _mm_mul_ps(samples, weights));
        consumed += LANES;
    }
    // `consumed` now equals the index of the first sample not covered above.
    for (offset, sample) in quads.into_remainder().iter_mut().enumerate() {
        let position = (start_t + (consumed + offset) as f32 * dt).clamp(0.0, 1.0);
        *sample *= (position * std::f32::consts::FRAC_PI_2).cos();
    }
}
1256
#[cfg(test)]
mod tests {
    use super::*;

    // Five elements are used in most cases so that a four-lane kernel has to
    // run both its vector body and its scalar tail.

    /// Element-wise sum of two equal-length buffers.
    #[test]
    fn add_inplace_basic() {
        let addend = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        let mut acc = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        add_inplace(&mut acc, &addend);
        let expected = [11.0f32, 22.0, 33.0, 44.0, 55.0];
        assert_eq!(acc, expected);
    }

    /// Every element is scaled by the same gain.
    #[test]
    fn mul_inplace_basic() {
        let mut samples = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        mul_inplace(&mut samples, 2.0);
        let expected = [2.0f32, 4.0, 6.0, 8.0, 10.0];
        assert_eq!(samples, expected);
    }

    /// `dst += src * gain`.
    #[test]
    fn add_scaled_inplace_basic() {
        let addend = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        let mut acc = [1.0f32, 2.0, 3.0, 4.0, 5.0];
        add_scaled_inplace(&mut acc, &addend, 0.5);
        let expected = [6.0f32, 12.0, 18.0, 24.0, 30.0];
        assert_eq!(acc, expected);
    }

    /// `dst = src * gain`, overwriting the previous contents.
    #[test]
    fn copy_scaled_inplace_basic() {
        let input = [10.0f32, 20.0, 30.0, 40.0, 50.0];
        let mut output = [0.0f32; 5];
        copy_scaled_inplace(&mut output, &input, 0.5);
        let expected = [5.0f32, 10.0, 15.0, 20.0, 25.0];
        assert_eq!(output, expected);
    }

    /// NaN and infinite source samples are expected to leave the
    /// destination sample unchanged.
    #[test]
    fn add_sanitized_inplace_basic() {
        let addend = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        let mut acc = [1.0f32, 2.0, 3.0, 4.0];
        add_sanitized_inplace(&mut acc, &addend);
        let expected = [1.5f32, 2.0, 3.0, 5.0];
        for (got, want) in acc.iter().zip(expected.iter()) {
            assert!(got.is_finite() && got == want);
        }
    }

    /// NaN and infinite source samples are expected to be copied as zero.
    #[test]
    fn copy_sanitized_inplace_basic() {
        let input = [0.5f32, f32::NAN, f32::INFINITY, 1.0];
        let mut output = [0.0f32; 4];
        copy_sanitized_inplace(&mut output, &input);
        let expected = [0.5f32, 0.0, 0.0, 1.0];
        for (got, want) in output.iter().zip(expected.iter()) {
            assert!(got.is_finite() && got == want);
        }
    }

    /// Non-finite samples are expected to be replaced by zero, finite ones
    /// to be kept.
    #[test]
    fn sanitize_finite_inplace_basic() {
        let mut samples = [1.0f32, f32::NAN, f32::INFINITY, 4.0, f32::NEG_INFINITY];
        sanitize_finite_inplace(&mut samples);
        let expected = [1.0f32, 0.0, 0.0, 4.0, 0.0];
        assert_eq!(samples, expected);
    }

    /// The peak is the largest magnitude, regardless of sign.
    #[test]
    fn peak_abs_basic() {
        let samples = [1.0f32, -3.0, 2.0, 0.5];
        let peak = peak_abs(&samples);
        assert_eq!(peak, 3.0);
    }

    /// Values outside the bounds are pinned to them; others are untouched.
    #[test]
    fn clamp_inplace_basic() {
        let mut samples = [-2.0f32, -0.5, 0.0, 0.5, 2.0];
        clamp_inplace(&mut samples, -1.0, 1.0);
        let expected = [-1.0f32, -0.5, 0.0, 0.5, 1.0];
        assert_eq!(samples, expected);
    }
}