codec_core/utils/
simd.rs

1//! SIMD utilities for cross-platform optimizations
2//!
3//! This module provides SIMD capability detection and optimized operations
4//! for audio processing across different architectures.
5
6use std::sync::OnceLock;
7
/// Snapshot of the SIMD instruction sets this module knows how to query.
///
/// Every flag is a plain `bool`; a flag belonging to an architecture other
/// than the one the crate was compiled for is always `false`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct SimdSupport {
    /// `true` when SSE2 was detected (x86_64 only).
    pub sse2: bool,
    /// `true` when AVX2 was detected (x86_64 only).
    pub avx2: bool,
    /// `true` when NEON was detected (AArch64 only).
    pub neon: bool,
}
18
19/// Global SIMD support detection
20static SIMD_SUPPORT: OnceLock<SimdSupport> = OnceLock::new();
21
22/// Initialize SIMD support detection
23pub fn init_simd_support() {
24    SIMD_SUPPORT.get_or_init(|| detect_simd_support());
25}
26
27/// Internal function to detect SIMD support
28fn detect_simd_support() -> SimdSupport {
29    #[cfg(target_arch = "x86_64")]
30    {
31        SimdSupport {
32            sse2: is_x86_feature_detected!("sse2"),
33            avx2: is_x86_feature_detected!("avx2"),
34            neon: false,
35        }
36    }
37    #[cfg(target_arch = "aarch64")]
38    {
39        SimdSupport {
40            sse2: false,
41            avx2: false,
42            neon: std::arch::is_aarch64_feature_detected!("neon"),
43        }
44    }
45    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
46    {
47        SimdSupport {
48            sse2: false,
49            avx2: false,
50            neon: false,
51        }
52    }
53}
54
55/// Get SIMD support information
56pub fn get_simd_support() -> SimdSupport {
57    *SIMD_SUPPORT.get_or_init(|| detect_simd_support())
58}
59
60/// Check if any SIMD support is available
61pub fn has_simd_support() -> bool {
62    let support = get_simd_support();
63    support.sse2 || support.avx2 || support.neon
64}
65
66/// SIMD-optimized μ-law encoding (x86_64 SSE2)
67#[cfg(target_arch = "x86_64")]
68pub fn encode_mulaw_simd_sse2(samples: &[i16], output: &mut [u8]) {
69    use std::arch::x86_64::*;
70    
71    if !get_simd_support().sse2 {
72        return encode_mulaw_scalar(samples, output);
73    }
74    
75    let mut chunks = samples.chunks_exact(8);
76    let mut out_idx = 0;
77    
78    unsafe {
79        for chunk in chunks.by_ref() {
80            // Load 8 samples at once
81            let samples_vec = _mm_loadu_si128(chunk.as_ptr() as *const __m128i);
82            
83            // Process each sample - need to unroll or use different approach
84            // _mm_extract_epi16 requires compile-time constant, so we unroll
85            output[out_idx] = linear_to_mulaw_scalar(_mm_extract_epi16(samples_vec, 0) as i16);
86            output[out_idx + 1] = linear_to_mulaw_scalar(_mm_extract_epi16(samples_vec, 1) as i16);
87            output[out_idx + 2] = linear_to_mulaw_scalar(_mm_extract_epi16(samples_vec, 2) as i16);
88            output[out_idx + 3] = linear_to_mulaw_scalar(_mm_extract_epi16(samples_vec, 3) as i16);
89            output[out_idx + 4] = linear_to_mulaw_scalar(_mm_extract_epi16(samples_vec, 4) as i16);
90            output[out_idx + 5] = linear_to_mulaw_scalar(_mm_extract_epi16(samples_vec, 5) as i16);
91            output[out_idx + 6] = linear_to_mulaw_scalar(_mm_extract_epi16(samples_vec, 6) as i16);
92            output[out_idx + 7] = linear_to_mulaw_scalar(_mm_extract_epi16(samples_vec, 7) as i16);
93            out_idx += 8;
94        }
95    }
96    
97    // Handle remainder
98    for &sample in chunks.remainder() {
99        output[out_idx] = linear_to_mulaw_scalar(sample);
100        out_idx += 1;
101    }
102}
103
/// μ-law encoding entry point for AArch64 NEON.
///
/// No NEON kernel exists yet: whether or not NEON is reported by
/// [`get_simd_support`], the work is done by [`encode_mulaw_scalar`].
#[cfg(target_arch = "aarch64")]
pub fn encode_mulaw_simd_neon(samples: &[i16], output: &mut [u8]) {
    let neon_available = get_simd_support().neon;
    if neon_available {
        // Placeholder for a future NEON kernel; scalar for now.
        encode_mulaw_scalar(samples, output);
        return;
    }

    encode_mulaw_scalar(samples, output)
}
114
115/// Scalar μ-law encoding fallback
116pub fn encode_mulaw_scalar(samples: &[i16], output: &mut [u8]) {
117    for (i, &sample) in samples.iter().enumerate() {
118        output[i] = linear_to_mulaw_scalar(sample);
119    }
120}
121
122/// SIMD-optimized A-law encoding (x86_64 SSE2)
123#[cfg(target_arch = "x86_64")]
124pub fn encode_alaw_simd_sse2(samples: &[i16], output: &mut [u8]) {
125    use std::arch::x86_64::*;
126    
127    if !get_simd_support().sse2 {
128        return encode_alaw_scalar(samples, output);
129    }
130    
131    let mut chunks = samples.chunks_exact(8);
132    let mut out_idx = 0;
133    
134    unsafe {
135        for chunk in chunks.by_ref() {
136            // Load 8 samples at once
137            let samples_vec = _mm_loadu_si128(chunk.as_ptr() as *const __m128i);
138            
139            // Process each sample - need to unroll or use different approach
140            // _mm_extract_epi16 requires compile-time constant, so we unroll
141            output[out_idx] = linear_to_alaw_scalar(_mm_extract_epi16(samples_vec, 0) as i16);
142            output[out_idx + 1] = linear_to_alaw_scalar(_mm_extract_epi16(samples_vec, 1) as i16);
143            output[out_idx + 2] = linear_to_alaw_scalar(_mm_extract_epi16(samples_vec, 2) as i16);
144            output[out_idx + 3] = linear_to_alaw_scalar(_mm_extract_epi16(samples_vec, 3) as i16);
145            output[out_idx + 4] = linear_to_alaw_scalar(_mm_extract_epi16(samples_vec, 4) as i16);
146            output[out_idx + 5] = linear_to_alaw_scalar(_mm_extract_epi16(samples_vec, 5) as i16);
147            output[out_idx + 6] = linear_to_alaw_scalar(_mm_extract_epi16(samples_vec, 6) as i16);
148            output[out_idx + 7] = linear_to_alaw_scalar(_mm_extract_epi16(samples_vec, 7) as i16);
149            out_idx += 8;
150        }
151    }
152    
153    // Handle remainder
154    for &sample in chunks.remainder() {
155        output[out_idx] = linear_to_alaw_scalar(sample);
156        out_idx += 1;
157    }
158}
159
/// A-law encoding entry point for AArch64 NEON.
///
/// No NEON kernel exists yet: whether or not NEON is reported by
/// [`get_simd_support`], the work is done by [`encode_alaw_scalar`].
#[cfg(target_arch = "aarch64")]
pub fn encode_alaw_simd_neon(samples: &[i16], output: &mut [u8]) {
    let neon_available = get_simd_support().neon;
    if neon_available {
        // Placeholder for a future NEON kernel; scalar for now.
        encode_alaw_scalar(samples, output);
        return;
    }

    encode_alaw_scalar(samples, output)
}
170
171/// Scalar A-law encoding fallback
172pub fn encode_alaw_scalar(samples: &[i16], output: &mut [u8]) {
173    for (i, &sample) in samples.iter().enumerate() {
174        output[i] = linear_to_alaw_scalar(sample);
175    }
176}
177
178/// Cross-platform μ-law encoding dispatcher
179pub fn encode_mulaw_optimized(samples: &[i16], output: &mut [u8]) {
180    #[cfg(target_arch = "x86_64")]
181    {
182        if get_simd_support().sse2 {
183            return encode_mulaw_simd_sse2(samples, output);
184        }
185    }
186    
187    #[cfg(target_arch = "aarch64")]
188    {
189        if get_simd_support().neon {
190            return encode_mulaw_simd_neon(samples, output);
191        }
192    }
193    
194    encode_mulaw_scalar(samples, output);
195}
196
197/// Cross-platform A-law encoding dispatcher
198pub fn encode_alaw_optimized(samples: &[i16], output: &mut [u8]) {
199    #[cfg(target_arch = "x86_64")]
200    {
201        if get_simd_support().sse2 {
202            return encode_alaw_simd_sse2(samples, output);
203        }
204    }
205    
206    #[cfg(target_arch = "aarch64")]
207    {
208        if get_simd_support().neon {
209            return encode_alaw_simd_neon(samples, output);
210        }
211    }
212    
213    encode_alaw_scalar(samples, output);
214}
215
/// Encodes one 16-bit linear PCM sample as a G.711 μ-law byte.
///
/// The magnitude is clipped to 32635, biased by 0x84, and split into a 3-bit
/// segment (the position of the highest set bit of the biased value) and a
/// 4-bit mantissa (the four bits below it). Sign, segment and mantissa are
/// packed as `s eee mmmm` and the whole byte is complemented, so silence
/// (`0`) encodes as `0xFF`.
///
/// Segment selection is derived from the highest set bit of the biased
/// magnitude; comparing the biased value against un-biased thresholds would
/// collapse every loud sample into segment 7 and lose bit 14.
pub fn linear_to_mulaw_scalar(sample: i16) -> u8 {
    const CLIP: u32 = 32635;
    const BIAS: u32 = 0x84;

    let sign: u8 = if sample < 0 { 0x80 } else { 0x00 };

    // `unsigned_abs` maps `i16::MIN` to 32768 without overflow; the clip then
    // keeps `biased` within 0x84..=0x7FFF.
    let biased = u32::from(sample.unsigned_abs()).min(CLIP) + BIAS;

    // The highest set bit of `biased` is bit 7..=14, giving segment 0..=7.
    let exponent = biased.ilog2() - 7;
    let mantissa = (biased >> (exponent + 3)) & 0x0F;

    // `(exponent << 4) | mantissa` is at most 0x7F, so the cast is lossless.
    !(sign | ((exponent << 4) | mantissa) as u8)
}

/// Encodes one 16-bit linear PCM sample as a G.711 A-law byte.
///
/// Negative samples use their ones' complement as magnitude (so `-1` shares
/// a code with `-16..=-1` and `i16::MIN` cannot overflow). The magnitude is
/// reduced to 11 bits, split into a 3-bit segment and a 4-bit mantissa, the
/// sign bit is set for non-negative samples, and the even bits of the byte
/// are toggled with `0x55`. Silence (`0`) encodes as `0xD5`.
///
/// The segment number is stored directly in bits 4..=6, so it can never spill
/// into the sign bit.
pub fn linear_to_alaw_scalar(sample: i16) -> u8 {
    const SIGN_BIT: u8 = 0x80;
    const EVEN_BIT_MASK: u8 = 0x55;

    let (sign, magnitude) = if sample < 0 {
        (0x00, !sample)
    } else {
        (SIGN_BIT, sample)
    };

    // `magnitude` is non-negative here, so the cast is lossless; the shifted
    // value lies in 0..=2047.
    let scaled = (magnitude >> 4) as u32;

    let code = if scaled < 16 {
        // Segment 0 is linear: the value is its own mantissa.
        scaled
    } else {
        // Highest set bit is bit 4..=10, giving segment 1..=7. The mantissa is
        // the four bits below that (implicit) leading one.
        let exponent = scaled.ilog2() - 3;
        let mantissa = (scaled >> (exponent - 1)) & 0x0F;
        (exponent << 4) | mantissa
    };

    // `code` is at most 0x7F, so the cast is lossless.
    (sign | code as u8) ^ EVEN_BIT_MASK
}

/// Decodes one G.711 μ-law byte to a 16-bit linear PCM sample.
///
/// Inverse of [`linear_to_mulaw_scalar`]: the byte is complemented, the
/// mantissa is re-biased and shifted into its segment, and the bias is
/// removed again. The result is the midpoint of the quantisation interval,
/// with magnitudes ranging from 0 (`0xFF`, `0x7F`) to 32124 (`0x80`, `0x00`).
pub fn mulaw_to_linear_scalar(mulaw: u8) -> i16 {
    const BIAS: i16 = 0x84;

    let code = !mulaw;
    let exponent = (code >> 4) & 0x07;
    let mantissa = i16::from(code & 0x0F);

    // Largest intermediate is ((15 << 3) + 0x84) << 7 = 32256, which fits i16.
    let magnitude = (((mantissa << 3) + BIAS) << exponent) - BIAS;

    if code & 0x80 != 0 {
        -magnitude
    } else {
        magnitude
    }
}

/// Decodes one G.711 A-law byte to a 16-bit linear PCM sample.
///
/// Inverse of [`linear_to_alaw_scalar`]: the even bits are toggled back, the
/// implicit leading one is restored for segments above 0, half a
/// quantisation step is added, and the value is shifted into its segment.
/// Magnitudes range from 8 (`0xD5`, `0x55`) to 32256 (`0xAA`, `0x2A`).
pub fn alaw_to_linear_scalar(alaw: u8) -> i16 {
    const SIGN_BIT: u8 = 0x80;
    const EVEN_BIT_MASK: u8 = 0x55;

    let code = alaw ^ EVEN_BIT_MASK;
    let exponent = (code >> 4) & 0x07;
    let mantissa = i16::from(code & 0x0F);

    let magnitude = match exponent {
        // Linear segment: no implicit leading bit.
        0 => (mantissa << 4) + 8,
        // Largest value is ((15 + 16) << 4) + 8 = 504, shifted by at most 6
        // bits to 32256, which fits i16.
        e => (((mantissa + 16) << 4) + 8) << (e - 1),
    };

    // In A-law a set sign bit marks a non-negative sample.
    if code & SIGN_BIT != 0 {
        magnitude
    } else {
        -magnitude
    }
}
363
#[cfg(test)]
mod tests {
    use super::*;

    /// Nineteen samples: two full 8-sample groups plus a 3-sample tail, so the
    /// grouped path and the remainder path are both exercised. Includes both
    /// `i16` extremes.
    const MIXED_SAMPLES: [i16; 19] = [
        0,
        1,
        -1,
        255,
        -256,
        1000,
        -1000,
        4095,
        -4096,
        12345,
        -12345,
        16000,
        -16000,
        32000,
        -32000,
        i16::MAX,
        i16::MIN,
        7,
        -7,
    ];

    #[test]
    fn test_simd_support_detection() {
        init_simd_support();
        let support = get_simd_support();

        // The result is cached, so repeated queries must agree with it.
        assert_eq!(support, get_simd_support());
        assert_eq!(
            has_simd_support(),
            support.sse2 || support.avx2 || support.neon
        );

        #[cfg(target_arch = "x86_64")]
        {
            // SSE2 is part of the x86_64 baseline; NEON is never reported here.
            assert!(support.sse2);
            assert!(!support.neon);
        }

        #[cfg(target_arch = "aarch64")]
        {
            // x86 feature flags are never reported on AArch64.
            assert!(!support.sse2);
            assert!(!support.avx2);
        }
    }

    #[test]
    fn test_mulaw_roundtrip() {
        let original = 12345i16;
        let encoded = linear_to_mulaw_scalar(original);
        let decoded = mulaw_to_linear_scalar(encoded);

        // G.711 is lossy, so we expect some difference
        let error = (original - decoded).abs();
        assert!(error < 1000, "Error too large: {}", error);
    }

    #[test]
    fn test_alaw_roundtrip() {
        let original = 12345i16;
        let encoded = linear_to_alaw_scalar(original);
        let decoded = alaw_to_linear_scalar(encoded);

        // G.711 A-law is lossy, so we expect some difference
        let error = (original - decoded).abs();
        assert!(
            error < 5000,
            "Error too large: {} (original: {}, decoded: {})",
            error,
            original,
            decoded
        );
    }

    #[test]
    fn test_simd_vs_scalar() {
        let samples = [0, 1000, -1000, 16000, -16000, 32000, -32000, 12345];
        let mut simd_output = vec![0u8; samples.len()];
        let mut scalar_output = vec![0u8; samples.len()];

        encode_mulaw_optimized(&samples, &mut simd_output);
        encode_mulaw_scalar(&samples, &mut scalar_output);

        // Results should be identical
        assert_eq!(simd_output, scalar_output);
    }

    #[test]
    fn test_mulaw_optimized_matches_scalar_with_remainder() {
        let mut optimized = [0u8; MIXED_SAMPLES.len()];
        let mut scalar = [0u8; MIXED_SAMPLES.len()];

        encode_mulaw_optimized(&MIXED_SAMPLES, &mut optimized);
        encode_mulaw_scalar(&MIXED_SAMPLES, &mut scalar);

        assert_eq!(optimized, scalar);
    }

    #[test]
    fn test_alaw_optimized_matches_scalar_with_remainder() {
        let mut optimized = [0u8; MIXED_SAMPLES.len()];
        let mut scalar = [0u8; MIXED_SAMPLES.len()];

        encode_alaw_optimized(&MIXED_SAMPLES, &mut optimized);
        encode_alaw_scalar(&MIXED_SAMPLES, &mut scalar);

        assert_eq!(optimized, scalar);
    }

    #[test]
    fn test_empty_input() {
        let samples: [i16; 0] = [];
        let mut output: [u8; 0] = [];

        encode_mulaw_optimized(&samples, &mut output);
        encode_alaw_optimized(&samples, &mut output);
        assert_eq!(output.len(), 0);
    }

    #[test]
    fn test_edge_cases() {
        let samples = [i16::MAX, i16::MIN, 0];
        let mut optimized = [0u8; 3];
        let mut scalar = [0u8; 3];

        // Must not panic on the extremes, and must agree with the scalar path.
        encode_mulaw_optimized(&samples, &mut optimized);
        encode_mulaw_scalar(&samples, &mut scalar);
        assert_eq!(optimized, scalar);

        encode_alaw_optimized(&samples, &mut optimized);
        encode_alaw_scalar(&samples, &mut scalar);
        assert_eq!(optimized, scalar);
    }
}