Skip to main content

oximedia_codec/simd/
mod.rs

1//! SIMD abstraction layer for video codec implementations.
2//!
3//! This module provides a unified interface for SIMD operations used in
4//! video encoding and decoding. It abstracts over different SIMD instruction
5//! sets (AVX2, AVX-512, NEON) while providing a scalar fallback for portability.
6//!
7//! # Architecture
8//!
9//! The SIMD abstraction consists of:
10//!
11//! - **Types** (`types.rs`): Vector types like `I16x8`, `I32x4`, `U8x16`
12//! - **Traits** (`traits.rs`): `SimdOps` and `SimdOpsExt` for SIMD operations
13//! - **Architecture-specific**: x86 (AVX2/AVX-512), ARM (NEON), scalar fallback
14//! - **Codec-specific**: AV1 and VP9 optimized operations
15//! - **Operations**: Domain-specific modules for codec operations
16//!
17//! # Usage
18//!
19//! ```ignore
20//! use oximedia_codec::simd::{detect_simd, select_transform_impl};
21//!
22//! // Detect SIMD capabilities
23//! let caps = detect_simd();
24//! println!("Best SIMD: {}", caps.best_level());
25//!
26//! // Use codec-specific SIMD operations
27//! use oximedia_codec::simd::av1::TransformSimd;
28//! let transform = TransformSimd::new(select_transform_impl());
29//! transform.forward_dct_8x8(&input, &mut output);
30//! ```
31//!
32//! # Feature Detection and Dispatch
33//!
34//! The SIMD implementation is selected at runtime based on CPU capabilities:
35//!
36//! ```ignore
37//! use oximedia_codec::simd::{SimdCapabilities, detect_simd};
38//!
39//! let caps = detect_simd();
40//! if caps.avx512 {
41//!     // Use AVX-512 optimized path
42//! } else if caps.avx2 {
43//!     // Use AVX2 path
44//! } else if caps.neon {
45//!     // Use ARM NEON path
46//! } else {
47//!     // Use scalar fallback
48//! }
49//! ```
50//!
51//! ## SIMD Dispatch Mechanism
52//!
//! OxiMedia uses a three-tier dispatch strategy to guarantee correctness on every target
//! while achieving maximum throughput on modern hardware.
55//!
56//! **Tier 1: Compile-time `cfg` selection.**
57//! Target-specific code paths are gated with `#[cfg(target_arch = "...")]`, so only the
58//! code relevant to the current build target is compiled in:
59//!
60//! - `x86_64` — AVX-512 (`avx512f` + `avx512bw` + `avx512dq`), AVX2, SSE4.2 paths
61//! - `aarch64` — ARM NEON path (always present on AArch64)
62//! - `wasm32` — WASM SIMD128 path (`simd/wasm.rs`, `core::arch::wasm32` intrinsics)
63//! - All other targets — scalar fallback only
64//!
65//! **Tier 2: Runtime [`SimdCapabilities`] detection.**
66//! Even on `x86_64`, AVX-512 may not be available at runtime. [`detect_simd`] probes the
67//! CPU at startup using `is_x86_feature_detected!` and fills a [`SimdCapabilities`] struct:
68//!
69//! ```ignore
70//! use oximedia_codec::simd::{SimdCapabilities, detect_simd};
71//!
72//! let caps: SimdCapabilities = detect_simd();
73//! if caps.avx512 {
74//!     // 512-bit vector path — Ice Lake, Skylake-X, Zen 4+
75//! } else if caps.avx2 {
76//!     // 256-bit vector path — Haswell 2013+, Excavator 2015+
77//! } else if caps.neon {
78//!     // ARM NEON path — all ARMv8/AArch64
79//! } else {
80//!     // Pure scalar fallback
81//! }
82//! ```
83//!
84//! The `get_simd()` helper encapsulates the dispatch and returns a `&'static dyn SimdOps`:
85//!
86//! ```ignore
87//! use oximedia_codec::simd::get_simd;
88//!
89//! let ops = get_simd();  // picks AVX-512 → AVX2 → NEON → scalar
90//! ops.sad_8x8(&src, &ref_block); // calls fastest available path
91//! ```
92//!
93//! **Tier 3: Scalar fallback.**
94//! [`ScalarFallback`] provides a 100% pure-Rust implementation of every [`SimdOps`]
95//! operation. It is always compiled in and always selected when no SIMD extension is
96//! detected. This means OxiMedia:
97//!
98//! - compiles on any Rust target (including `wasm32`, `riscv64`, `mips`, etc.)
99//! - runs correctly on any hardware, even without SIMD support
100//! - achieves SIMD acceleration silently when the extension is available
101//!
102//! No unsafe dispatch tables or runtime dynamic linking are used; all dispatch paths are
103//! statically allocated (`static AVX2_INSTANCE: Avx2Simd = Avx2Simd`) and accessed
104//! via a single `&'static dyn SimdOps` fat pointer.
105
106#![allow(unsafe_code)]
107
108// Core modules
109pub mod scalar;
110pub mod traits;
111pub mod types;
112
113// Architecture-specific implementations
114pub mod arm;
115pub mod x86;
116
117// Codec-specific SIMD operations
118pub mod av1;
119pub mod vp9;
120
121// Legacy operation modules (preserved for compatibility)
122pub mod blend;
123pub mod dct;
124pub mod filter;
125pub mod sad;
126
127// Pixel format conversion (YUV ↔ RGB, all subsampling modes)
128pub mod pixel_convert;
129
130// YUV subsampling format conversion (4:2:0 ↔ 4:2:2 ↔ 4:4:4, NV12 ↔ I420)
131pub mod yuv_convert;
132
133// Re-exports
134pub use blend::{blend_ops, BlendOps};
135pub use dct::{dct_ops, DctOps};
136pub use filter::{filter_ops, FilterOps};
137pub use sad::{sad_ops, SadOps};
138pub use traits::{SimdOps, SimdOpsExt, SimdSelector};
139pub use types::{I16x16, I16x8, I32x4, I32x8, U8x16, U8x32};
140
141// Architecture-specific re-exports
142pub use arm::NeonSimd;
143pub use scalar::ScalarFallback;
144pub use x86::{Avx2Simd, Avx512Simd};
145
146// Codec-specific re-exports
147pub use av1::{CdefSimd, IntraPredSimd, LoopFilterSimd, MotionCompSimd, TransformSimd};
148pub use vp9::{Vp9DctSimd, Vp9InterpolateSimd, Vp9IntraPredSimd, Vp9LoopFilterSimd};
149
150// ============================================================================
151// CPU Feature Detection and Dispatch
152// ============================================================================
153
/// SIMD instruction sets available on the current CPU.
///
/// Values are normally produced by [`detect_simd`]. [`SimdCapabilities::none`]
/// (identical to [`Default::default`]) describes a CPU without any of the
/// supported extensions, for which only the scalar fallback applies.
///
/// The type is a plain set of flags: it is `Copy`, comparable with `==`, and
/// hashable.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
// One independent flag per instruction set: several can be set at once
// (e.g. AVX2 together with AVX-512), so a single enum would not fit.
#[allow(clippy::struct_excessive_bools)]
pub struct SimdCapabilities {
    /// x86 AVX2 support (Intel Haswell 2013+, AMD Excavator 2015+).
    pub avx2: bool,

    /// x86 AVX-512 support (Intel Skylake-X 2017+, Ice Lake 2019+).
    pub avx512: bool,

    /// ARM NEON support (all ARMv8/AArch64, ARMv7-A with NEON).
    pub neon: bool,
}

impl SimdCapabilities {
    /// Returns a capability set with every SIMD feature disabled.
    #[must_use]
    pub const fn none() -> Self {
        Self {
            avx2: false,
            avx512: false,
            neon: false,
        }
    }

    /// Returns `true` when AVX2 is available.
    #[inline]
    #[must_use]
    pub const fn has_avx2(&self) -> bool {
        self.avx2
    }

    /// Returns `true` when AVX-512 is available.
    #[inline]
    #[must_use]
    pub const fn has_avx512(&self) -> bool {
        self.avx512
    }

    /// Returns `true` when NEON is available.
    #[inline]
    #[must_use]
    pub const fn has_neon(&self) -> bool {
        self.neon
    }

    /// Returns the name of the most capable SIMD level in this set.
    ///
    /// The priority order is `"avx512"`, then `"avx2"`, then `"neon"`;
    /// `"scalar"` is returned when no flag is set.
    #[must_use]
    pub const fn best_level(&self) -> &'static str {
        // Arms are ordered from highest to lowest priority.
        match (self.avx512, self.avx2, self.neon) {
            (true, _, _) => "avx512",
            (false, true, _) => "avx2",
            (false, false, true) => "neon",
            (false, false, false) => "scalar",
        }
    }
}
217
218/// Detect CPU SIMD capabilities at runtime.
219///
220/// This function uses CPU feature detection to determine which SIMD
221/// instruction sets are available on the current processor.
222///
223/// # Returns
224///
225/// A `SimdCapabilities` struct indicating which SIMD features are available.
226///
227/// # Example
228///
229/// ```ignore
230/// use oximedia_codec::simd::detect_simd;
231///
232/// let caps = detect_simd();
233/// println!("Running on: {}", caps.best_level());
234/// ```
235#[must_use]
236pub fn detect_simd() -> SimdCapabilities {
237    #[cfg(target_arch = "x86_64")]
238    {
239        SimdCapabilities {
240            avx2: is_x86_feature_detected!("avx2"),
241            avx512: is_x86_feature_detected!("avx512f")
242                && is_x86_feature_detected!("avx512bw")
243                && is_x86_feature_detected!("avx512dq"),
244            neon: false,
245        }
246    }
247
248    #[cfg(target_arch = "aarch64")]
249    {
250        // On AArch64, NEON is always available
251        SimdCapabilities {
252            avx2: false,
253            avx512: false,
254            neon: true,
255        }
256    }
257
258    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
259    {
260        SimdCapabilities::none()
261    }
262}
263
/// Backend used to execute transform operations.
///
/// Each variant names one of the SIMD code paths a transform can run on;
/// [`select_transform_impl`] picks the variant matching the current CPU.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TransformImpl {
    /// x86 AVX-512 code path.
    Avx512,
    /// x86 AVX2 code path.
    Avx2,
    /// ARM NEON code path.
    Neon,
    /// Portable scalar code path, used when no SIMD extension is selected.
    Scalar,
}
279
280/// Select the best transform implementation for the current CPU.
281///
282/// This function detects CPU capabilities and returns the optimal
283/// transform implementation.
284///
285/// # Returns
286///
287/// The best available `TransformImpl` for the current CPU.
288#[must_use]
289pub fn select_transform_impl() -> TransformImpl {
290    let caps = detect_simd();
291
292    if caps.has_avx512() {
293        TransformImpl::Avx512
294    } else if caps.has_avx2() {
295        TransformImpl::Avx2
296    } else if caps.has_neon() {
297        TransformImpl::Neon
298    } else {
299        TransformImpl::Scalar
300    }
301}
302
303// Static instances for each SIMD implementation
304static SCALAR_INSTANCE: ScalarFallback = ScalarFallback;
305
306#[cfg(target_arch = "x86_64")]
307static AVX2_INSTANCE: Avx2Simd = Avx2Simd;
308
309#[cfg(target_arch = "x86_64")]
310static AVX512_INSTANCE: Avx512Simd = Avx512Simd;
311
312#[cfg(target_arch = "aarch64")]
313static NEON_INSTANCE: NeonSimd = NeonSimd;
314
315/// Get the best SIMD implementation for the current CPU.
316///
317/// Returns a reference to the optimal SIMD implementation based on
318/// detected CPU capabilities. This provides dynamic dispatch to the
319/// fastest available implementation.
320///
321/// # Returns
322///
323/// A static reference to a `SimdOps` implementation.
324#[must_use]
325pub fn get_simd() -> &'static dyn SimdOps {
326    #[cfg(target_arch = "x86_64")]
327    {
328        if Avx512Simd::is_available() {
329            return &AVX512_INSTANCE;
330        } else if Avx2Simd::is_available() {
331            return &AVX2_INSTANCE;
332        }
333    }
334
335    #[cfg(target_arch = "aarch64")]
336    {
337        if NeonSimd::is_available() {
338            return &NEON_INSTANCE;
339        }
340    }
341
342    &SCALAR_INSTANCE
343}
344
345/// Get the best extended SIMD implementation for the current CPU.
346///
347/// Returns a reference to the optimal extended SIMD implementation
348/// (with additional operations like transpose and butterfly).
349///
350/// # Returns
351///
352/// A static reference to a `SimdOpsExt` implementation.
353#[must_use]
354pub fn get_simd_ext() -> &'static dyn SimdOpsExt {
355    #[cfg(target_arch = "x86_64")]
356    {
357        if Avx512Simd::is_available() {
358            return &AVX512_INSTANCE;
359        } else if Avx2Simd::is_available() {
360            return &AVX2_INSTANCE;
361        }
362    }
363
364    #[cfg(target_arch = "aarch64")]
365    {
366        if NeonSimd::is_available() {
367            return &NEON_INSTANCE;
368        }
369    }
370
371    &SCALAR_INSTANCE
372}
373
374// ============================================================================
375// Legacy Compatibility
376// ============================================================================
377
378/// Legacy scalar SIMD accessor (deprecated, use `ScalarFallback` directly).
379#[deprecated(
380    since = "0.1.0",
381    note = "Use &SCALAR_INSTANCE or ScalarFallback directly"
382)]
383#[must_use]
384pub fn scalar_simd() -> &'static ScalarFallback {
385    &SCALAR_INSTANCE
386}
387
388/// Legacy capabilities detection (deprecated, use `detect_simd` instead).
389#[deprecated(since = "0.1.0", note = "Use detect_simd() instead")]
390#[must_use]
391pub fn detect_capabilities() -> SimdCapabilities {
392    detect_simd()
393}
394
#[cfg(test)]
mod tests {
    use super::*;

    /// Names that `best_level()` and a backend's `name()` are expected to use.
    const KNOWN_LEVELS: [&str; 4] = ["scalar", "avx2", "avx512", "neon"];

    #[test]
    fn test_detect_simd() {
        let caps = detect_simd();

        // The reported level must be one of the known names.
        let level = caps.best_level();
        assert!(
            KNOWN_LEVELS.contains(&level),
            "Unexpected SIMD level: {}",
            level
        );

        // Flags must be consistent with the compilation target:
        // NEON is reported on AArch64 only, AVX flags on x86_64 only.
        assert_eq!(caps.has_neon(), cfg!(target_arch = "aarch64"));
        if !cfg!(target_arch = "x86_64") {
            assert!(!caps.has_avx2());
            assert!(!caps.has_avx512());
        }

        // At least one implementation should be available
        assert!(get_simd().is_available());
    }

    #[test]
    fn test_simd_capabilities() {
        let caps = SimdCapabilities::none();
        assert!(!caps.has_avx2());
        assert!(!caps.has_avx512());
        assert!(!caps.has_neon());
        assert_eq!(caps.best_level(), "scalar");
    }

    #[test]
    fn test_get_simd() {
        let simd = get_simd();
        assert!(simd.is_available());

        // Check that the name matches expected values
        let name = simd.name();
        assert!(
            KNOWN_LEVELS.iter().any(|&known| name == known),
            "Unexpected SIMD name: {}",
            name
        );
    }

    #[test]
    fn test_get_simd_ext() {
        let simd = get_simd_ext();
        assert!(simd.is_available());
    }

    #[test]
    fn test_select_transform_impl() {
        // The exhaustive match doubles as a compile-time check that every
        // variant has a level name.
        let selected_level = match select_transform_impl() {
            TransformImpl::Avx512 => "avx512",
            TransformImpl::Avx2 => "avx2",
            TransformImpl::Neon => "neon",
            TransformImpl::Scalar => "scalar",
        };

        // Selection and `best_level()` use the same priority order over the
        // same detected capabilities, so they must agree.
        assert_eq!(selected_level, detect_simd().best_level());
    }

    #[test]
    fn test_module_reexports() {
        // Test that all reexports work
        let _v = I16x8::zero();
        let _v = I32x4::zero();
        let _v = U8x16::zero();

        let _ops = sad_ops();
        let _ops = blend_ops();
        let _ops = dct_ops();
        let _ops = filter_ops();
    }

    #[test]
    fn test_architecture_specific() {
        // Test that architecture-specific types are accessible
        let _scalar = ScalarFallback::new();

        #[cfg(target_arch = "x86_64")]
        {
            let _avx2 = Avx2Simd::new();
            let _avx512 = Avx512Simd::new();
        }

        #[cfg(target_arch = "aarch64")]
        {
            let _neon = NeonSimd::new();
        }
    }

    #[test]
    fn test_codec_specific_types() {
        // Verify codec-specific types are accessible.
        // `ScalarFallback` is already in scope through `use super::*`.
        let simd = ScalarFallback::new();

        // AV1
        let _transform = TransformSimd::new(simd);
        let _loop_filter = LoopFilterSimd::new(simd);
        let _cdef = CdefSimd::new(simd);
        let _intra = IntraPredSimd::new(simd);
        let _motion = MotionCompSimd::new(simd);

        // VP9
        let _vp9_dct = Vp9DctSimd::new(simd);
        let _vp9_interp = Vp9InterpolateSimd::new(simd);
        let _vp9_intra = Vp9IntraPredSimd::new(simd);
        let _vp9_lf = Vp9LoopFilterSimd::new(simd);
    }

    #[test]
    fn test_integration_sad() {
        let sad = sad_ops();

        // Identical blocks must have a sum of absolute differences of zero.
        let src = [128u8; 64];
        let ref_block = [128u8; 64];

        let result = sad.sad_8x8(&src, 8, &ref_block, 8);
        assert_eq!(result, 0);
    }

    #[test]
    fn test_integration_blend() {
        let blend = blend_ops();

        // Test linear interpolation: the midpoint of 0 and 255, with a small
        // tolerance for rounding.
        let result = blend.lerp_u8(0, 255, 128);
        assert!(
            (126..=130).contains(&result),
            "lerp_u8(0, 255, 128) out of range: {}",
            result
        );
    }

    #[test]
    fn test_integration_dct() {
        let dct = dct_ops();

        // Test DCT round-trip
        let input = [100i16; 16];
        let mut dct_out = [0i16; 16];
        let mut reconstructed = [0i16; 16];

        dct.forward_dct_4x4(&input, &mut dct_out);
        dct.inverse_dct_4x4(&dct_out, &mut reconstructed);

        // Should be close to original
        for (i, (&expected, &actual)) in input.iter().zip(reconstructed.iter()).enumerate() {
            let diff = (expected - actual).abs();
            assert!(
                diff <= 2,
                "DCT mismatch at {}: {} vs {}",
                i,
                expected,
                actual
            );
        }
    }

    #[test]
    fn test_integration_filter() {
        let filter = filter_ops();

        // Test 2-tap filter on constant input: the output must stay constant.
        let src = [128u8; 16];
        let mut dst = [0u8; 15];

        filter.filter_h_2tap(&src, &mut dst, 15);

        assert_eq!(dst, [128u8; 15]);
    }
}