oximedia_codec/simd/mod.rs
1//! SIMD abstraction layer for video codec implementations.
2//!
3//! This module provides a unified interface for SIMD operations used in
4//! video encoding and decoding. It abstracts over different SIMD instruction
5//! sets (AVX2, AVX-512, NEON) while providing a scalar fallback for portability.
6//!
7//! # Architecture
8//!
9//! The SIMD abstraction consists of:
10//!
11//! - **Types** (`types.rs`): Vector types like `I16x8`, `I32x4`, `U8x16`
12//! - **Traits** (`traits.rs`): `SimdOps` and `SimdOpsExt` for SIMD operations
13//! - **Architecture-specific**: x86 (AVX2/AVX-512), ARM (NEON), scalar fallback
14//! - **Codec-specific**: AV1 and VP9 optimized operations
15//! - **Operations**: Domain-specific modules for codec operations
16//!
17//! # Usage
18//!
19//! ```ignore
20//! use oximedia_codec::simd::{detect_simd, select_transform_impl};
21//!
22//! // Detect SIMD capabilities
23//! let caps = detect_simd();
24//! println!("Best SIMD: {}", caps.best_level());
25//!
26//! // Use codec-specific SIMD operations
27//! use oximedia_codec::simd::av1::TransformSimd;
28//! let transform = TransformSimd::new(select_transform_impl());
29//! transform.forward_dct_8x8(&input, &mut output);
30//! ```
31//!
32//! # Feature Detection and Dispatch
33//!
34//! The SIMD implementation is selected at runtime based on CPU capabilities:
35//!
36//! ```ignore
37//! use oximedia_codec::simd::{SimdCapabilities, detect_simd};
38//!
39//! let caps = detect_simd();
40//! if caps.avx512 {
41//! // Use AVX-512 optimized path
42//! } else if caps.avx2 {
43//! // Use AVX2 path
44//! } else if caps.neon {
45//! // Use ARM NEON path
46//! } else {
47//! // Use scalar fallback
48//! }
49//! ```
50//!
51//! ## SIMD Dispatch Mechanism
52//!
//! OxiMedia uses a three-tier dispatch strategy to guarantee correctness on every target
//! while achieving maximum throughput on modern hardware.
55//!
56//! **Tier 1: Compile-time `cfg` selection.**
57//! Target-specific code paths are gated with `#[cfg(target_arch = "...")]`, so only the
58//! code relevant to the current build target is compiled in:
59//!
60//! - `x86_64` — AVX-512 (`avx512f` + `avx512bw` + `avx512dq`), AVX2, SSE4.2 paths
61//! - `aarch64` — ARM NEON path (always present on AArch64)
62//! - `wasm32` — WASM SIMD128 path (`simd/wasm.rs`, `core::arch::wasm32` intrinsics)
63//! - All other targets — scalar fallback only
64//!
65//! **Tier 2: Runtime [`SimdCapabilities`] detection.**
66//! Even on `x86_64`, AVX-512 may not be available at runtime. [`detect_simd`] probes the
67//! CPU at startup using `is_x86_feature_detected!` and fills a [`SimdCapabilities`] struct:
68//!
69//! ```ignore
70//! use oximedia_codec::simd::{SimdCapabilities, detect_simd};
71//!
72//! let caps: SimdCapabilities = detect_simd();
73//! if caps.avx512 {
74//! // 512-bit vector path — Ice Lake, Skylake-X, Zen 4+
75//! } else if caps.avx2 {
76//! // 256-bit vector path — Haswell 2013+, Excavator 2015+
77//! } else if caps.neon {
78//! // ARM NEON path — all ARMv8/AArch64
79//! } else {
80//! // Pure scalar fallback
81//! }
82//! ```
83//!
84//! The `get_simd()` helper encapsulates the dispatch and returns a `&'static dyn SimdOps`:
85//!
86//! ```ignore
87//! use oximedia_codec::simd::get_simd;
88//!
89//! let ops = get_simd(); // picks AVX-512 → AVX2 → NEON → scalar
90//! ops.sad_8x8(&src, &ref_block); // calls fastest available path
91//! ```
92//!
93//! **Tier 3: Scalar fallback.**
94//! [`ScalarFallback`] provides a 100% pure-Rust implementation of every [`SimdOps`]
95//! operation. It is always compiled in and always selected when no SIMD extension is
96//! detected. This means OxiMedia:
97//!
98//! - compiles on any Rust target (including `wasm32`, `riscv64`, `mips`, etc.)
99//! - runs correctly on any hardware, even without SIMD support
100//! - achieves SIMD acceleration silently when the extension is available
101//!
102//! No unsafe dispatch tables or runtime dynamic linking are used; all dispatch paths are
103//! statically allocated (`static AVX2_INSTANCE: Avx2Simd = Avx2Simd`) and accessed
104//! via a single `&'static dyn SimdOps` fat pointer.
105
106#![allow(unsafe_code)]
107
108// Core modules
109pub mod scalar;
110pub mod traits;
111pub mod types;
112
113// Architecture-specific implementations
114pub mod arm;
115pub mod x86;
116
117// Codec-specific SIMD operations
118pub mod av1;
119pub mod vp9;
120
121// Legacy operation modules (preserved for compatibility)
122pub mod blend;
123pub mod dct;
124pub mod filter;
125pub mod sad;
126
127// Pixel format conversion (YUV ↔ RGB, all subsampling modes)
128pub mod pixel_convert;
129
130// YUV subsampling format conversion (4:2:0 ↔ 4:2:2 ↔ 4:4:4, NV12 ↔ I420)
131pub mod yuv_convert;
132
133// Re-exports
134pub use blend::{blend_ops, BlendOps};
135pub use dct::{dct_ops, DctOps};
136pub use filter::{filter_ops, FilterOps};
137pub use sad::{sad_ops, SadOps};
138pub use traits::{SimdOps, SimdOpsExt, SimdSelector};
139pub use types::{I16x16, I16x8, I32x4, I32x8, U8x16, U8x32};
140
141// Architecture-specific re-exports
142pub use arm::NeonSimd;
143pub use scalar::ScalarFallback;
144pub use x86::{Avx2Simd, Avx512Simd};
145
146// Codec-specific re-exports
147pub use av1::{CdefSimd, IntraPredSimd, LoopFilterSimd, MotionCompSimd, TransformSimd};
148pub use vp9::{Vp9DctSimd, Vp9InterpolateSimd, Vp9IntraPredSimd, Vp9LoopFilterSimd};
149
150// ============================================================================
151// CPU Feature Detection and Dispatch
152// ============================================================================
153
/// CPU SIMD capabilities.
///
/// A snapshot of the SIMD instruction sets available on the current CPU,
/// as reported by [`detect_simd`]. Each field is an independent feature
/// flag; [`SimdCapabilities::best_level`] ranks them when a single choice
/// is needed.
///
/// The type is `Copy` and comparable (`PartialEq`/`Eq`), so snapshots can be
/// stored, passed by value, and checked with `assert_eq!`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
// The flags are independent hardware features rather than an encoded state
// machine, so plain `bool` fields are the clearest representation.
#[allow(clippy::struct_excessive_bools)]
pub struct SimdCapabilities {
    /// x86 AVX2 support (Intel Haswell 2013+, AMD Excavator 2015+).
    pub avx2: bool,

    /// x86 AVX-512 support (Intel Skylake-X 2017+, Ice Lake 2019+).
    ///
    /// [`detect_simd`] only sets this when the `avx512f`, `avx512bw` and
    /// `avx512dq` subsets are all present.
    pub avx512: bool,

    /// ARM NEON support (all ARMv8/AArch64, ARMv7-A with NEON).
    pub neon: bool,
}

impl SimdCapabilities {
    /// Create with all features disabled.
    ///
    /// Equivalent to [`Default::default`], but usable in `const` contexts.
    #[must_use]
    pub const fn none() -> Self {
        Self {
            avx2: false,
            avx512: false,
            neon: false,
        }
    }

    /// Check if AVX2 is available.
    #[inline]
    #[must_use]
    pub const fn has_avx2(&self) -> bool {
        self.avx2
    }

    /// Check if AVX-512 is available.
    #[inline]
    #[must_use]
    pub const fn has_avx512(&self) -> bool {
        self.avx512
    }

    /// Check if NEON is available.
    #[inline]
    #[must_use]
    pub const fn has_neon(&self) -> bool {
        self.neon
    }

    /// Get the best available SIMD level name.
    ///
    /// # Returns
    ///
    /// One of `"avx512"`, `"avx2"`, `"neon"` or `"scalar"`, checked in that
    /// priority order; `"scalar"` is returned when no flag is set.
    #[must_use]
    pub const fn best_level(&self) -> &'static str {
        if self.avx512 {
            "avx512"
        } else if self.avx2 {
            "avx2"
        } else if self.neon {
            "neon"
        } else {
            "scalar"
        }
    }
}
217
218/// Detect CPU SIMD capabilities at runtime.
219///
220/// This function uses CPU feature detection to determine which SIMD
221/// instruction sets are available on the current processor.
222///
223/// # Returns
224///
225/// A `SimdCapabilities` struct indicating which SIMD features are available.
226///
227/// # Example
228///
229/// ```ignore
230/// use oximedia_codec::simd::detect_simd;
231///
232/// let caps = detect_simd();
233/// println!("Running on: {}", caps.best_level());
234/// ```
235#[must_use]
236pub fn detect_simd() -> SimdCapabilities {
237 #[cfg(target_arch = "x86_64")]
238 {
239 SimdCapabilities {
240 avx2: is_x86_feature_detected!("avx2"),
241 avx512: is_x86_feature_detected!("avx512f")
242 && is_x86_feature_detected!("avx512bw")
243 && is_x86_feature_detected!("avx512dq"),
244 neon: false,
245 }
246 }
247
248 #[cfg(target_arch = "aarch64")]
249 {
250 // On AArch64, NEON is always available
251 SimdCapabilities {
252 avx2: false,
253 avx512: false,
254 neon: true,
255 }
256 }
257
258 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
259 {
260 SimdCapabilities::none()
261 }
262}
263
/// Identifies which backend a transform operation should run on.
///
/// One variant exists per SIMD backend, plus the portable scalar fallback.
/// [`select_transform_impl`] picks the variant matching the current CPU.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TransformImpl {
    /// x86 AVX-512 backend.
    Avx512,
    /// x86 AVX2 backend.
    Avx2,
    /// ARM NEON backend.
    Neon,
    /// Portable scalar fallback.
    Scalar,
}
279
280/// Select the best transform implementation for the current CPU.
281///
282/// This function detects CPU capabilities and returns the optimal
283/// transform implementation.
284///
285/// # Returns
286///
287/// The best available `TransformImpl` for the current CPU.
288#[must_use]
289pub fn select_transform_impl() -> TransformImpl {
290 let caps = detect_simd();
291
292 if caps.has_avx512() {
293 TransformImpl::Avx512
294 } else if caps.has_avx2() {
295 TransformImpl::Avx2
296 } else if caps.has_neon() {
297 TransformImpl::Neon
298 } else {
299 TransformImpl::Scalar
300 }
301}
302
303// Static instances for each SIMD implementation
304static SCALAR_INSTANCE: ScalarFallback = ScalarFallback;
305
306#[cfg(target_arch = "x86_64")]
307static AVX2_INSTANCE: Avx2Simd = Avx2Simd;
308
309#[cfg(target_arch = "x86_64")]
310static AVX512_INSTANCE: Avx512Simd = Avx512Simd;
311
312#[cfg(target_arch = "aarch64")]
313static NEON_INSTANCE: NeonSimd = NeonSimd;
314
315/// Get the best SIMD implementation for the current CPU.
316///
317/// Returns a reference to the optimal SIMD implementation based on
318/// detected CPU capabilities. This provides dynamic dispatch to the
319/// fastest available implementation.
320///
321/// # Returns
322///
323/// A static reference to a `SimdOps` implementation.
324#[must_use]
325pub fn get_simd() -> &'static dyn SimdOps {
326 #[cfg(target_arch = "x86_64")]
327 {
328 if Avx512Simd::is_available() {
329 return &AVX512_INSTANCE;
330 } else if Avx2Simd::is_available() {
331 return &AVX2_INSTANCE;
332 }
333 }
334
335 #[cfg(target_arch = "aarch64")]
336 {
337 if NeonSimd::is_available() {
338 return &NEON_INSTANCE;
339 }
340 }
341
342 &SCALAR_INSTANCE
343}
344
345/// Get the best extended SIMD implementation for the current CPU.
346///
347/// Returns a reference to the optimal extended SIMD implementation
348/// (with additional operations like transpose and butterfly).
349///
350/// # Returns
351///
352/// A static reference to a `SimdOpsExt` implementation.
353#[must_use]
354pub fn get_simd_ext() -> &'static dyn SimdOpsExt {
355 #[cfg(target_arch = "x86_64")]
356 {
357 if Avx512Simd::is_available() {
358 return &AVX512_INSTANCE;
359 } else if Avx2Simd::is_available() {
360 return &AVX2_INSTANCE;
361 }
362 }
363
364 #[cfg(target_arch = "aarch64")]
365 {
366 if NeonSimd::is_available() {
367 return &NEON_INSTANCE;
368 }
369 }
370
371 &SCALAR_INSTANCE
372}
373
374// ============================================================================
375// Legacy Compatibility
376// ============================================================================
377
378/// Legacy scalar SIMD accessor (deprecated, use `ScalarFallback` directly).
379#[deprecated(
380 since = "0.1.0",
381 note = "Use &SCALAR_INSTANCE or ScalarFallback directly"
382)]
383#[must_use]
384pub fn scalar_simd() -> &'static ScalarFallback {
385 &SCALAR_INSTANCE
386}
387
388/// Legacy capabilities detection (deprecated, use `detect_simd` instead).
389#[deprecated(since = "0.1.0", note = "Use detect_simd() instead")]
390#[must_use]
391pub fn detect_capabilities() -> SimdCapabilities {
392 detect_simd()
393}
394
#[cfg(test)]
mod tests {
    use super::*;

    /// Detection always yields a named level and a usable implementation.
    #[test]
    fn test_detect_simd() {
        let caps = detect_simd();

        // `best_level` always names something, falling back to "scalar".
        assert!(!caps.best_level().is_empty());

        // Whatever `get_simd` selects must report itself as available.
        assert!(get_simd().is_available());
    }

    /// `none()` disables every flag, and `best_level` follows the
    /// AVX-512 > AVX2 > NEON > scalar priority.
    #[test]
    fn test_simd_capabilities() {
        let caps = SimdCapabilities::none();
        assert!(!caps.has_avx2());
        assert!(!caps.has_avx512());
        assert!(!caps.has_neon());
        assert_eq!(caps.best_level(), "scalar");

        let x86_full = SimdCapabilities {
            avx2: true,
            avx512: true,
            neon: false,
        };
        assert_eq!(x86_full.best_level(), "avx512");

        let avx2_only = SimdCapabilities {
            avx2: true,
            avx512: false,
            neon: false,
        };
        assert_eq!(avx2_only.best_level(), "avx2");

        let neon_only = SimdCapabilities {
            avx2: false,
            avx512: false,
            neon: true,
        };
        assert_eq!(neon_only.best_level(), "neon");
    }

    /// The dispatched implementation is available and has a known name.
    #[test]
    fn test_get_simd() {
        let simd = get_simd();
        assert!(simd.is_available());

        // Check that the name matches expected values
        let name = simd.name();
        assert!(
            name == "scalar" || name == "avx2" || name == "avx512" || name == "neon",
            "Unexpected SIMD name: {}",
            name
        );
    }

    /// The extended dispatcher also returns an available implementation.
    #[test]
    fn test_get_simd_ext() {
        let simd = get_simd_ext();
        assert!(simd.is_available());
    }

    /// The selected transform agrees with the level reported by detection;
    /// both use the same AVX-512 > AVX2 > NEON > scalar priority.
    #[test]
    fn test_select_transform_impl() {
        let expected = match detect_simd().best_level() {
            "avx512" => TransformImpl::Avx512,
            "avx2" => TransformImpl::Avx2,
            "neon" => TransformImpl::Neon,
            _ => TransformImpl::Scalar,
        };

        assert_eq!(select_transform_impl(), expected);
    }

    /// The deprecated wrappers keep working and agree with their replacements.
    #[test]
    #[allow(deprecated)]
    fn test_legacy_accessors() {
        let _scalar: &'static ScalarFallback = scalar_simd();

        assert_eq!(
            detect_capabilities().best_level(),
            detect_simd().best_level()
        );
    }

    /// Vector types and legacy operation accessors are re-exported here.
    #[test]
    fn test_module_reexports() {
        let _v = I16x8::zero();
        let _v = I32x4::zero();
        let _v = U8x16::zero();

        let _ops = sad_ops();
        let _ops = blend_ops();
        let _ops = dct_ops();
        let _ops = filter_ops();
    }

    /// Architecture-specific backend types are constructible where compiled.
    #[test]
    fn test_architecture_specific() {
        let _scalar = ScalarFallback::new();

        #[cfg(target_arch = "x86_64")]
        {
            let _avx2 = Avx2Simd::new();
            let _avx512 = Avx512Simd::new();
        }

        #[cfg(target_arch = "aarch64")]
        {
            let _neon = NeonSimd::new();
        }
    }

    /// Codec-specific wrappers can be built on top of the scalar backend.
    #[test]
    fn test_codec_specific_types() {
        // `ScalarFallback` is already in scope through `use super::*`.
        let simd = ScalarFallback::new();

        // AV1
        let _transform = TransformSimd::new(simd);
        let _loop_filter = LoopFilterSimd::new(simd);
        let _cdef = CdefSimd::new(simd);
        let _intra = IntraPredSimd::new(simd);
        let _motion = MotionCompSimd::new(simd);

        // VP9
        let _vp9_dct = Vp9DctSimd::new(simd);
        let _vp9_interp = Vp9InterpolateSimd::new(simd);
        let _vp9_intra = Vp9IntraPredSimd::new(simd);
        let _vp9_lf = Vp9LoopFilterSimd::new(simd);
    }

    /// SAD of two identical 8x8 blocks is zero.
    #[test]
    fn test_integration_sad() {
        let sad = sad_ops();

        let src = [128u8; 64];
        let ref_block = [128u8; 64];

        let result = sad.sad_8x8(&src, 8, &ref_block, 8);
        assert_eq!(result, 0);
    }

    /// Interpolating 0..255 at weight 128 lands near the midpoint.
    #[test]
    fn test_integration_blend() {
        let blend = blend_ops();

        let result = blend.lerp_u8(0, 255, 128);
        assert!((126..=130).contains(&result));
    }

    /// Forward then inverse 4x4 DCT reproduces the input within +/-2.
    #[test]
    fn test_integration_dct() {
        let dct = dct_ops();

        let input = [100i16; 16];
        let mut dct_out = [0i16; 16];
        let mut reconstructed = [0i16; 16];

        dct.forward_dct_4x4(&input, &mut dct_out);
        dct.inverse_dct_4x4(&dct_out, &mut reconstructed);

        // Should be close to original
        for (i, (&expected, &actual)) in input.iter().zip(reconstructed.iter()).enumerate() {
            let diff = (expected - actual).abs();
            assert!(
                diff <= 2,
                "DCT mismatch at {}: {} vs {}",
                i,
                expected,
                actual
            );
        }
    }

    /// A 2-tap horizontal filter leaves a constant signal unchanged.
    #[test]
    fn test_integration_filter() {
        let filter = filter_ops();

        let src = [128u8; 16];
        let mut dst = [0u8; 15];

        filter.filter_h_2tap(&src, &mut dst, 15);

        // Whole-array comparison prints every output sample on failure.
        assert_eq!(dst, [128u8; 15]);
    }
}
564}