oximedia_codec/simd/
traits.rs

1//! SIMD operation traits for video codec implementations.
2//!
3//! This module defines the core traits for SIMD operations. All implementations
4//! (scalar fallback, SSE, AVX, NEON) implement these traits, allowing codec
5//! code to be written generically.
6//!
7//! # Example
8//!
9//! ```ignore
10//! use oximedia_codec::simd::{SimdOps, get_simd_impl};
11//!
12//! let simd = get_simd_impl();
13//! let sum = simd.horizontal_sum_i16x8(lanes); // `lanes`: an `I16x8` holding 1, 2, 3, 4, 5, 6, 7, 8
14//! ```
15
16#![forbid(unsafe_code)]
17
18use super::types::{I16x8, I32x4, U8x16};
19
20/// Core SIMD operations trait.
21///
22/// Declares the vector primitives (arithmetic, min/max/clamp, horizontal sums, SAD, widening and
23/// narrowing, multiply-add, shifts, averaging) that codec code calls generically; the trait is
24/// intended to map efficiently to hardware SIMD instructions and requires `Send + Sync`.
25pub trait SimdOps: Send + Sync {
26    /// Returns the name of this SIMD implementation as a `'static` string.
27    fn name(&self) -> &'static str;
28
29    /// Reports whether this implementation is available on the current CPU.
30    fn is_available(&self) -> bool;
31
32    // ========================================================================
33    // Vector Arithmetic
34    // ========================================================================
35
36    /// Element-wise addition of two i16x8 vectors. NOTE(review): overflow handling (wrapping vs. saturating) is not stated here — confirm.
37    fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
38
39    /// Element-wise subtraction of two i16x8 vectors (presumably `a[i] - b[i]`; overflow handling is not stated here).
40    fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
41
42    /// Element-wise multiplication of two i16x8 vectors. NOTE(review): how products wider than 16 bits are reduced is not stated here.
43    fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
44
45    /// Element-wise addition of two i32x4 vectors (overflow handling is not stated here).
46    fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4;
47
48    /// Element-wise subtraction of two i32x4 vectors (presumably `a[i] - b[i]`; overflow handling is not stated here).
49    fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4;
50
51    // ========================================================================
52    // Min/Max/Clamp
53    // ========================================================================
54
55    /// Element-wise minimum of two i16x8 vectors.
56    fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
57
58    /// Element-wise maximum of two i16x8 vectors.
59    fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
60
61    /// Element-wise clamp of an i16x8 vector to the scalar bounds `min` and `max`. NOTE(review): behavior for `min > max` is not stated here.
62    fn clamp_i16x8(&self, v: I16x8, min: i16, max: i16) -> I16x8;
63
64    /// Element-wise minimum of two u8x16 vectors.
65    fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16;
66
67    /// Element-wise maximum of two u8x16 vectors.
68    fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16;
69
70    /// Element-wise clamp of a u8x16 vector to the scalar bounds `min` and `max`. NOTE(review): behavior for `min > max` is not stated here.
71    fn clamp_u8x16(&self, v: U8x16, min: u8, max: u8) -> U8x16;
72
73    // ========================================================================
74    // Horizontal Operations
75    // ========================================================================
76
77    /// Horizontal sum of all elements in an i16x8 vector, returned as a single `i32`.
78    fn horizontal_sum_i16x8(&self, v: I16x8) -> i32;
79
80    /// Horizontal sum of all elements in an i32x4 vector, returned as `i32`. NOTE(review): overflow handling of the sum is not stated here.
81    fn horizontal_sum_i32x4(&self, v: I32x4) -> i32;
82
83    // ========================================================================
84    // SAD (Sum of Absolute Differences)
85    // ========================================================================
86
87    /// Sum of absolute differences between two u8x16 vectors, returned as `u32`.
88    ///
89    /// Computes: sum(|a\[i\] - b\[i\]|) for all i
90    fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32;
91
92    /// Sum of absolute differences for 8 bytes taken from slices `a` and `b`. NOTE(review): behavior for slices shorter than 8 is not stated here.
93    fn sad_8(&self, a: &[u8], b: &[u8]) -> u32;
94
95    /// Sum of absolute differences for 16 bytes taken from slices `a` and `b`. NOTE(review): behavior for slices shorter than 16 is not stated here.
96    fn sad_16(&self, a: &[u8], b: &[u8]) -> u32;
97
98    // ========================================================================
99    // Widening/Narrowing
100    // ========================================================================
101
102    /// Widen the low half of a u8x16 to i16x8 (presumably lanes 0..8, zero-extended — TODO confirm lane order).
103    fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8;
104
105    /// Widen the high half of a u8x16 to i16x8 (presumably lanes 8..16, zero-extended — TODO confirm lane order).
106    fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8;
107
108    /// Narrow two i32x4 to one i16x8 with saturation (presumably `low` fills the first four lanes, `high` the last four — TODO confirm).
109    fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8;
110
111    // ========================================================================
112    // Multiply-Add
113    // ========================================================================
114
115    /// Multiply and add: a * b + c for i16x8 (presumably per lane; overflow handling is not stated here).
116    fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8;
117
118    /// Multiply pairs and add adjacent results (pmaddwd equivalent), widening i16x8 inputs to an i32x4 result.
119    ///
120    /// Multiplies pairs of i16 elements and adds adjacent products:
121    /// result\[0\] = a\[0\]*b\[0\] + a\[1\]*b\[1\]
122    /// result\[1\] = a\[2\]*b\[2\] + a\[3\]*b\[3\]
123    /// and so on, i.e. result\[i\] = a\[2i\]*b\[2i\] + a\[2i+1\]*b\[2i+1\].
124    fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4;
125
126    // ========================================================================
127    // Shift Operations
128    // ========================================================================
129
130    /// Arithmetic right shift of i16x8 by `shift` bits. NOTE(review): the result for `shift >= 16` is not stated here — confirm.
131    fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8;
132
133    /// Logical left shift of i16x8 by `shift` bits. NOTE(review): the result for `shift >= 16` is not stated here — confirm.
134    fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8;
135
136    /// Arithmetic right shift of i32x4 by `shift` bits. NOTE(review): the result for `shift >= 32` is not stated here — confirm.
137    fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4;
138
139    /// Logical left shift of i32x4 by `shift` bits. NOTE(review): the result for `shift >= 32` is not stated here — confirm.
140    fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4;
141
142    // ========================================================================
143    // Averaging
144    // ========================================================================
145
146    /// Average of two u8x16 vectors, rounding up (presumably `(a[i] + b[i] + 1) >> 1` per lane — TODO confirm).
147    fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16;
148}
149
150/// Extended SIMD operations (block load/store, transposes, butterflies) layered on top of [`SimdOps`].
151pub trait SimdOpsExt: SimdOps {
152    // ========================================================================
153    // Block Operations
154    // ========================================================================
155
156    /// Load 4 bytes from `src` and zero-extend to i16x8. NOTE(review): contents of the remaining lanes and behavior for `src.len() < 4` are not stated here.
157    fn load4_u8_to_i16x8(&self, src: &[u8]) -> I16x8;
158
159    /// Load 8 bytes from `src` and zero-extend to i16x8. NOTE(review): behavior for `src.len() < 8` is not stated here.
160    fn load8_u8_to_i16x8(&self, src: &[u8]) -> I16x8;
161
162    /// Store the lower 4 elements of an i16x8 into `dst` as saturated u8. NOTE(review): behavior for `dst.len() < 4` is not stated here.
163    fn store4_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]);
164
165    /// Store the lower 8 elements of an i16x8 into `dst` as saturated u8. NOTE(review): behavior for `dst.len() < 8` is not stated here.
166    fn store8_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]);
167
168    // ========================================================================
169    // Transpose Operations (for DCT)
170    // ========================================================================
171
172    /// Transpose a 4x4 block of i16 values, borrowing the input rows and returning a new array.
173    ///
174    /// Input: 4 rows stored in 4 I16x8 vectors (only lower 4 elements used)
175    /// Output: Transposed 4x4 block (contents of the upper 4 lanes of each output vector are not stated here)
176    fn transpose_4x4_i16(&self, rows: &[I16x8; 4]) -> [I16x8; 4];
177
178    /// Transpose an 8x8 block of i16 values given as 8 row vectors, returning the 8 transposed rows.
179    fn transpose_8x8_i16(&self, rows: &[I16x8; 8]) -> [I16x8; 8];
180
181    // ========================================================================
182    // Butterfly Operations (for DCT)
183    // ========================================================================
184
185    /// DCT butterfly: returns the pair (a + b, a - b), in that order.
186    fn butterfly_i16x8(&self, a: I16x8, b: I16x8) -> (I16x8, I16x8);
187
188    /// DCT butterfly for i32x4 (presumably also (a + b, a - b) — TODO confirm ordering).
189    fn butterfly_i32x4(&self, a: I32x4, b: I32x4) -> (I32x4, I32x4);
190}
191
192/// Trait for selecting a SIMD implementation at runtime; both methods hand back trait objects borrowed from `self`.
193pub trait SimdSelector {
194    /// Get the best available SIMD implementation for this CPU, as a `&dyn SimdOps`.
195    fn select(&self) -> &dyn SimdOps;
196
197    /// Get the best available extended SIMD implementation, as a `&dyn SimdOpsExt`.
198    fn select_ext(&self) -> &dyn SimdOpsExt;
199}
200
201#[cfg(test)]
202mod tests {
203    use super::*;
204
205    // Compile-time helper: instantiating it for `T` only type-checks when `T: Send + Sync`.
206    fn _assert_send_sync<T: Send + Sync>() {}
207
208    #[test]
209    fn test_trait_bounds() {
210        #[allow(dead_code)] // only reached from `_check_bounds`, which is itself never called
211        #[allow(clippy::used_underscore_items)] // deliberately calls the underscore-prefixed helper
212        fn assert_simd_ops<T: SimdOps>() {
213            _assert_send_sync::<T>();
214        }
215
216        // Never invoked: it only has to compile, which shows every `T: SimdOps` is also `Send + Sync`.
217        fn _check_bounds<T: SimdOps>(_t: &T) {
218            assert_simd_ops::<T>();
219        }
220    }
221}
oximedia_codec/simd/traits.rs

oximedia_codec/simd/
traits.rs