oximedia_codec/simd/traits.rs
1//! SIMD operation traits for video codec implementations.
2//!
3//! This module defines the core traits for SIMD operations. All implementations
4//! (scalar fallback, SSE, AVX, NEON) implement these traits, allowing codec
5//! code to be written generically.
6//!
7//! # Example
8//!
9//! ```ignore
10//! use oximedia_codec::simd::{SimdOps, get_simd_impl};
11//!
12//! let simd = get_simd_impl();
//! // `v` is an `I16x8`; `SimdOps` declares `horizontal_sum_i16x8`, which takes the vector by value.
//! let sum = simd.horizontal_sum_i16x8(v);
14//! ```
15
16#![forbid(unsafe_code)]
17
18use super::types::{I16x8, I32x4, U8x16};
19
20/// Core SIMD operations trait.
21///
22/// This trait defines the fundamental SIMD operations needed for video codec
23/// implementations. All operations are designed to map efficiently to
24/// hardware SIMD instructions.
25pub trait SimdOps: Send + Sync {
26 /// Get the name of this SIMD implementation.
27 fn name(&self) -> &'static str;
28
29 /// Check if this implementation is available on the current CPU.
30 fn is_available(&self) -> bool;
31
32 // ========================================================================
33 // Vector Arithmetic
34 // ========================================================================
35
36 /// Element-wise addition of two i16x8 vectors.
37 fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
38
39 /// Element-wise subtraction of two i16x8 vectors.
40 fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
41
42 /// Element-wise multiplication of two i16x8 vectors.
43 fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
44
45 /// Element-wise addition of two i32x4 vectors.
46 fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4;
47
48 /// Element-wise subtraction of two i32x4 vectors.
49 fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4;
50
51 // ========================================================================
52 // Min/Max/Clamp
53 // ========================================================================
54
55 /// Element-wise minimum of two i16x8 vectors.
56 fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
57
58 /// Element-wise maximum of two i16x8 vectors.
59 fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8;
60
61 /// Element-wise clamp of i16x8 vector.
62 fn clamp_i16x8(&self, v: I16x8, min: i16, max: i16) -> I16x8;
63
64 /// Element-wise minimum of two u8x16 vectors.
65 fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16;
66
67 /// Element-wise maximum of two u8x16 vectors.
68 fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16;
69
70 /// Element-wise clamp of u8x16 vector.
71 fn clamp_u8x16(&self, v: U8x16, min: u8, max: u8) -> U8x16;
72
73 // ========================================================================
74 // Horizontal Operations
75 // ========================================================================
76
77 /// Horizontal sum of all elements in an i16x8 vector.
78 fn horizontal_sum_i16x8(&self, v: I16x8) -> i32;
79
80 /// Horizontal sum of all elements in an i32x4 vector.
81 fn horizontal_sum_i32x4(&self, v: I32x4) -> i32;
82
83 // ========================================================================
84 // SAD (Sum of Absolute Differences)
85 // ========================================================================
86
87 /// Sum of absolute differences between two u8x16 vectors.
88 ///
89 /// Computes: sum(|a\[i\] - b\[i\]|) for all i
90 fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32;
91
92 /// Sum of absolute differences for 8 bytes.
93 fn sad_8(&self, a: &[u8], b: &[u8]) -> u32;
94
95 /// Sum of absolute differences for 16 bytes.
96 fn sad_16(&self, a: &[u8], b: &[u8]) -> u32;
97
98 // ========================================================================
99 // Widening/Narrowing
100 // ========================================================================
101
102 /// Widen u8x16 low half to i16x8.
103 fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8;
104
105 /// Widen u8x16 high half to i16x8.
106 fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8;
107
108 /// Narrow two i32x4 to i16x8 with saturation.
109 fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8;
110
111 // ========================================================================
112 // Multiply-Add
113 // ========================================================================
114
115 /// Multiply and add: a * b + c for i16x8.
116 fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8;
117
118 /// Multiply pairs and add adjacent results (pmaddwd equivalent).
119 ///
120 /// Multiplies pairs of i16 elements and adds adjacent products:
121 /// result\[0\] = a\[0\]*b\[0\] + a\[1\]*b\[1\]
122 /// result\[1\] = a\[2\]*b\[2\] + a\[3\]*b\[3\]
123 /// etc.
124 fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4;
125
126 // ========================================================================
127 // Shift Operations
128 // ========================================================================
129
130 /// Arithmetic right shift of i16x8 by immediate.
131 fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8;
132
133 /// Logical left shift of i16x8 by immediate.
134 fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8;
135
136 /// Arithmetic right shift of i32x4 by immediate.
137 fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4;
138
139 /// Logical left shift of i32x4 by immediate.
140 fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4;
141
142 // ========================================================================
143 // Averaging
144 // ========================================================================
145
146 /// Average of two u8x16 vectors (rounding up).
147 fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16;
148}
149
150/// Extended SIMD operations for more complex codec operations.
151pub trait SimdOpsExt: SimdOps {
152 // ========================================================================
153 // Block Operations
154 // ========================================================================
155
156 /// Load 4 bytes from memory and zero-extend to i16x8.
157 fn load4_u8_to_i16x8(&self, src: &[u8]) -> I16x8;
158
159 /// Load 8 bytes from memory and zero-extend to i16x8.
160 fn load8_u8_to_i16x8(&self, src: &[u8]) -> I16x8;
161
162 /// Store lower 4 elements of i16x8 to memory as saturated u8.
163 fn store4_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]);
164
165 /// Store lower 8 elements of i16x8 to memory as saturated u8.
166 fn store8_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]);
167
168 // ========================================================================
169 // Transpose Operations (for DCT)
170 // ========================================================================
171
172 /// Transpose 4x4 block of i16 values.
173 ///
174 /// Input: 4 rows stored in 4 I16x8 vectors (only lower 4 elements used)
175 /// Output: Transposed 4x4 block
176 fn transpose_4x4_i16(&self, rows: &[I16x8; 4]) -> [I16x8; 4];
177
178 /// Transpose 8x8 block of i16 values.
179 fn transpose_8x8_i16(&self, rows: &[I16x8; 8]) -> [I16x8; 8];
180
181 // ========================================================================
182 // Butterfly Operations (for DCT)
183 // ========================================================================
184
185 /// DCT butterfly: (a + b, a - b).
186 fn butterfly_i16x8(&self, a: I16x8, b: I16x8) -> (I16x8, I16x8);
187
188 /// DCT butterfly for i32x4.
189 fn butterfly_i32x4(&self, a: I32x4, b: I32x4) -> (I32x4, I32x4);
190}
191
192/// Trait for selecting SIMD implementation at runtime.
193pub trait SimdSelector {
194 /// Get the best available SIMD implementation for this CPU.
195 fn select(&self) -> &dyn SimdOps;
196
197 /// Get the best available extended SIMD implementation.
198 fn select_ext(&self) -> &dyn SimdOpsExt;
199}
200
#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time helper: can only be instantiated for `Send + Sync` types.
    fn _assert_send_sync<T: Send + Sync>() {}

    /// Verifies at compile time that every `SimdOps` implementor is also
    /// `Send + Sync`. Nothing executes at runtime; the nested generic
    /// functions only have to type-check.
    #[test]
    fn test_trait_bounds() {
        // `dead_code`: only reachable from the never-called checker below.
        // `used_underscore_items`: deliberately calls the `_`-prefixed helper.
        #[allow(dead_code, clippy::used_underscore_items)]
        fn require_simd_ops<T>()
        where
            T: SimdOps,
        {
            _assert_send_sync::<T>();
        }

        // Never invoked: it exists so the bound chain above gets type-checked.
        fn _verify_bounds<T>(_t: &T)
        where
            T: SimdOps,
        {
            require_simd_ops::<T>();
        }
    }
}