oximedia_gpu/backend/
cpu.rs

1//! CPU SIMD fallback backend
2//!
//! This module provides a CPU-based fallback implementation using SIMD,
//! for use when GPU compute is not available or for small workloads where
//! running on the CPU might be faster than paying the GPU dispatch overhead.
6
7use super::{Backend, BackendCapabilities, BackendType};
8use crate::Result;
9use rayon::prelude::*;
10
11/// CPU backend using SIMD and multi-threading
12pub struct CpuBackend {
13    capabilities: BackendCapabilities,
14    num_threads: usize,
15}
16
17impl CpuBackend {
18    /// Create a new CPU backend
19    pub fn new() -> Result<Self> {
20        let num_threads = rayon::current_num_threads();
21
22        let capabilities = BackendCapabilities {
23            backend_type: BackendType::CPU,
24            max_workgroup_size: (1, 1, 1), // CPU doesn't use workgroups
25            max_workgroup_invocations: 1,
26            max_buffer_size: usize::MAX as u64,
27            compute_shaders: false,
28            subgroups: false,
29            push_constants: false,
30        };
31
32        Ok(Self {
33            capabilities,
34            num_threads,
35        })
36    }
37
38    /// Get the number of CPU threads
39    #[must_use]
40    pub fn num_threads(&self) -> usize {
41        self.num_threads
42    }
43
44    /// RGB to YUV conversion (BT.601) using CPU SIMD
45    pub fn rgb_to_yuv_bt601(input: &[u8], output: &mut [u8], width: usize, height: usize) {
46        const KR: f32 = 0.299;
47        const KB: f32 = 0.114;
48        const KG: f32 = 0.587;
49
50        let pixels = width * height;
51        output
52            .par_chunks_exact_mut(4)
53            .zip(input.par_chunks_exact(4))
54            .take(pixels)
55            .for_each(|(out, inp)| {
56                let r = f32::from(inp[0]) / 255.0;
57                let g = f32::from(inp[1]) / 255.0;
58                let b = f32::from(inp[2]) / 255.0;
59                let a = inp[3];
60
61                let y = KR * r + KG * g + KB * b;
62                let u = (b - y) / (2.0 * (1.0 - KB)) + 0.5;
63                let v = (r - y) / (2.0 * (1.0 - KR)) + 0.5;
64
65                out[0] = (y.clamp(0.0, 1.0) * 255.0) as u8;
66                out[1] = (u.clamp(0.0, 1.0) * 255.0) as u8;
67                out[2] = (v.clamp(0.0, 1.0) * 255.0) as u8;
68                out[3] = a;
69            });
70    }
71
72    /// YUV to RGB conversion (BT.601) using CPU SIMD
73    pub fn yuv_to_rgb_bt601(input: &[u8], output: &mut [u8], width: usize, height: usize) {
74        const KR: f32 = 0.299;
75        const KB: f32 = 0.114;
76        const KG: f32 = 0.587;
77
78        let pixels = width * height;
79        output
80            .par_chunks_exact_mut(4)
81            .zip(input.par_chunks_exact(4))
82            .take(pixels)
83            .for_each(|(out, inp)| {
84                let y = f32::from(inp[0]) / 255.0;
85                let u = f32::from(inp[1]) / 255.0 - 0.5;
86                let v = f32::from(inp[2]) / 255.0 - 0.5;
87                let a = inp[3];
88
89                let r = y + 2.0 * (1.0 - KR) * v;
90                let b = y + 2.0 * (1.0 - KB) * u;
91                let g = (y - KR * r - KB * b) / KG;
92
93                out[0] = (r.clamp(0.0, 1.0) * 255.0) as u8;
94                out[1] = (g.clamp(0.0, 1.0) * 255.0) as u8;
95                out[2] = (b.clamp(0.0, 1.0) * 255.0) as u8;
96                out[3] = a;
97            });
98    }
99
100    /// Bilinear image resize using CPU
101    #[allow(clippy::too_many_arguments)]
102    pub fn resize_bilinear(
103        input: &[u8],
104        src_width: usize,
105        src_height: usize,
106        output: &mut [u8],
107        dst_width: usize,
108        dst_height: usize,
109    ) {
110        let x_ratio = src_width as f32 / dst_width as f32;
111        let y_ratio = src_height as f32 / dst_height as f32;
112
113        output
114            .par_chunks_exact_mut(4)
115            .enumerate()
116            .for_each(|(i, pixel)| {
117                let dst_x = i % dst_width;
118                let dst_y = i / dst_width;
119
120                if dst_y >= dst_height {
121                    return;
122                }
123
124                let src_x = (dst_x as f32 + 0.5) * x_ratio - 0.5;
125                let src_y = (dst_y as f32 + 0.5) * y_ratio - 0.5;
126
127                let x0 = src_x.floor().max(0.0) as usize;
128                let y0 = src_y.floor().max(0.0) as usize;
129                let x1 = (x0 + 1).min(src_width - 1);
130                let y1 = (y0 + 1).min(src_height - 1);
131
132                let fx = src_x.fract();
133                let fy = src_y.fract();
134
135                for c in 0..4 {
136                    let p00 = input[(y0 * src_width + x0) * 4 + c];
137                    let p10 = input[(y0 * src_width + x1) * 4 + c];
138                    let p01 = input[(y1 * src_width + x0) * 4 + c];
139                    let p11 = input[(y1 * src_width + x1) * 4 + c];
140
141                    let v0 = f32::from(p00) * (1.0 - fx) + f32::from(p10) * fx;
142                    let v1 = f32::from(p01) * (1.0 - fx) + f32::from(p11) * fx;
143                    let v = v0 * (1.0 - fy) + v1 * fy;
144
145                    pixel[c] = v.round().clamp(0.0, 255.0) as u8;
146                }
147            });
148    }
149
150    /// Gaussian blur using CPU
151    pub fn gaussian_blur(input: &[u8], output: &mut [u8], width: usize, height: usize, sigma: f32) {
152        let kernel_radius = (3.0 * sigma).ceil() as i32;
153        let kernel_size = (2 * kernel_radius + 1) as usize;
154
155        // Generate Gaussian kernel
156        let mut kernel = vec![0.0f32; kernel_size];
157        let mut sum = 0.0f32;
158        let two_sigma_sq = 2.0 * sigma * sigma;
159
160        for i in 0..kernel_size {
161            let x = i as i32 - kernel_radius;
162            let value = (-(x * x) as f32 / two_sigma_sq).exp();
163            kernel[i] = value;
164            sum += value;
165        }
166
167        // Normalize kernel
168        for value in &mut kernel {
169            *value /= sum;
170        }
171
172        // Temporary buffer for horizontal pass
173        let mut temp = vec![0u8; input.len()];
174
175        // Horizontal pass
176        temp.par_chunks_exact_mut(4)
177            .enumerate()
178            .for_each(|(i, pixel)| {
179                let x = i % width;
180                let y = i / width;
181
182                if y >= height {
183                    return;
184                }
185
186                for c in 0..4 {
187                    let mut value = 0.0f32;
188
189                    for k in 0..kernel_size {
190                        let offset = k as i32 - kernel_radius;
191                        let sample_x = (x as i32 + offset).clamp(0, width as i32 - 1) as usize;
192                        let idx = (y * width + sample_x) * 4 + c;
193                        value += f32::from(input[idx]) * kernel[k];
194                    }
195
196                    pixel[c] = value.round().clamp(0.0, 255.0) as u8;
197                }
198            });
199
200        // Vertical pass
201        output
202            .par_chunks_exact_mut(4)
203            .enumerate()
204            .for_each(|(i, pixel)| {
205                let x = i % width;
206                let y = i / width;
207
208                if y >= height {
209                    return;
210                }
211
212                for c in 0..4 {
213                    let mut value = 0.0f32;
214
215                    for k in 0..kernel_size {
216                        let offset = k as i32 - kernel_radius;
217                        let sample_y = (y as i32 + offset).clamp(0, height as i32 - 1) as usize;
218                        let idx = (sample_y * width + x) * 4 + c;
219                        value += f32::from(temp[idx]) * kernel[k];
220                    }
221
222                    pixel[c] = value.round().clamp(0.0, 255.0) as u8;
223                }
224            });
225    }
226
227    /// Check if CPU SIMD is available
228    #[must_use]
229    pub fn has_simd() -> bool {
230        // Check for various SIMD instruction sets
231        #[cfg(target_arch = "x86_64")]
232        {
233            is_x86_feature_detected!("avx2") || is_x86_feature_detected!("sse4.2")
234        }
235        #[cfg(target_arch = "aarch64")]
236        {
237            // NEON is standard on aarch64
238            true
239        }
240        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
241        {
242            false
243        }
244    }
245}
246
247impl Backend for CpuBackend {
248    fn capabilities(&self) -> &BackendCapabilities {
249        &self.capabilities
250    }
251
252    fn is_available() -> bool {
253        // CPU backend is always available
254        true
255    }
256
257    fn initialize() -> Result<Self> {
258        Self::new()
259    }
260}
261
262impl Default for CpuBackend {
263    /// Creates a CPU backend with default settings.
264    ///
265    /// # Panics
266    ///
267    /// Panics if CPU backend initialization fails. Prefer
268    /// [`CpuBackend::new()`] for fallible construction.
269    fn default() -> Self {
270        match Self::new() {
271            Ok(backend) => backend,
272            Err(e) => panic!("Failed to initialize CPU backend: {e}"),
273        }
274    }
275}
276
#[cfg(test)]
mod tests {
    use super::*;

    /// The CPU backend must report itself as available unconditionally.
    #[test]
    fn test_cpu_backend_always_available() {
        assert!(CpuBackend::is_available());
    }

    /// Construction succeeds and reports a CPU backend with at least one thread.
    #[test]
    fn test_cpu_backend_creation() {
        let backend = CpuBackend::new().expect("CPU backend creation should succeed");
        assert!(backend.num_threads() > 0);
        assert_eq!(backend.capabilities().backend_type, BackendType::CPU);
    }

    /// SIMD detection is stable across calls and is `true` on aarch64.
    #[test]
    fn test_simd_detection() {
        let has_simd = CpuBackend::has_simd();
        println!("SIMD available: {has_simd}");

        assert_eq!(has_simd, CpuBackend::has_simd());
        #[cfg(target_arch = "aarch64")]
        assert!(has_simd);
    }

    /// A pure red pixel maps to low luma, high V, and keeps its alpha.
    #[test]
    fn test_rgb_to_yuv_cpu() {
        let input = vec![255, 0, 0, 255]; // Red pixel
        let mut output = vec![0u8; 4];

        CpuBackend::rgb_to_yuv_bt601(&input, &mut output, 1, 1);

        // Y should be around 76 (0.299 * 255)
        assert!(output[0] > 70 && output[0] < 80);
        // Red has a strongly positive R-Y difference, so V sits near the top.
        assert!(output[2] > 200);
        // Alpha is passed through untouched.
        assert_eq!(output[3], 255);
    }

    /// Converting a mid-grey pixel to YUV and back stays within a few codes.
    #[test]
    fn test_rgb_yuv_round_trip_cpu() {
        let input = vec![128u8, 128, 128, 200];
        let mut yuv = vec![0u8; 4];
        let mut rgb = vec![0u8; 4];

        CpuBackend::rgb_to_yuv_bt601(&input, &mut yuv, 1, 1);
        CpuBackend::yuv_to_rgb_bt601(&yuv, &mut rgb, 1, 1);

        for (original, restored) in input.iter().zip(&rgb).take(3) {
            let diff = (i16::from(*original) - i16::from(*restored)).abs();
            assert!(diff <= 3, "channel drifted by {diff}");
        }
        assert_eq!(rgb[3], 200);
    }

    /// Upscaling a uniformly white image must produce a uniformly white image.
    #[test]
    fn test_resize_bilinear_cpu() {
        let input = vec![255u8; 2 * 2 * 4]; // 2x2 white image
        let mut output = vec![0u8; 4 * 4 * 4]; // 4x4 output

        CpuBackend::resize_bilinear(&input, 2, 2, &mut output, 4, 4);

        assert!(output.iter().all(|&byte| byte == 255));
    }
}
oximedia_gpu/backend/cpu.rs

oximedia_gpu/backend/
cpu.rs