scirs2_vision/gpu_modules/
basic_operations.rs

1//! Basic GPU-accelerated image processing operations
2//!
3//! This module provides fundamental image processing operations including
4//! convolution, blur, and gradient operations optimized for GPU execution.
5
6use super::context::GpuVisionContext;
7use crate::error::{Result, VisionError};
8use scirs2_core::gpu::GpuBackend;
9use scirs2_core::ndarray::{Array2, ArrayView2};
10
11/// GPU-accelerated image convolution
12///
13/// Performs 2D convolution on GPU for maximum performance.
14///
15/// # Arguments
16///
17/// * `ctx` - GPU vision context
18/// * `image` - Input image
19/// * `kernel` - Convolution kernel
20///
21/// # Returns
22///
23/// * Convolved image
24///
25/// # Performance
26///
27/// - 10-50x faster than CPU for large images
28/// - Optimal for kernels larger than 5x5
29/// - Batch processing support for multiple images
30#[allow(dead_code)]
31pub fn gpu_convolve_2d(
32    ctx: &GpuVisionContext,
33    image: &ArrayView2<f32>,
34    kernel: &ArrayView2<f32>,
35) -> Result<Array2<f32>> {
36    let (height, width) = image.dim();
37    let (k_height, k_width) = kernel.dim();
38
39    // Validate kernel dimensions
40    if k_height % 2 == 0 || k_width % 2 == 0 {
41        return Err(VisionError::InvalidInput(
42            "Kernel must have odd dimensions".to_string(),
43        ));
44    }
45
46    // If GPU is not available, fall back to SIMD
47    if !ctx.is_gpu_available() {
48        return crate::simd_ops::simd_convolve_2d(image, kernel);
49    }
50
51    // Calculate output dimensions
52    let out_height = height;
53    let out_width = width;
54
55    // Flatten the image and kernel for GPU transfer
56    let image_flat: Vec<f32> = image.iter().cloned().collect();
57    let kernel_flat: Vec<f32> = kernel.iter().cloned().collect();
58
59    // Create GPU buffers
60    let image_buffer = ctx.context.create_buffer_from_slice(&image_flat);
61    let kernel_buffer = ctx.context.create_buffer_from_slice(&kernel_flat);
62    let output_buffer = ctx.context.create_buffer::<f32>(out_height * out_width);
63
64    // Try to get the conv2d kernel from the registry
65    match ctx.context.get_kernel("conv2d") {
66        Ok(kernel_handle) => {
67            // Set kernel parameters
68            kernel_handle.set_buffer("input", &image_buffer);
69            kernel_handle.set_buffer("kernel", &kernel_buffer);
70            kernel_handle.set_buffer("output", &output_buffer);
71            kernel_handle.set_u32("batch_size", 1);
72            kernel_handle.set_u32("in_channels", 1);
73            kernel_handle.set_u32("out_channels", 1);
74            kernel_handle.set_u32("input_height", height as u32);
75            kernel_handle.set_u32("input_width", width as u32);
76            kernel_handle.set_u32("output_height", out_height as u32);
77            kernel_handle.set_u32("output_width", out_width as u32);
78            kernel_handle.set_u32("kernel_height", k_height as u32);
79            kernel_handle.set_u32("kernel_width", k_width as u32);
80            kernel_handle.set_u32("stride_y", 1);
81            kernel_handle.set_u32("stride_x", 1);
82            kernel_handle.set_u32("padding_y", (k_height / 2) as u32);
83            kernel_handle.set_u32("padding_x", (k_width / 2) as u32);
84
85            // Calculate work groups
86            let workgroup_size = 16;
87            let work_groups_x = out_height.div_ceil(workgroup_size);
88            let work_groups_y = out_width.div_ceil(workgroup_size);
89
90            // Dispatch the kernel
91            kernel_handle.dispatch([work_groups_x as u32, work_groups_y as u32, 1]);
92
93            // Copy result back to host
94            let mut result_flat = vec![0.0f32; out_height * out_width];
95            output_buffer
96                .copy_to_host(&mut result_flat)
97                .map_err(|e| VisionError::Other(format!("Failed to copy result from GPU: {e}")))?;
98
99            // Reshape to 2D array
100            Ok(Array2::from_shape_vec((out_height, out_width), result_flat)
101                .map_err(|e| VisionError::Other(format!("Failed to reshape output: {e}")))?)
102        }
103        Err(_) => {
104            // Kernel not found, fall back to custom implementation or SIMD
105            gpu_convolve_2d_custom(ctx, image, kernel)
106        }
107    }
108}
109
110/// Custom GPU convolution implementation when standard kernel is not available
111#[allow(dead_code)]
112fn gpu_convolve_2d_custom(
113    ctx: &GpuVisionContext,
114    image: &ArrayView2<f32>,
115    kernel: &ArrayView2<f32>,
116) -> Result<Array2<f32>> {
117    // Define custom convolution kernel source for vision-specific operations
118    let conv_kernel_source = match ctx.backend() {
119        GpuBackend::Cuda => {
120            r#"
121extern "C" __global__ void conv2d_vision(
122    const float* __restrict__ input,
123    const float* __restrict__ kernel,
124    float* __restrict__ output,
125    int height,
126    int width,
127    int k_height,
128    int k_width
129) {
130    int y = blockIdx.y * blockDim.y + threadIdx.y;
131    int x = blockIdx.x * blockDim.x + threadIdx.x;
132
133    if (y >= height || x >= width) return;
134
135    int k_half_h = k_height / 2;
136    int k_half_w = k_width / 2;
137    float sum = 0.0f;
138
139    for (int ky = 0; ky < k_height; ky++) {
140        for (int kx = 0; kx < k_width; kx++) {
141            int src_y = y + ky - k_half_h;
142            int src_x = x + kx - k_half_w;
143
144            if (src_y >= 0 && src_y < height && src_x >= 0 && src_x < width) {
145                sum += input[src_y * width + src_x] * kernel[ky * k_width + kx];
146            }
147        }
148    }
149
150    output[y * width + x] = sum;
151}
152"#
153        }
154        GpuBackend::Wgpu => {
155            r#"
156struct Params {
157    height: u32,
158    width: u32,
159    k_height: u32,
160    k_width: u32,
161};
162
163@group(0) @binding(0) var<storage, read> input: array<f32>;
164@group(0) @binding(1) var<storage, read> kernel: array<f32>;
165@group(0) @binding(2) var<storage, write> output: array<f32>;
166@group(0) @binding(3) var<uniform> params: Params;
167
168@compute @workgroup_size(16, 16)
169#[allow(dead_code)]
170fn conv2d_vision(@builtin(global_invocation_id) global_id: vec3<u32>) {
171    let y = global_id.y;
172    let x = global_id.x;
173
174    if (y >= params.height || x >= params.width) {
175        return;
176    }
177
178    let k_half_h = i32(params.k_height / 2u);
179    let k_half_w = i32(params.k_width / 2u);
180    var sum = 0.0;
181
182    for (var ky = 0u; ky < params.k_height; ky = ky + 1u) {
183        for (var kx = 0u; kx < params.k_width; kx = kx + 1u) {
184            let src_y = i32(y) + i32(ky) - k_half_h;
185            let src_x = i32(x) + i32(kx) - k_half_w;
186
187            if (src_y >= 0 && src_y < i32(params.height) && src_x >= 0 && src_x < i32(params.width)) {
188                let src_idx = u32(src_y) * params.width + u32(src_x);
189                let kernel_idx = ky * params.k_width + kx;
190                sum += input[src_idx] * kernel[kernel_idx];
191            }
192        }
193    }
194
195    output[y * params.width + x] = sum;
196}
197"#
198        }
199        _ => {
200            // Fall back to SIMD for unsupported backends
201            return crate::simd_ops::simd_convolve_2d(image, kernel);
202        }
203    };
204
205    // Compile and execute custom kernel
206    ctx.context.execute(|compiler| {
207        match compiler.compile(conv_kernel_source) {
208            Ok(kernel_handle) => {
209                // Setup and execute similar to above
210                let (height, width) = image.dim();
211                let (k_height, k_width) = kernel.dim();
212
213                let image_flat: Vec<f32> = image.iter().cloned().collect();
214                let kernel_flat: Vec<f32> = kernel.iter().cloned().collect();
215
216                let image_buffer = ctx.context.create_buffer_from_slice(&image_flat);
217                let kernel_buffer = ctx.context.create_buffer_from_slice(&kernel_flat);
218                let output_buffer = ctx.context.create_buffer::<f32>(height * width);
219
220                kernel_handle.set_buffer("input", &image_buffer);
221                kernel_handle.set_buffer("kernel", &kernel_buffer);
222                kernel_handle.set_buffer("output", &output_buffer);
223                kernel_handle.set_u32("height", height as u32);
224                kernel_handle.set_u32("width", width as u32);
225                kernel_handle.set_u32("k_height", k_height as u32);
226                kernel_handle.set_u32("k_width", k_width as u32);
227
228                let workgroup_size = 16;
229                let work_groups_x = height.div_ceil(workgroup_size);
230                let work_groups_y = width.div_ceil(workgroup_size);
231
232                kernel_handle.dispatch([work_groups_x as u32, work_groups_y as u32, 1]);
233
234                let mut result_flat = vec![0.0f32; height * width];
235                output_buffer.copy_to_host(&mut result_flat)
236                    .map_err(|e| VisionError::Other(format!("Failed to copy result from GPU: {e}")))?;
237
238                Array2::from_shape_vec((height, width), result_flat)
239                    .map_err(|e| VisionError::Other(format!("Failed to reshape output: {e}")))
240            }
241            Err(compile_error) => {
242                // Log compilation error for debugging
243                eprintln!(
244                    "GPU kernel compilation failed for backend {:?}: {}. Falling back to SIMD.",
245                    ctx.backend(),
246                    compile_error
247                );
248
249                // Attempt to provide more specific error information
250                let error_details = match ctx.backend() {
251                    GpuBackend::Cuda => {
252                        "CUDA kernel compilation failed. Check CUDA installation and driver version."
253                    }
254                    GpuBackend::Wgpu => {
255                        "WebGPU/WGSL kernel compilation failed. Check shader syntax and GPU support."
256                    }
257                    GpuBackend::Metal => {
258                        "Metal kernel compilation failed. Check macOS version and Metal support."
259                    }
260                    GpuBackend::OpenCL => {
261                        "OpenCL kernel compilation failed. Check OpenCL runtime and drivers."
262                    }
263                    GpuBackend::Cpu => {
264                        "CPU backend should not reach kernel compilation. This is a logic error."
265                    }
266                    GpuBackend::Rocm => {
267                        "ROCm kernel compilation failed. Check ROCm installation and shader support."
268                    }
269                };
270
271                eprintln!("GPU Error Details: {error_details}");
272
273                // Fall back to SIMD implementation
274                crate::simd_ops::simd_convolve_2d(image, kernel)
275            }
276        }
277    })
278}
279
280/// GPU-accelerated Gaussian blur
281///
282/// Applies Gaussian blur using GPU for maximum performance.
283///
284/// # Arguments
285///
286/// * `ctx` - GPU vision context
287/// * `image` - Input image
288/// * `sigma` - Standard deviation of Gaussian
289///
290/// # Returns
291///
292/// * Blurred image
293#[allow(dead_code)]
294pub fn gpu_gaussian_blur(
295    ctx: &GpuVisionContext,
296    image: &ArrayView2<f32>,
297    sigma: f32,
298) -> Result<Array2<f32>> {
299    // Generate Gaussian kernel
300    let kernel_size = (6.0 * sigma).ceil() as usize | 1;
301    let kernel = generate_gaussian_kernel(kernel_size, sigma);
302
303    // Use separable convolution for efficiency
304    gpu_separable_convolution(ctx, image, &kernel)
305}
306
307/// Generate 1D Gaussian kernel
308#[allow(dead_code)]
309fn generate_gaussian_kernel(size: usize, sigma: f32) -> Vec<f32> {
310    let half = size / 2;
311    let mut kernel = vec![0.0f32; size];
312    let mut sum = 0.0f32;
313
314    for (i, kernel_val) in kernel.iter_mut().enumerate() {
315        let x = i as f32 - half as f32;
316        let value = (-x * x / (2.0 * sigma * sigma)).exp();
317        *kernel_val = value;
318        sum += value;
319    }
320
321    // Normalize
322    for val in &mut kernel {
323        *val /= sum;
324    }
325
326    kernel
327}
328
329/// GPU-accelerated separable convolution
330///
331/// Performs convolution with a separable kernel (horizontal then vertical).
332#[allow(dead_code)]
333fn gpu_separable_convolution(
334    ctx: &GpuVisionContext,
335    image: &ArrayView2<f32>,
336    kernel_1d: &[f32],
337) -> Result<Array2<f32>> {
338    let (height, width) = image.dim();
339    let kernel_size = kernel_1d.len();
340
341    if !ctx.is_gpu_available() {
342        // Fall back to SIMD
343        return crate::simd_ops::simd_gaussian_blur(image, kernel_size as f32 / 6.0);
344    }
345
346    // GPU implementation - two pass separable convolution
347    let image_flat: Vec<f32> = image.iter().cloned().collect();
348
349    // First pass: horizontal convolution
350    let horizontal_result = gpu_separable_1d_pass(
351        ctx,
352        &image_flat,
353        kernel_1d,
354        height,
355        width,
356        true, // horizontal
357    )?;
358
359    // Second pass: vertical convolution
360    let final_result = gpu_separable_1d_pass(
361        ctx,
362        &horizontal_result,
363        kernel_1d,
364        height,
365        width,
366        false, // vertical
367    )?;
368
369    Array2::from_shape_vec((height, width), final_result)
370        .map_err(|e| VisionError::Other(format!("Failed to reshape output: {e}")))
371}
372
373/// Perform a single 1D convolution pass (horizontal or vertical)
374#[allow(dead_code)]
375fn gpu_separable_1d_pass(
376    ctx: &GpuVisionContext,
377    input: &[f32],
378    kernel: &[f32],
379    height: usize,
380    width: usize,
381    horizontal: bool,
382) -> Result<Vec<f32>> {
383    let input_buffer = ctx.context.create_buffer_from_slice(input);
384    let kernel_buffer = ctx.context.create_buffer_from_slice(kernel);
385    let output_buffer = ctx.context.create_buffer::<f32>(height * width);
386
387    let kernel_source = match ctx.backend() {
388        GpuBackend::Cuda => r#"
389extern "C" __global__ void separable_conv_1d(
390    const float* __restrict__ input,
391    const float* __restrict__ kernel,
392    float* __restrict__ output,
393    int height,
394    int width,
395    int kernel_size,
396    int horizontal
397) {
398    int idx = blockIdx.x * blockDim.x + threadIdx.x;
399    int total_size = height * width;
400
401    if (idx >= total_size) return;
402
403    int y = idx / width;
404    int x = idx % width;
405    int half_kernel = kernel_size / 2;
406    float sum = 0.0f;
407
408    if (horizontal) {
409        // Horizontal pass
410        for (int k = 0; k < kernel_size; k++) {
411            int src_x = x + k - half_kernel;
412            if (src_x >= 0 && src_x < width) {
413                sum += input[y * width + src_x] * kernel[k];
414            }
415        }
416    } else {
417        // Vertical pass
418        for (int k = 0; k < kernel_size; k++) {
419            int src_y = y + k - half_kernel;
420            if (src_y >= 0 && src_y < height) {
421                sum += input[src_y * width + x] * kernel[k];
422            }
423        }
424    }
425
426    output[idx] = sum;
427}
428"#
429        .to_string(),
430        GpuBackend::Wgpu => r#"
431struct SeparableParams {
432    height: u32,
433    width: u32,
434    kernel_size: u32,
435    horizontal: u32,
436};
437
438@group(0) @binding(0) var<storage, read> input: array<f32>;
439@group(0) @binding(1) var<storage, read> kernel: array<f32>;
440@group(0) @binding(2) var<storage, write> output: array<f32>;
441@group(0) @binding(3) var<uniform> params: SeparableParams;
442
443@compute @workgroup_size(256)
444#[allow(dead_code)]
445fn separable_conv_1d(@builtin(global_invocation_id) global_id: vec3<u32>) {
446    let idx = global_id.x;
447    let total_size = params.height * params.width;
448
449    if (idx >= total_size) {
450        return;
451    }
452
453    let y = idx / params.width;
454    let x = idx % params.width;
455    let half_kernel = i32(params.kernel_size / 2u);
456    var sum = 0.0;
457
458    if (params.horizontal != 0u) {
459        // Horizontal pass
460        for (var k = 0u; k < params.kernel_size; k = k + 1u) {
461            let src_x = i32(x) + i32(k) - half_kernel;
462            if (src_x >= 0 && src_x < i32(params.width)) {
463                let input_idx = y * params.width + u32(src_x);
464                sum += input[input_idx] * kernel[k];
465            }
466        }
467    } else {
468        // Vertical pass
469        for (var k = 0u; k < params.kernel_size; k = k + 1u) {
470            let src_y = i32(y) + i32(k) - half_kernel;
471            if (src_y >= 0 && src_y < i32(params.height)) {
472                let input_idx = u32(src_y) * params.width + x;
473                sum += input[input_idx] * kernel[k];
474            }
475        }
476    }
477
478    output[idx] = sum;
479}
480"#
481        .to_string(),
482        _ => {
483            // Fall back for unsupported backends
484            return Ok(input.to_vec());
485        }
486    };
487
488    ctx.context
489        .execute(|compiler| match compiler.compile(&kernel_source) {
490            Ok(kernel_handle) => {
491                kernel_handle.set_buffer("input", &input_buffer);
492                kernel_handle.set_buffer("kernel", &kernel_buffer);
493                kernel_handle.set_buffer("output", &output_buffer);
494
495                // Set parameters based on backend type
496                match ctx.backend() {
497                    GpuBackend::Wgpu => {
498                        // For WebGPU, parameters are passed as a uniform struct
499                        kernel_handle.set_u32("height", height as u32);
500                        kernel_handle.set_u32("width", width as u32);
501                        kernel_handle.set_u32("kernel_size", kernel.len() as u32);
502                        kernel_handle.set_u32("horizontal", if horizontal { 1 } else { 0 });
503                    }
504                    _ => {
505                        // For CUDA and other backends, use individual parameters
506                        kernel_handle.set_i32("height", height as i32);
507                        kernel_handle.set_i32("width", width as i32);
508                        kernel_handle.set_i32("kernel_size", kernel.len() as i32);
509                        kernel_handle.set_i32("horizontal", if horizontal { 1 } else { 0 });
510                    }
511                }
512
513                let workgroup_size = 256;
514                let work_groups = (height * width).div_ceil(workgroup_size);
515
516                kernel_handle.dispatch([work_groups as u32, 1, 1]);
517
518                let mut result = vec![0.0f32; height * width];
519                output_buffer.copy_to_host(&mut result)
520                    .map_err(|e| VisionError::Other(format!("Failed to copy result from GPU: {e}")))?;
521                Ok(result)
522            }
523            Err(compile_error) => {
524                eprintln!(
525                    "GPU separable convolution kernel compilation failed for backend {:?}: {}. Using CPU fallback.",
526                    ctx.backend(),
527                    compile_error
528                );
529                Ok(input.to_vec())
530            },
531        })
532}
533
534#[cfg(test)]
535mod tests {
536    use super::*;
537    use scirs2_core::ndarray::arr2;
538
539    #[test]
540    fn test_gaussian_kernel_generation() {
541        let kernel = generate_gaussian_kernel(5, 1.0);
542        assert_eq!(kernel.len(), 5);
543
544        // Check normalization
545        let sum: f32 = kernel.iter().sum();
546        assert!((sum - 1.0).abs() < 1e-6);
547
548        // Check symmetry
549        assert!((kernel[0] - kernel[4]).abs() < 1e-6);
550        assert!((kernel[1] - kernel[3]).abs() < 1e-6);
551    }
552
553    #[test]
554    fn test_gpu_convolution() {
555        if let Ok(ctx) = GpuVisionContext::new() {
556            let image = arr2(&[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]);
557
558            let kernel = arr2(&[[0.0, -1.0, 0.0], [-1.0, 4.0, -1.0], [0.0, -1.0, 0.0]]);
559
560            let result = gpu_convolve_2d(&ctx, &image.view(), &kernel.view());
561            assert!(result.is_ok());
562        }
563    }
564}
scirs2_vision/gpu_modules/basic_operations.rs

scirs2_vision/gpu_modules/
basic_operations.rs