scirs2_vision/gpu_modules/basic_operations.rs

//! GPU-accelerated basic image operations: 2-D convolution, Gaussian blur,
//! and separable 1-D convolution passes, with CPU/SIMD fallbacks.

use super::context::GpuVisionContext;
use crate::error::{Result, VisionError};
use scirs2_core::gpu::GpuBackend;
use scirs2_core::ndarray::{Array2, ArrayView2};

/// Performs 2-D convolution on the GPU with same-size output and zero padding.
///
/// Falls back to the SIMD implementation when no GPU is available, and to a
/// runtime-compiled custom kernel when the built-in `conv2d` kernel is missing.
#[allow(dead_code)]
pub fn gpu_convolve_2d(
    ctx: &GpuVisionContext,
    image: &ArrayView2<f32>,
    kernel: &ArrayView2<f32>,
) -> Result<Array2<f32>> {
    let (height, width) = image.dim();
    let (k_height, k_width) = kernel.dim();

    if k_height % 2 == 0 || k_width % 2 == 0 {
        return Err(VisionError::InvalidInput(
            "Kernel must have odd dimensions".to_string(),
        ));
    }

    if !ctx.is_gpu_available() {
        return crate::simd_ops::simd_convolve_2d(image, kernel);
    }

    // "Same" convolution: the output shape matches the input.
    let out_height = height;
    let out_width = width;

    let image_flat: Vec<f32> = image.iter().cloned().collect();
    let kernel_flat: Vec<f32> = kernel.iter().cloned().collect();

    let image_buffer = ctx.context.create_buffer_from_slice(&image_flat);
    let kernel_buffer = ctx.context.create_buffer_from_slice(&kernel_flat);
    let output_buffer = ctx.context.create_buffer::<f32>(out_height * out_width);

    match ctx.context.get_kernel("conv2d") {
        Ok(kernel_handle) => {
            kernel_handle.set_buffer("input", &image_buffer);
            kernel_handle.set_buffer("kernel", &kernel_buffer);
            kernel_handle.set_buffer("output", &output_buffer);
            kernel_handle.set_u32("batch_size", 1);
            kernel_handle.set_u32("in_channels", 1);
            kernel_handle.set_u32("out_channels", 1);
            kernel_handle.set_u32("input_height", height as u32);
            kernel_handle.set_u32("input_width", width as u32);
            kernel_handle.set_u32("output_height", out_height as u32);
            kernel_handle.set_u32("output_width", out_width as u32);
            kernel_handle.set_u32("kernel_height", k_height as u32);
            kernel_handle.set_u32("kernel_width", k_width as u32);
            kernel_handle.set_u32("stride_y", 1);
            kernel_handle.set_u32("stride_x", 1);
            kernel_handle.set_u32("padding_y", (k_height / 2) as u32);
            kernel_handle.set_u32("padding_x", (k_width / 2) as u32);

            let workgroup_size = 16;
            let work_groups_x = out_height.div_ceil(workgroup_size);
            let work_groups_y = out_width.div_ceil(workgroup_size);

            kernel_handle.dispatch([work_groups_x as u32, work_groups_y as u32, 1]);

            let mut result_flat = vec![0.0f32; out_height * out_width];
            output_buffer
                .copy_to_host(&mut result_flat)
                .map_err(|e| VisionError::Other(format!("Failed to copy result from GPU: {e}")))?;

            Array2::from_shape_vec((out_height, out_width), result_flat)
                .map_err(|e| VisionError::Other(format!("Failed to reshape output: {e}")))
        }
        Err(_) => {
            // Built-in kernel not available; compile a backend-specific one instead.
            gpu_convolve_2d_custom(ctx, image, kernel)
        }
    }
}
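
// Illustrative usage sketch (not part of the original module): applying a 3x3
// Laplacian kernel with `gpu_convolve_2d`. The function name below is
// hypothetical and mirrors the style of the tests at the bottom of this file.
#[allow(dead_code)]
fn example_laplacian_usage() {
    use scirs2_core::ndarray::arr2;

    if let Ok(ctx) = GpuVisionContext::new() {
        let image = arr2(&[[1.0f32, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]);
        // Odd kernel dimensions are required by `gpu_convolve_2d`.
        let kernel = arr2(&[[0.0f32, -1.0, 0.0], [-1.0, 4.0, -1.0], [0.0, -1.0, 0.0]]);

        // "Same" convolution: the output has the input's shape.
        if let Ok(edges) = gpu_convolve_2d(&ctx, &image.view(), &kernel.view()) {
            assert_eq!(edges.dim(), image.dim());
        }
    }
}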

/// Fallback for `gpu_convolve_2d`: compiles a backend-specific convolution
/// kernel (CUDA or WGSL) at runtime, falling back to SIMD for other backends.
#[allow(dead_code)]
fn gpu_convolve_2d_custom(
    ctx: &GpuVisionContext,
    image: &ArrayView2<f32>,
    kernel: &ArrayView2<f32>,
) -> Result<Array2<f32>> {
    let conv_kernel_source = match ctx.backend() {
        GpuBackend::Cuda => {
            r#"
extern "C" __global__ void conv2d_vision(
    const float* __restrict__ input,
    const float* __restrict__ kernel,
    float* __restrict__ output,
    int height,
    int width,
    int k_height,
    int k_width
) {
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    if (y >= height || x >= width) return;

    int k_half_h = k_height / 2;
    int k_half_w = k_width / 2;
    float sum = 0.0f;

    for (int ky = 0; ky < k_height; ky++) {
        for (int kx = 0; kx < k_width; kx++) {
            int src_y = y + ky - k_half_h;
            int src_x = x + kx - k_half_w;

            if (src_y >= 0 && src_y < height && src_x >= 0 && src_x < width) {
                sum += input[src_y * width + src_x] * kernel[ky * k_width + kx];
            }
        }
    }

    output[y * width + x] = sum;
}
"#
        }
        GpuBackend::Wgpu => {
            r#"
struct Params {
    height: u32,
    width: u32,
    k_height: u32,
    k_width: u32,
};

@group(0) @binding(0) var<storage, read> input: array<f32>;
@group(0) @binding(1) var<storage, read> kernel: array<f32>;
@group(0) @binding(2) var<storage, read_write> output: array<f32>;
@group(0) @binding(3) var<uniform> params: Params;

@compute @workgroup_size(16, 16)
fn conv2d_vision(@builtin(global_invocation_id) global_id: vec3<u32>) {
    let y = global_id.y;
    let x = global_id.x;

    if (y >= params.height || x >= params.width) {
        return;
    }

    let k_half_h = i32(params.k_height / 2u);
    let k_half_w = i32(params.k_width / 2u);
    var sum = 0.0;

    for (var ky = 0u; ky < params.k_height; ky = ky + 1u) {
        for (var kx = 0u; kx < params.k_width; kx = kx + 1u) {
            let src_y = i32(y) + i32(ky) - k_half_h;
            let src_x = i32(x) + i32(kx) - k_half_w;

            if (src_y >= 0 && src_y < i32(params.height) && src_x >= 0 && src_x < i32(params.width)) {
                let src_idx = u32(src_y) * params.width + u32(src_x);
                let kernel_idx = ky * params.k_width + kx;
                sum += input[src_idx] * kernel[kernel_idx];
            }
        }
    }

    output[y * params.width + x] = sum;
}
"#
        }
        _ => {
            // Other backends: fall back to the SIMD implementation.
            return crate::simd_ops::simd_convolve_2d(image, kernel);
        }
    };

    ctx.context.execute(|compiler| {
        match compiler.compile(conv_kernel_source) {
            Ok(kernel_handle) => {
                let (height, width) = image.dim();
                let (k_height, k_width) = kernel.dim();

                let image_flat: Vec<f32> = image.iter().cloned().collect();
                let kernel_flat: Vec<f32> = kernel.iter().cloned().collect();

                let image_buffer = ctx.context.create_buffer_from_slice(&image_flat);
                let kernel_buffer = ctx.context.create_buffer_from_slice(&kernel_flat);
                let output_buffer = ctx.context.create_buffer::<f32>(height * width);

                kernel_handle.set_buffer("input", &image_buffer);
                kernel_handle.set_buffer("kernel", &kernel_buffer);
                kernel_handle.set_buffer("output", &output_buffer);
                kernel_handle.set_u32("height", height as u32);
                kernel_handle.set_u32("width", width as u32);
                kernel_handle.set_u32("k_height", k_height as u32);
                kernel_handle.set_u32("k_width", k_width as u32);

                let workgroup_size = 16;
                // The kernel maps global x to image columns and global y to rows,
                // so size the dispatch grid as (width, height).
                let work_groups_x = width.div_ceil(workgroup_size);
                let work_groups_y = height.div_ceil(workgroup_size);

                kernel_handle.dispatch([work_groups_x as u32, work_groups_y as u32, 1]);

                let mut result_flat = vec![0.0f32; height * width];
                output_buffer
                    .copy_to_host(&mut result_flat)
                    .map_err(|e| VisionError::Other(format!("Failed to copy result from GPU: {e}")))?;

                Array2::from_shape_vec((height, width), result_flat)
                    .map_err(|e| VisionError::Other(format!("Failed to reshape output: {e}")))
            }
            Err(compile_error) => {
                eprintln!(
                    "GPU kernel compilation failed for backend {:?}: {}. Falling back to SIMD.",
                    ctx.backend(),
                    compile_error
                );

                let error_details = match ctx.backend() {
                    GpuBackend::Cuda => {
                        "CUDA kernel compilation failed. Check CUDA installation and driver version."
                    }
                    GpuBackend::Wgpu => {
                        "WebGPU/WGSL kernel compilation failed. Check shader syntax and GPU support."
                    }
                    GpuBackend::Metal => {
                        "Metal kernel compilation failed. Check macOS version and Metal support."
                    }
                    GpuBackend::OpenCL => {
                        "OpenCL kernel compilation failed. Check OpenCL runtime and drivers."
                    }
                    GpuBackend::Cpu => {
                        "CPU backend should not reach kernel compilation. This is a logic error."
                    }
                    GpuBackend::Rocm => {
                        "ROCm kernel compilation failed. Check ROCm installation and shader support."
                    }
                };

                eprintln!("GPU Error Details: {error_details}");

                crate::simd_ops::simd_convolve_2d(image, kernel)
            }
        }
    })
}

/// Applies a Gaussian blur on the GPU using two separable 1-D passes.
#[allow(dead_code)]
pub fn gpu_gaussian_blur(
    ctx: &GpuVisionContext,
    image: &ArrayView2<f32>,
    sigma: f32,
) -> Result<Array2<f32>> {
    // The kernel spans roughly +/- 3 sigma; `| 1` forces an odd size.
    let kernel_size = (6.0 * sigma).ceil() as usize | 1;
    let kernel = generate_gaussian_kernel(kernel_size, sigma);

    gpu_separable_convolution(ctx, image, &kernel)
}
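
// Illustrative sketch (not part of the original module): the size rule above
// yields `size = ceil(6 * sigma) | 1`, e.g. sigma = 1.0 -> 7 and sigma = 2.5 -> 15.
// The function name below is hypothetical.
#[allow(dead_code)]
fn example_kernel_size_rule() {
    let sizes: Vec<usize> = [1.0f32, 2.5]
        .iter()
        .map(|&sigma| (6.0 * sigma).ceil() as usize | 1)
        .collect();
    assert_eq!(sizes, vec![7, 15]);
}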

/// Generates a 1-D Gaussian kernel of the given size, normalized to sum to 1.
#[allow(dead_code)]
fn generate_gaussian_kernel(size: usize, sigma: f32) -> Vec<f32> {
    let half = size / 2;
    let mut kernel = vec![0.0f32; size];
    let mut sum = 0.0f32;

    for (i, kernel_val) in kernel.iter_mut().enumerate() {
        let x = i as f32 - half as f32;
        let value = (-x * x / (2.0 * sigma * sigma)).exp();
        *kernel_val = value;
        sum += value;
    }

    // Normalize so the weights sum to 1.
    for val in &mut kernel {
        *val /= sum;
    }

    kernel
}

/// Runs a separable convolution: a horizontal 1-D pass followed by a vertical
/// 1-D pass with the same kernel.
#[allow(dead_code)]
fn gpu_separable_convolution(
    ctx: &GpuVisionContext,
    image: &ArrayView2<f32>,
    kernel_1d: &[f32],
) -> Result<Array2<f32>> {
    let (height, width) = image.dim();
    let kernel_size = kernel_1d.len();

    if !ctx.is_gpu_available() {
        // Approximate sigma from the kernel size chosen in `gpu_gaussian_blur` (size ~ 6 * sigma).
        return crate::simd_ops::simd_gaussian_blur(image, kernel_size as f32 / 6.0);
    }

    let image_flat: Vec<f32> = image.iter().cloned().collect();

    let horizontal_result = gpu_separable_1d_pass(
        ctx,
        &image_flat,
        kernel_1d,
        height,
        width,
        true, // horizontal pass
    )?;

    let final_result = gpu_separable_1d_pass(
        ctx,
        &horizontal_result,
        kernel_1d,
        height,
        width,
        false, // vertical pass
    )?;

    Array2::from_shape_vec((height, width), final_result)
        .map_err(|e| VisionError::Other(format!("Failed to reshape output: {e}")))
}
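
// Reference sketch (illustrative, not part of the original module): with zero
// padding, the two 1-D passes above are equivalent to one 2-D convolution with
// the outer-product kernel K[i][j] = k[i] * k[j]. A minimal CPU version for
// checking small inputs; the function name is hypothetical.
#[allow(dead_code)]
fn separable_reference(image: &ArrayView2<f32>, kernel_1d: &[f32]) -> Array2<f32> {
    let (height, width) = image.dim();
    let half = kernel_1d.len() as isize / 2;
    let mut out = Array2::<f32>::zeros((height, width));
    for y in 0..height as isize {
        for x in 0..width as isize {
            let mut sum = 0.0f32;
            for (i, &ki) in kernel_1d.iter().enumerate() {
                for (j, &kj) in kernel_1d.iter().enumerate() {
                    let sy = y + i as isize - half;
                    let sx = x + j as isize - half;
                    // Out-of-bounds samples contribute zero (matches the GPU kernels).
                    if sy >= 0 && sy < height as isize && sx >= 0 && sx < width as isize {
                        sum += image[(sy as usize, sx as usize)] * ki * kj;
                    }
                }
            }
            out[(y as usize, x as usize)] = sum;
        }
    }
    out
}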

/// Single 1-D convolution pass (horizontal or vertical) over a flattened
/// image buffer, using a kernel compiled per backend at runtime.
#[allow(dead_code)]
fn gpu_separable_1d_pass(
    ctx: &GpuVisionContext,
    input: &[f32],
    kernel: &[f32],
    height: usize,
    width: usize,
    horizontal: bool,
) -> Result<Vec<f32>> {
    let input_buffer = ctx.context.create_buffer_from_slice(input);
    let kernel_buffer = ctx.context.create_buffer_from_slice(kernel);
    let output_buffer = ctx.context.create_buffer::<f32>(height * width);

    let kernel_source = match ctx.backend() {
        GpuBackend::Cuda => r#"
extern "C" __global__ void separable_conv_1d(
    const float* __restrict__ input,
    const float* __restrict__ kernel,
    float* __restrict__ output,
    int height,
    int width,
    int kernel_size,
    int horizontal
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int total_size = height * width;

    if (idx >= total_size) return;

    int y = idx / width;
    int x = idx % width;
    int half_kernel = kernel_size / 2;
    float sum = 0.0f;

    if (horizontal) {
        // Horizontal pass
        for (int k = 0; k < kernel_size; k++) {
            int src_x = x + k - half_kernel;
            if (src_x >= 0 && src_x < width) {
                sum += input[y * width + src_x] * kernel[k];
            }
        }
    } else {
        // Vertical pass
        for (int k = 0; k < kernel_size; k++) {
            int src_y = y + k - half_kernel;
            if (src_y >= 0 && src_y < height) {
                sum += input[src_y * width + x] * kernel[k];
            }
        }
    }

    output[idx] = sum;
}
"#
        .to_string(),
        GpuBackend::Wgpu => r#"
struct SeparableParams {
    height: u32,
    width: u32,
    kernel_size: u32,
    horizontal: u32,
};

@group(0) @binding(0) var<storage, read> input: array<f32>;
@group(0) @binding(1) var<storage, read> kernel: array<f32>;
@group(0) @binding(2) var<storage, read_write> output: array<f32>;
@group(0) @binding(3) var<uniform> params: SeparableParams;

@compute @workgroup_size(256)
fn separable_conv_1d(@builtin(global_invocation_id) global_id: vec3<u32>) {
    let idx = global_id.x;
    let total_size = params.height * params.width;

    if (idx >= total_size) {
        return;
    }

    let y = idx / params.width;
    let x = idx % params.width;
    let half_kernel = i32(params.kernel_size / 2u);
    var sum = 0.0;

    if (params.horizontal != 0u) {
        // Horizontal pass
        for (var k = 0u; k < params.kernel_size; k = k + 1u) {
            let src_x = i32(x) + i32(k) - half_kernel;
            if (src_x >= 0 && src_x < i32(params.width)) {
                let input_idx = y * params.width + u32(src_x);
                sum += input[input_idx] * kernel[k];
            }
        }
    } else {
        // Vertical pass
        for (var k = 0u; k < params.kernel_size; k = k + 1u) {
            let src_y = i32(y) + i32(k) - half_kernel;
            if (src_y >= 0 && src_y < i32(params.height)) {
                let input_idx = u32(src_y) * params.width + x;
                sum += input[input_idx] * kernel[k];
            }
        }
    }

    output[idx] = sum;
}
"#
        .to_string(),
        _ => {
            // Unsupported backend for this pass: return the input unchanged.
            return Ok(input.to_vec());
        }
    };

    ctx.context
        .execute(|compiler| match compiler.compile(&kernel_source) {
            Ok(kernel_handle) => {
                kernel_handle.set_buffer("input", &input_buffer);
                kernel_handle.set_buffer("kernel", &kernel_buffer);
                kernel_handle.set_buffer("output", &output_buffer);

                match ctx.backend() {
                    GpuBackend::Wgpu => {
                        // WGSL uniforms are declared as u32.
                        kernel_handle.set_u32("height", height as u32);
                        kernel_handle.set_u32("width", width as u32);
                        kernel_handle.set_u32("kernel_size", kernel.len() as u32);
                        kernel_handle.set_u32("horizontal", if horizontal { 1 } else { 0 });
                    }
                    _ => {
                        // The CUDA kernel takes signed int parameters.
                        kernel_handle.set_i32("height", height as i32);
                        kernel_handle.set_i32("width", width as i32);
                        kernel_handle.set_i32("kernel_size", kernel.len() as i32);
                        kernel_handle.set_i32("horizontal", if horizontal { 1 } else { 0 });
                    }
                }

                let workgroup_size = 256;
                let work_groups = (height * width).div_ceil(workgroup_size);

                kernel_handle.dispatch([work_groups as u32, 1, 1]);

                let mut result = vec![0.0f32; height * width];
                output_buffer
                    .copy_to_host(&mut result)
                    .map_err(|e| VisionError::Other(format!("Failed to copy result from GPU: {e}")))?;
                Ok(result)
            }
            Err(compile_error) => {
                eprintln!(
                    "GPU separable convolution kernel compilation failed for backend {:?}: {}. Returning input unchanged.",
                    ctx.backend(),
                    compile_error
                );
                Ok(input.to_vec())
            }
        })
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::arr2;

    #[test]
    fn test_gaussian_kernel_generation() {
        let kernel = generate_gaussian_kernel(5, 1.0);
        assert_eq!(kernel.len(), 5);

        // The kernel should be normalized (weights sum to 1).
        let sum: f32 = kernel.iter().sum();
        assert!((sum - 1.0).abs() < 1e-6);

        // The kernel should be symmetric about its center.
        assert!((kernel[0] - kernel[4]).abs() < 1e-6);
        assert!((kernel[1] - kernel[3]).abs() < 1e-6);
    }

    #[test]
    fn test_gpu_convolution() {
        if let Ok(ctx) = GpuVisionContext::new() {
            let image = arr2(&[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]);

            let kernel = arr2(&[[0.0, -1.0, 0.0], [-1.0, 4.0, -1.0], [0.0, -1.0, 0.0]]);

            let result = gpu_convolve_2d(&ctx, &image.view(), &kernel.view());
            assert!(result.is_ok());
        }
    }
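
    // Additional sketch test (not in the original file): exercises the
    // separable Gaussian blur path end to end when a GPU context is available.
    #[test]
    fn test_gpu_gaussian_blur_shape() {
        if let Ok(ctx) = GpuVisionContext::new() {
            let image = arr2(&[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]);

            let result = gpu_gaussian_blur(&ctx, &image.view(), 1.0);
            assert!(result.is_ok());
            if let Ok(blurred) = result {
                assert_eq!(blurred.dim(), (3, 3));
            }
        }
    }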
}