scirs2_vision/gpu_modules/
batch_processing.rs1use super::{basic_operations::gpu_convolve_2d, context::GpuVisionContext};
7use crate::error::{Result, VisionError};
8use scirs2_core::gpu::GpuBackend;
9use scirs2_core::ndarray::{Array2, ArrayView2};
10
11#[allow(dead_code)]
25pub fn gpu_batch_process<F>(
26 ctx: &GpuVisionContext,
27 images: &[ArrayView2<f32>],
28 operation: F,
29) -> Result<Vec<Array2<f32>>>
30where
31 F: Fn(&GpuVisionContext, &ArrayView2<f32>) -> Result<Array2<f32>>,
32{
33 images.iter().map(|img| operation(ctx, img)).collect()
34}
35
/// Pool of reusable GPU buffers keyed by element count, used to avoid
/// re-allocating device memory for frequently recurring buffer sizes.
pub struct GpuMemoryPool {
    // Free buffers grouped by size (number of f32 elements per buffer).
    buffers: std::collections::HashMap<usize, Vec<scirs2_core::gpu::GpuBuffer<f32>>>,
    // Maximum number of pooled buffers retained per size bucket; returns
    // beyond this limit are dropped instead of pooled.
    max_pool_size: usize,
}
43
impl Default for GpuMemoryPool {
    /// Equivalent to [`GpuMemoryPool::new`].
    fn default() -> Self {
        Self::new()
    }
}
49
50impl GpuMemoryPool {
51 pub fn new() -> Self {
53 Self {
54 buffers: std::collections::HashMap::new(),
55 max_pool_size: 50, }
57 }
58
59 pub fn get_buffer(
61 &mut self,
62 ctx: &GpuVisionContext,
63 size: usize,
64 ) -> scirs2_core::gpu::GpuBuffer<f32> {
65 if let Some(pool) = self.buffers.get_mut(&size) {
66 if let Some(buffer) = pool.pop() {
67 return buffer;
68 }
69 }
70
71 ctx.context.create_buffer::<f32>(size)
73 }
74
75 pub fn return_buffer(&mut self, size: usize, buffer: scirs2_core::gpu::GpuBuffer<f32>) {
77 let pool = self.buffers.entry(size).or_default();
78 if pool.len() < self.max_pool_size {
79 pool.push(buffer);
80 }
81 }
83
84 pub fn clear(&mut self) {
86 self.buffers.clear();
87 }
88}
89
90#[allow(dead_code)]
98pub fn gpu_batch_convolve_2d(
99 ctx: &GpuVisionContext,
100 images: &[ArrayView2<f32>],
101 kernel: &ArrayView2<f32>,
102) -> Result<Vec<Array2<f32>>> {
103 if images.is_empty() {
104 return Ok(Vec::new());
105 }
106
107 let (height, width) = images[0].dim();
108 let batch_size = images.len();
109 let (k_height, k_width) = kernel.dim();
110
111 for (i, image) in images.iter().enumerate() {
113 if image.dim() != (height, width) {
114 return Err(VisionError::InvalidInput(format!(
115 "Image {i} has different dimensions"
116 )));
117 }
118 }
119
120 if !ctx.is_gpu_available() {
121 return images
123 .iter()
124 .map(|img| crate::simd_ops::simd_convolve_2d(img, kernel))
125 .collect();
126 }
127
128 let total_size = batch_size * height * width;
130 let mut batch_data = Vec::with_capacity(total_size);
131
132 for image in images {
133 batch_data.extend(image.iter().copied());
134 }
135
136 let kernel_flat: Vec<f32> = kernel.iter().copied().collect();
137
138 let batch_buffer = ctx.context.create_buffer_from_slice(&batch_data);
140 let kernel_buffer = ctx.context.create_buffer_from_slice(&kernel_flat);
141 let output_buffer = ctx.context.create_buffer::<f32>(total_size);
142
143 let batch_kernel_source = match ctx.backend() {
145 GpuBackend::Cuda => {
146 r#"
147extern "C" __global__ void batch_conv2d(
148 const float* __restrict__ input,
149 const float* __restrict__ kernel,
150 float* __restrict__ output,
151 int batch_size,
152 int height,
153 int width,
154 int k_height,
155 int k_width
156) {
157 int batch = blockIdx.z;
158 int y = blockIdx.y * blockDim.y + threadIdx.y;
159 int x = blockIdx.x * blockDim.x + threadIdx.x;
160
161 if (batch >= batch_size || y >= height || x >= width) return;
162
163 int k_half_h = k_height / 2;
164 int k_half_w = k_width / 2;
165 float sum = 0.0f;
166 int imagesize = height * width;
167 int batch_offset = batch * imagesize;
168
169 for (int ky = 0; ky < k_height; ky++) {
170 for (int kx = 0; kx < k_width; kx++) {
171 int src_y = y + ky - k_half_h;
172 int src_x = x + kx - k_half_w;
173
174 if (src_y >= 0 && src_y < height && src_x >= 0 && src_x < width) {
175 int src_idx = batch_offset + src_y * width + src_x;
176 int kernel_idx = ky * k_width + kx;
177 sum += input[src_idx] * kernel[kernel_idx];
178 }
179 }
180 }
181
182 output[batch_offset + y * width + x] = sum;
183}
184"#
185 }
186 GpuBackend::Wgpu => {
187 r#"
188struct BatchParams {
189 batch_size: u32,
190 height: u32,
191 width: u32,
192 k_height: u32,
193 k_width: u32,
194};
195
196@group(0) @binding(0) var<storage, read> input: array<f32>;
197@group(0) @binding(1) var<storage, read> kernel: array<f32>;
198@group(0) @binding(2) var<storage, write> output: array<f32>;
199@group(0) @binding(3) var<uniform> params: BatchParams;
200
201@compute @workgroup_size(8, 8, 4)
202#[allow(dead_code)]
203fn batch_conv2d(@builtin(global_invocation_id) global_id: vec3<u32>) {
204 let batch = global_id.z;
205 let y = global_id.y;
206 let x = global_id.x;
207
208 if (batch >= params.batch_size || y >= params.height || x >= params.width) {
209 return;
210 }
211
212 let k_half_h = i32(params.k_height / 2u);
213 let k_half_w = i32(params.k_width / 2u);
214 var sum = 0.0;
215 let imagesize = params.height * params.width;
216 let batch_offset = batch * imagesize;
217
218 for (var ky = 0u; ky < params.k_height; ky = ky + 1u) {
219 for (var kx = 0u; kx < params.k_width; kx = kx + 1u) {
220 let src_y = i32(y) + i32(ky) - k_half_h;
221 let src_x = i32(x) + i32(kx) - k_half_w;
222
223 if (src_y >= 0 && src_y < i32(params.height) && src_x >= 0 && src_x < i32(params.width)) {
224 let src_idx = batch_offset + u32(src_y) * params.width + u32(src_x);
225 let kernel_idx = ky * params.k_width + kx;
226 sum += input[src_idx] * kernel[kernel_idx];
227 }
228 }
229 }
230
231 output[batch_offset + y * params.width + x] = sum;
232}
233"#
234 }
235 _ => {
236 return images
238 .iter()
239 .map(|img| crate::simd_ops::simd_convolve_2d(img, kernel))
240 .collect();
241 }
242 };
243
244 ctx.context.execute(|compiler| {
245 match compiler.compile(batch_kernel_source) {
246 Ok(kernel_handle) => {
247 kernel_handle.set_buffer("input", &batch_buffer);
248 kernel_handle.set_buffer("kernel", &kernel_buffer);
249 kernel_handle.set_buffer("output", &output_buffer);
250 kernel_handle.set_u32("batch_size", batch_size as u32);
251 kernel_handle.set_u32("height", height as u32);
252 kernel_handle.set_u32("width", width as u32);
253 kernel_handle.set_u32("k_height", k_height as u32);
254 kernel_handle.set_u32("k_width", k_width as u32);
255
256 let workgroup_size = 8;
257 let work_groups_x = height.div_ceil(workgroup_size);
258 let work_groups_y = width.div_ceil(workgroup_size);
259 let work_groups_z = batch_size.div_ceil(4); kernel_handle.dispatch([
262 work_groups_x as u32,
263 work_groups_y as u32,
264 work_groups_z as u32,
265 ]);
266
267 let mut result_flat = vec![0.0f32; total_size];
268 output_buffer.copy_to_host(&mut result_flat).map_err(|e| {
269 VisionError::Other(format!("Failed to copy result from GPU: {e}"))
270 })?;
271
272 let mut results = Vec::with_capacity(batch_size);
274 for i in 0..batch_size {
275 let start = i * height * width;
276 let end = start + height * width;
277 let image_data = &result_flat[start..end];
278
279 let result_array = Array2::from_shape_vec((height, width), image_data.to_vec())
280 .map_err(|e| {
281 VisionError::Other(format!("Failed to reshape output: {e}"))
282 })?;
283
284 results.push(result_array);
285 }
286
287 Ok(results)
288 }
289 Err(_) => {
290 images
292 .iter()
293 .map(|img| crate::simd_ops::simd_convolve_2d(img, kernel))
294 .collect()
295 }
296 }
297 })
298}
299
/// Owns a GPU context plus a buffer pool for use by async processing calls.
pub struct AsyncGpuProcessor {
    // GPU context all dispatched operations run on.
    context: GpuVisionContext,
    // Reserved for future buffer reuse; not consulted by current operations.
    #[allow(dead_code)]
    memory_pool: GpuMemoryPool,
}
308
309impl AsyncGpuProcessor {
310 pub fn new() -> Result<Self> {
312 Ok(Self {
313 context: GpuVisionContext::new()?,
314 memory_pool: GpuMemoryPool::new(),
315 })
316 }
317
318 pub async fn process_async(
320 &mut self,
321 image: &ArrayView2<'_, f32>,
322 operation: GpuOperation,
323 ) -> Result<Array2<f32>> {
324 match operation {
325 GpuOperation::Convolution(kernel) => {
326 gpu_convolve_2d(&self.context, image, &kernel.view())
327 }
328 GpuOperation::GaussianBlur(sigma) => {
329 super::basic_operations::gpu_gaussian_blur(&self.context, image, sigma)
330 }
331 GpuOperation::SobelEdges => {
332 let (_, _, magnitude) =
333 super::feature_detection::gpu_sobel_gradients(&self.context, image)?;
334 Ok(magnitude)
335 }
336 }
337 }
338}
339
/// GPU operations accepted by [`AsyncGpuProcessor::process_async`].
pub enum GpuOperation {
    /// 2-D convolution with the given kernel.
    Convolution(Array2<f32>),
    /// Gaussian blur with the given sigma.
    GaussianBlur(f32),
    /// Sobel edge detection; yields the gradient magnitude.
    SobelEdges,
}
349
/// Small harness for timing GPU vision operations.
pub struct GpuBenchmark {
    // Context the benchmarked operations run on.
    ctx: GpuVisionContext,
}
354
355impl GpuBenchmark {
356 pub fn new() -> Result<Self> {
358 Ok(Self {
359 ctx: GpuVisionContext::new()?,
360 })
361 }
362
363 pub fn benchmark_convolution(&self, imagesize: (usize, usize), kernel_size: usize) -> f64 {
365 use std::time::Instant;
366
367 let image = Array2::zeros(imagesize);
368 let kernel = Array2::ones((kernel_size, kernel_size));
369
370 let start = Instant::now();
371 let _ = gpu_convolve_2d(&self.ctx, &image.view(), &kernel.view());
372
373 start.elapsed().as_secs_f64()
374 }
375}
376
/// Collects per-operation wall-clock timings and memory-usage samples for
/// GPU work, and renders a human-readable summary.
pub struct GpuPerformanceProfiler {
    // Recorded durations, keyed by operation name.
    operation_times: std::collections::HashMap<String, Vec<std::time::Duration>>,
    // (timestamp, bytes) samples recorded via `record_memory_usage`.
    memory_usage: Vec<(std::time::Instant, usize)>,
}

impl Default for GpuPerformanceProfiler {
    /// Equivalent to [`GpuPerformanceProfiler::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl GpuPerformanceProfiler {
    /// Create an empty profiler with no recorded timings or memory samples.
    pub fn new() -> Self {
        Self {
            operation_times: std::collections::HashMap::new(),
            memory_usage: Vec::new(),
        }
    }

    /// Begin timing an operation; pass the returned instant to
    /// [`Self::end_timing`]. The operation name is currently unused here —
    /// timings are attributed when `end_timing` is called.
    pub fn start_timing(&self, _operation: &str) -> std::time::Instant {
        std::time::Instant::now()
    }

    /// Record the elapsed time since `start` under `operation`.
    pub fn end_timing(&mut self, operation: &str, start: std::time::Instant) {
        let duration = start.elapsed();
        self.operation_times
            .entry(operation.to_string())
            .or_default()
            .push(duration);
    }

    /// Record a memory-usage sample (in bytes) at the current instant.
    pub fn record_memory_usage(&mut self, bytes: usize) {
        self.memory_usage.push((std::time::Instant::now(), bytes));
    }

    /// Mean duration recorded for `operation`, or `None` if the operation
    /// has no samples.
    pub fn average_time(&self, operation: &str) -> Option<std::time::Duration> {
        let times = self.operation_times.get(operation)?;
        if times.is_empty() {
            return None;
        }
        let total: std::time::Duration = times.iter().sum();
        Some(total / times.len() as u32)
    }

    /// Render avg/min/max/count statistics (milliseconds) for every
    /// operation that has at least one sample.
    pub fn summary(&self) -> String {
        use std::fmt::Write as _;

        let mut summary = String::from("GPU Performance Summary:\n");

        for (operation, times) in &self.operation_times {
            let count = times.len();
            // Skip operations with no samples; min/max below are therefore
            // always present, avoiding the unreachable-expect the previous
            // version carried.
            let (Some(min), Some(max)) = (times.iter().min(), times.iter().max()) else {
                continue;
            };
            let avg = times.iter().sum::<std::time::Duration>() / count as u32;

            let avg_ms = avg.as_secs_f64() * 1000.0;
            let min_ms = min.as_secs_f64() * 1000.0;
            let max_ms = max.as_secs_f64() * 1000.0;
            // Writing to a String cannot fail, so the Result is ignored.
            let _ = write!(
                summary,
                " {operation}: avg={avg_ms:.2}ms, min={min_ms:.2}ms, max={max_ms:.2}ms, count={count}\n"
            );
        }

        summary
    }
}