Skip to main content

scirs2_ndimage/
gpu_operations.rs

1//! GPU-accelerated implementations for intensive ndimage operations
2//!
3//! This module provides GPU-accelerated versions of the most compute-intensive
4//! ndimage operations, including morphological operations, filtering, and distance
5//! transforms. It automatically falls back to CPU implementations when GPU
6//! acceleration is not available.
7
8use std::collections::HashMap;
9use std::mem;
10use std::sync::Arc;
11
12use scirs2_core::ndarray::{Array, ArrayView, ArrayView2, Dimension, Ix2};
13use scirs2_core::numeric::{Float, FromPrimitive};
14
15use crate::backend::gpu_acceleration_framework::{
16    GpuAccelerationManager, GpuPerformanceReport, MemoryPoolConfig,
17};
18use crate::backend::{Backend, DeviceManager};
19use crate::error::{NdimageError, NdimageResult};
20use crate::interpolation::BoundaryMode;
21use std::f64::consts::PI;
22
23/// High-level GPU operations manager for ndimage
24pub struct GpuOperations {
25    /// GPU acceleration manager
26    acceleration_manager: Arc<GpuAccelerationManager>,
27    /// Device manager for hardware detection
28    device_manager: DeviceManager,
29    /// Configuration
30    config: GpuOperationsConfig,
31    /// Operation registry
32    operation_registry: HashMap<String, OperationInfo>,
33}
34
35#[derive(Debug, Clone)]
36pub struct GpuOperationsConfig {
37    /// Minimum array size to consider GPU acceleration
38    pub min_gpu_size: usize,
39    /// Memory pool configuration
40    pub memory_pool_config: MemoryPoolConfig,
41    /// Automatic fallback to CPU on GPU errors
42    pub auto_fallback: bool,
43    /// Enable performance monitoring
44    pub enable_monitoring: bool,
45    /// GPU operation timeout in milliseconds
46    pub operation_timeout: u64,
47}
48
49impl Default for GpuOperationsConfig {
50    fn default() -> Self {
51        Self {
52            min_gpu_size: 1024 * 1024, // 1M elements minimum
53            memory_pool_config: MemoryPoolConfig::default(),
54            auto_fallback: true,
55            enable_monitoring: true,
56            operation_timeout: 10000, // 10 seconds
57        }
58    }
59}
60
61#[derive(Debug, Clone)]
62struct OperationInfo {
63    /// Operation name
64    name: String,
65    /// GPU kernel source
66    kernel_source: String,
67    /// Preferred backend
68    preferred_backend: Backend,
69    /// Memory complexity (bytes per input element)
70    memory_complexity: f64,
71    /// Computational complexity (operations per input element)
72    compute_complexity: f64,
73}
74
75impl GpuOperations {
76    /// Create a new GPU operations manager
77    pub fn new(config: GpuOperationsConfig) -> NdimageResult<Self> {
78        let acceleration_manager = Arc::new(GpuAccelerationManager::new(
79            config.memory_pool_config.clone(),
80        )?);
81        let device_manager = DeviceManager::new()?;
82
83        let mut gpu_ops = Self {
84            acceleration_manager,
85            device_manager,
86            config,
87            operation_registry: HashMap::new(),
88        };
89
90        // Register built-in GPU operations
91        gpu_ops.register_builtin_operations()?;
92
93        Ok(gpu_ops)
94    }
95
96    /// GPU-accelerated 2D convolution
97    pub fn gpu_convolution_2d<T>(
98        &self,
99        input: ArrayView2<T>,
100        kernel: ArrayView2<T>,
101        mode: BoundaryMode,
102    ) -> NdimageResult<Array<T, Ix2>>
103    where
104        T: Float
105            + FromPrimitive
106            + Clone
107            + Send
108            + Sync
109            + std::ops::DivAssign
110            + std::ops::AddAssign
111            + std::fmt::Debug
112            + 'static,
113    {
114        let operation_name = "convolution_2d";
115
116        // Check if GPU acceleration is beneficial
117        if !self.should_use_gpu(&input, operation_name) {
118            return self.fallback_convolution_2d(input, kernel, mode);
119        }
120
121        // Get GPU kernel source
122        let kernel_source = self.get_convolution_kernel_source();
123
124        // Select best available backend
125        let backend = self.select_backend_for_operation(operation_name)?;
126
127        // Execute on GPU with error handling
128        match self.execute_gpu_convolution(input, kernel, &kernel_source, backend, mode) {
129            Ok(result) => Ok(result),
130            Err(e) if self.config.auto_fallback => {
131                eprintln!("GPU convolution failed: {:?}, falling back to CPU", e);
132                self.fallback_convolution_2d(input, kernel, mode)
133            }
134            Err(e) => Err(e),
135        }
136    }
137
138    /// GPU-accelerated morphological erosion
139    pub fn gpu_morphological_erosion<T>(
140        &self,
141        input: ArrayView2<T>,
142        structuring_element: ArrayView2<bool>,
143        mode: BoundaryMode,
144    ) -> NdimageResult<Array<T, Ix2>>
145    where
146        T: Float
147            + FromPrimitive
148            + Clone
149            + Send
150            + Sync
151            + PartialOrd
152            + std::ops::AddAssign
153            + std::ops::DivAssign
154            + std::fmt::Debug
155            + 'static,
156    {
157        let operation_name = "morphological_erosion";
158
159        if !self.should_use_gpu(&input, operation_name) {
160            return self.fallback_morphological_erosion(input, structuring_element, mode);
161        }
162
163        let kernel_source = self.get_morphology_kernel_source();
164        let backend = self.select_backend_for_operation(operation_name)?;
165
166        match self.execute_gpu_morphology(
167            input,
168            structuring_element,
169            &kernel_source,
170            backend,
171            mode,
172            MorphologyOperation::Erosion,
173        ) {
174            Ok(result) => Ok(result),
175            Err(e) if self.config.auto_fallback => {
176                eprintln!("GPU erosion failed: {:?}, falling back to CPU", e);
177                self.fallback_morphological_erosion(input, structuring_element, mode)
178            }
179            Err(e) => Err(e),
180        }
181    }
182
183    /// GPU-accelerated morphological dilation
184    pub fn gpu_morphological_dilation<T>(
185        &self,
186        input: ArrayView2<T>,
187        structuring_element: ArrayView2<bool>,
188        mode: BoundaryMode,
189    ) -> NdimageResult<Array<T, Ix2>>
190    where
191        T: Float
192            + FromPrimitive
193            + Clone
194            + Send
195            + Sync
196            + PartialOrd
197            + std::ops::AddAssign
198            + std::ops::DivAssign
199            + std::fmt::Debug
200            + 'static,
201    {
202        let operation_name = "morphological_dilation";
203
204        if !self.should_use_gpu(&input, operation_name) {
205            return self.fallback_morphological_dilation(input, structuring_element, mode);
206        }
207
208        let kernel_source = self.get_morphology_kernel_source();
209        let backend = self.select_backend_for_operation(operation_name)?;
210
211        match self.execute_gpu_morphology(
212            input,
213            structuring_element,
214            &kernel_source,
215            backend,
216            mode,
217            MorphologyOperation::Dilation,
218        ) {
219            Ok(result) => Ok(result),
220            Err(e) if self.config.auto_fallback => {
221                eprintln!("GPU dilation failed: {:?}, falling back to CPU", e);
222                self.fallback_morphological_dilation(input, structuring_element, mode)
223            }
224            Err(e) => Err(e),
225        }
226    }
227
228    /// GPU-accelerated Gaussian filter
229    pub fn gpu_gaussian_filter<T>(
230        &self,
231        input: ArrayView2<T>,
232        sigma: (f64, f64),
233        mode: BoundaryMode,
234    ) -> NdimageResult<Array<T, Ix2>>
235    where
236        T: Float + FromPrimitive + Clone + Send + Sync + std::ops::DivAssign + std::ops::AddAssign,
237    {
238        let operation_name = "gaussian_filter";
239
240        if !self.should_use_gpu(&input, operation_name) {
241            return self.fallback_gaussian_filter(input, sigma, mode);
242        }
243
244        let kernel_source = self.get_gaussian_kernel_source();
245        let backend = self.select_backend_for_operation(operation_name)?;
246
247        match self.execute_gpu_gaussian(input, sigma, &kernel_source, backend, mode) {
248            Ok(result) => Ok(result),
249            Err(e) if self.config.auto_fallback => {
250                eprintln!("GPU Gaussian filter failed: {:?}, falling back to CPU", e);
251                self.fallback_gaussian_filter(input, sigma, mode)
252            }
253            Err(e) => Err(e),
254        }
255    }
256
257    /// GPU-accelerated distance transform
258    pub fn gpu_distance_transform<T>(
259        &self,
260        input: ArrayView2<T>,
261        metric: DistanceMetric,
262    ) -> NdimageResult<Array<T, Ix2>>
263    where
264        T: Float + FromPrimitive + Clone + Send + Sync + PartialOrd,
265    {
266        let operation_name = "distance_transform";
267
268        if !self.should_use_gpu(&input, operation_name) {
269            return self.fallback_distance_transform(input, metric);
270        }
271
272        let kernel_source = self.get_distance_transform_kernel_source();
273        let backend = self.select_backend_for_operation(operation_name)?;
274
275        match self.execute_gpu_distance_transform(input, metric, &kernel_source, backend) {
276            Ok(result) => Ok(result),
277            Err(e) if self.config.auto_fallback => {
278                eprintln!(
279                    "GPU distance transform failed: {:?}, falling back to CPU",
280                    e
281                );
282                self.fallback_distance_transform(input, metric)
283            }
284            Err(e) => Err(e),
285        }
286    }
287
288    /// Get comprehensive performance report
289    pub fn get_performance_report(&self) -> GpuPerformanceReport {
290        self.acceleration_manager.get_performance_report()
291    }
292
293    /// Check GPU availability and capabilities
294    pub fn get_gpu_info(&self) -> GpuInfo {
295        let capabilities = self.device_manager.get_capabilities();
296
297        GpuInfo {
298            cuda_available: capabilities.cuda_available,
299            opencl_available: capabilities.opencl_available,
300            metal_available: capabilities.metal_available,
301            gpu_memory: capabilities.gpu_memory_mb,
302            compute_units: capabilities.compute_units,
303            preferred_backend: self.select_preferred_backend(),
304        }
305    }
306
307    // Private implementation methods
308
309    fn register_builtin_operations(&mut self) -> NdimageResult<()> {
310        // Register convolution operation
311        self.operation_registry.insert(
312            "convolution_2d".to_string(),
313            OperationInfo {
314                name: "convolution_2d".to_string(),
315                kernel_source: self.get_convolution_kernel_source(),
316                preferred_backend: {
317                    #[cfg(feature = "opencl")]
318                    {
319                        Backend::OpenCL
320                    }
321                    #[cfg(not(feature = "opencl"))]
322                    {
323                        Backend::Cpu
324                    }
325                },
326                memory_complexity: 2.0,  // Input + output
327                compute_complexity: 9.0, // Assume 3x3 kernel average
328            },
329        );
330
331        // Register morphological operations
332        self.operation_registry.insert(
333            "morphological_erosion".to_string(),
334            OperationInfo {
335                name: "morphological_erosion".to_string(),
336                kernel_source: self.get_morphology_kernel_source(),
337                preferred_backend: {
338                    #[cfg(feature = "opencl")]
339                    {
340                        Backend::OpenCL
341                    }
342                    #[cfg(not(feature = "opencl"))]
343                    {
344                        Backend::Cpu
345                    }
346                },
347                memory_complexity: 2.0,
348                compute_complexity: 9.0,
349            },
350        );
351
352        // Register Gaussian filter
353        self.operation_registry.insert(
354            "gaussian_filter".to_string(),
355            OperationInfo {
356                name: "gaussian_filter".to_string(),
357                kernel_source: self.get_gaussian_kernel_source(),
358                preferred_backend: {
359                    #[cfg(feature = "opencl")]
360                    {
361                        Backend::OpenCL
362                    }
363                    #[cfg(not(feature = "opencl"))]
364                    {
365                        Backend::Cpu
366                    }
367                },
368                memory_complexity: 3.0,  // Separable: input + temp + output
369                compute_complexity: 6.0, // Two 1D passes
370            },
371        );
372
373        // Register distance transform
374        self.operation_registry.insert(
375            "distance_transform".to_string(),
376            OperationInfo {
377                name: "distance_transform".to_string(),
378                kernel_source: self.get_distance_transform_kernel_source(),
379                preferred_backend: {
380                    #[cfg(feature = "opencl")]
381                    {
382                        Backend::OpenCL
383                    }
384                    #[cfg(not(feature = "opencl"))]
385                    {
386                        Backend::Cpu
387                    }
388                },
389                memory_complexity: 2.0,
390                compute_complexity: 10.0, // Multiple passes
391            },
392        );
393
394        Ok(())
395    }
396
397    fn should_use_gpu<T, D>(&self, input: &ArrayView<T, D>, operation_name: &str) -> bool
398    where
399        T: Float + FromPrimitive,
400        D: Dimension,
401    {
402        // Check minimum size threshold
403        if input.len() < self.config.min_gpu_size {
404            return false;
405        }
406
407        // Check if GPU is available
408        let capabilities = self.device_manager.get_capabilities();
409        if !capabilities.gpu_available {
410            return false;
411        }
412
413        // Check memory requirements
414        if let Some(op_info) = self.operation_registry.get(operation_name) {
415            let required_memory =
416                input.len() * std::mem::size_of::<T>() * op_info.memory_complexity as usize;
417            let available_memory = capabilities.gpu_memory_mb * 1024 * 1024;
418
419            if required_memory > available_memory {
420                return false;
421            }
422        }
423
424        true
425    }
426
427    fn select_backend_for_operation(&self, operation_name: &str) -> NdimageResult<Backend> {
428        let capabilities = self.device_manager.get_capabilities();
429
430        // Check operation preference
431        if let Some(op_info) = self.operation_registry.get(operation_name) {
432            match op_info.preferred_backend {
433                #[cfg(feature = "cuda")]
434                Backend::Cuda if capabilities.cuda_available => return Ok(Backend::Cuda),
435                #[cfg(feature = "opencl")]
436                Backend::OpenCL if capabilities.opencl_available => return Ok(Backend::OpenCL),
437                #[cfg(all(target_os = "macos", feature = "metal"))]
438                Backend::Metal if capabilities.metal_available => return Ok(Backend::Metal),
439                _ => {}
440            }
441        }
442
443        // Fallback to best available backend
444        if capabilities.cuda_available {
445            #[cfg(feature = "cuda")]
446            {
447                Ok(Backend::Cuda)
448            }
449            #[cfg(not(feature = "cuda"))]
450            {
451                Ok(Backend::Cpu)
452            }
453        } else if capabilities.opencl_available {
454            #[cfg(feature = "opencl")]
455            {
456                Ok(Backend::OpenCL)
457            }
458            #[cfg(not(feature = "opencl"))]
459            {
460                Ok(Backend::Cpu)
461            }
462        } else if capabilities.metal_available {
463            #[cfg(all(target_os = "macos", feature = "metal"))]
464            {
465                Ok(Backend::Metal)
466            }
467            #[cfg(not(all(target_os = "macos", feature = "metal")))]
468            {
469                Ok(Backend::Cpu)
470            }
471        } else {
472            Err(NdimageError::GpuNotAvailable(
473                "No GPU backend available".to_string(),
474            ))
475        }
476    }
477
478    fn select_preferred_backend(&self) -> Backend {
479        let capabilities = self.device_manager.get_capabilities();
480
481        if capabilities.cuda_available {
482            #[cfg(feature = "cuda")]
483            {
484                Backend::Cuda
485            }
486            #[cfg(not(feature = "cuda"))]
487            {
488                Backend::Cpu
489            }
490        } else if capabilities.opencl_available {
491            #[cfg(feature = "opencl")]
492            {
493                Backend::OpenCL
494            }
495            #[cfg(not(feature = "opencl"))]
496            {
497                Backend::Cpu
498            }
499        } else if capabilities.metal_available {
500            #[cfg(all(target_os = "macos", feature = "metal"))]
501            {
502                Backend::Metal
503            }
504            #[cfg(not(all(target_os = "macos", feature = "metal")))]
505            {
506                Backend::Cpu
507            }
508        } else {
509            Backend::Cpu
510        }
511    }
512
513    // GPU operation execution methods
514
515    fn execute_gpu_convolution<T>(
516        &self,
517        input: ArrayView2<T>,
518        _kernel: ArrayView2<T>,
519        kernel_source: &str,
520        backend: Backend,
521        mode: BoundaryMode,
522    ) -> NdimageResult<Array<T, Ix2>>
523    where
524        T: Float + FromPrimitive + Clone + Send + Sync + std::ops::DivAssign + std::ops::AddAssign,
525    {
526        // This would contain the actual GPU execution logic
527        // For now, we'll use the acceleration manager framework
528        self.acceleration_manager
529            .execute_operation("convolution_2d", input.into_dyn(), kernel_source, backend)
530            .map(|result| {
531                result
532                    .into_dimensionality::<Ix2>()
533                    .expect("Operation failed")
534            })
535    }
536
537    fn execute_gpu_morphology<T>(
538        &self,
539        input: ArrayView2<T>,
540        _structuring_element: ArrayView2<bool>,
541        kernel_source: &str,
542        backend: Backend,
543        mode: BoundaryMode,
544        _operation: MorphologyOperation,
545    ) -> NdimageResult<Array<T, Ix2>>
546    where
547        T: Float + FromPrimitive + Clone + Send + Sync + PartialOrd,
548    {
549        // Execute morphological _operation on GPU
550        self.acceleration_manager
551            .execute_operation(
552                "morphological_operation",
553                input.into_dyn(),
554                kernel_source,
555                backend,
556            )
557            .map(|result| {
558                result
559                    .into_dimensionality::<Ix2>()
560                    .expect("Operation failed")
561            })
562    }
563
564    fn execute_gpu_gaussian<T>(
565        &self,
566        input: ArrayView2<T>,
567        _sigma: (f64, f64),
568        kernel_source: &str,
569        backend: Backend,
570        mode: BoundaryMode,
571    ) -> NdimageResult<Array<T, Ix2>>
572    where
573        T: Float + FromPrimitive + Clone + Send + Sync + std::ops::DivAssign + std::ops::AddAssign,
574    {
575        // Execute Gaussian filter on GPU
576        self.acceleration_manager
577            .execute_operation("gaussian_filter", input.into_dyn(), kernel_source, backend)
578            .map(|result| {
579                result
580                    .into_dimensionality::<Ix2>()
581                    .expect("Operation failed")
582            })
583    }
584
585    fn execute_gpu_distance_transform<T>(
586        &self,
587        input: ArrayView2<T>,
588        _metric: DistanceMetric,
589        kernel_source: &str,
590        backend: Backend,
591    ) -> NdimageResult<Array<T, Ix2>>
592    where
593        T: Float + FromPrimitive + Clone + Send + Sync + PartialOrd,
594    {
595        // Execute distance transform on GPU
596        self.acceleration_manager
597            .execute_operation(
598                "distance_transform",
599                input.into_dyn(),
600                kernel_source,
601                backend,
602            )
603            .map(|result| {
604                result
605                    .into_dimensionality::<Ix2>()
606                    .expect("Operation failed")
607            })
608    }
609
610    // Fallback CPU implementations
611
612    fn fallback_convolution_2d<T>(
613        &self,
614        input: ArrayView2<T>,
615        kernel: ArrayView2<T>,
616        mode: BoundaryMode,
617    ) -> NdimageResult<Array<T, Ix2>>
618    where
619        T: Float
620            + FromPrimitive
621            + Clone
622            + Send
623            + Sync
624            + std::ops::DivAssign
625            + std::ops::AddAssign
626            + std::fmt::Debug
627            + 'static,
628    {
629        // Use existing CPU implementation
630        let border_mode = match mode {
631            BoundaryMode::Constant => Some(crate::filters::BorderMode::Constant),
632            BoundaryMode::Reflect => Some(crate::filters::BorderMode::Reflect),
633            BoundaryMode::Mirror => Some(crate::filters::BorderMode::Mirror),
634            BoundaryMode::Wrap => Some(crate::filters::BorderMode::Wrap),
635            BoundaryMode::Nearest => Some(crate::filters::BorderMode::Constant), // No direct equivalent, use Constant
636        };
637        crate::convolve(&input.to_owned(), &kernel.to_owned(), border_mode)
638    }
639
640    fn fallback_morphological_erosion<T>(
641        &self,
642        input: ArrayView2<T>,
643        structuring_element: ArrayView2<bool>,
644        mode: BoundaryMode,
645    ) -> NdimageResult<Array<T, Ix2>>
646    where
647        T: Float
648            + FromPrimitive
649            + Clone
650            + Send
651            + Sync
652            + PartialOrd
653            + std::ops::DivAssign
654            + std::ops::AddAssign
655            + std::fmt::Debug
656            + 'static,
657    {
658        // Use existing CPU implementation
659        use crate::morphology::MorphBorderMode;
660        let morph_mode = match mode {
661            BoundaryMode::Constant => MorphBorderMode::Constant,
662            BoundaryMode::Reflect => MorphBorderMode::Reflect,
663            BoundaryMode::Mirror => MorphBorderMode::Mirror,
664            BoundaryMode::Wrap => MorphBorderMode::Wrap,
665            BoundaryMode::Nearest => MorphBorderMode::Nearest,
666        };
667        crate::grey_erosion(
668            &input.to_owned(),
669            None,
670            Some(&structuring_element.to_owned()),
671            Some(morph_mode),
672            None,
673            None,
674        )
675    }
676
677    fn fallback_morphological_dilation<T>(
678        &self,
679        input: ArrayView2<T>,
680        structuring_element: ArrayView2<bool>,
681        mode: BoundaryMode,
682    ) -> NdimageResult<Array<T, Ix2>>
683    where
684        T: Float
685            + FromPrimitive
686            + Clone
687            + Send
688            + Sync
689            + PartialOrd
690            + std::ops::DivAssign
691            + std::ops::AddAssign
692            + std::fmt::Debug
693            + 'static,
694    {
695        // Use existing CPU implementation
696        use crate::morphology::MorphBorderMode;
697        let morph_mode = match mode {
698            BoundaryMode::Constant => MorphBorderMode::Constant,
699            BoundaryMode::Reflect => MorphBorderMode::Reflect,
700            BoundaryMode::Mirror => MorphBorderMode::Mirror,
701            BoundaryMode::Wrap => MorphBorderMode::Wrap,
702            BoundaryMode::Nearest => MorphBorderMode::Nearest,
703        };
704        crate::grey_dilation(
705            &input.to_owned(),
706            None,
707            Some(&structuring_element.to_owned()),
708            Some(morph_mode),
709            None,
710            None,
711        )
712    }
713
714    fn fallback_gaussian_filter<T>(
715        &self,
716        input: ArrayView2<T>,
717        sigma: (f64, f64),
718        mode: BoundaryMode,
719    ) -> NdimageResult<Array<T, Ix2>>
720    where
721        T: Float + FromPrimitive + Clone + Send + Sync + std::ops::DivAssign + std::ops::AddAssign,
722    {
723        // Implement simple Gaussian filter as CPU fallback
724        // Use separable convolution: convolve along rows then columns
725
726        let (rows, cols) = (input.nrows(), input.ncols());
727        let (sigma_x, sigma_y) = sigma;
728
729        // Create 1D Gaussian kernels
730        let kernel_size_x = (6.0 * sigma_x).ceil() as usize | 1; // Ensure odd
731        let kernel_size_y = (6.0 * sigma_y).ceil() as usize | 1;
732
733        let kernel_x = gaussian_kernel_1d(sigma_x, kernel_size_x);
734        let kernel_y = gaussian_kernel_1d(sigma_y, kernel_size_y);
735
736        // Convolve along rows first
737        let mut temp = Array::zeros((rows, cols));
738        for i in 0..rows {
739            for j in 0..cols {
740                let mut sum = T::zero();
741                let mut weight_sum = T::zero();
742
743                let half_k = (kernel_size_x / 2) as isize;
744                for k_idx in 0..kernel_size_x {
745                    let j_offset = j as isize + k_idx as isize - half_k;
746                    let weight = safe_f64_to_float::<T>(kernel_x[k_idx])?;
747
748                    if j_offset >= 0 && j_offset < cols as isize {
749                        sum = sum + input[[i, j_offset as usize]] * weight;
750                        weight_sum = weight_sum + weight;
751                    }
752                }
753
754                if weight_sum > T::zero() {
755                    temp[[i, j]] = sum / weight_sum;
756                } else {
757                    temp[[i, j]] = input[[i, j]];
758                }
759            }
760        }
761
762        // Convolve along columns
763        let mut output = Array::zeros((rows, cols));
764        for j in 0..cols {
765            for i in 0..rows {
766                let mut sum = T::zero();
767                let mut weight_sum = T::zero();
768
769                let half_k = (kernel_size_y / 2) as isize;
770                for k_idx in 0..kernel_size_y {
771                    let i_offset = i as isize + k_idx as isize - half_k;
772                    let weight = safe_f64_to_float::<T>(kernel_y[k_idx])?;
773
774                    if i_offset >= 0 && i_offset < rows as isize {
775                        sum = sum + temp[[i_offset as usize, j]] * weight;
776                        weight_sum = weight_sum + weight;
777                    }
778                }
779
780                if weight_sum > T::zero() {
781                    output[[i, j]] = sum / weight_sum;
782                } else {
783                    output[[i, j]] = temp[[i, j]];
784                }
785            }
786        }
787
788        Ok(output)
789    }
790
791    fn fallback_distance_transform<T>(
792        &self,
793        input: ArrayView2<T>,
794        metric: DistanceMetric,
795    ) -> NdimageResult<Array<T, Ix2>>
796    where
797        T: Float + FromPrimitive + Clone + Send + Sync + PartialOrd,
798    {
799        // Convert input to boolean array by thresholding at zero
800        let bool_input = input.mapv(|x| x > T::zero());
801
802        // Use existing CPU implementation
803        let result = crate::distance_transform_edt(&bool_input.into_dyn(), None, true, false)?;
804
805        // Convert result back to the desired type
806        if let Some(distances) = result.0 {
807            let converted_distances = distances.mapv(|x| T::from_f64(x).unwrap_or(T::zero()));
808            // Convert from dynamic dimension back to Ix2
809            converted_distances
810                .into_dimensionality::<scirs2_core::ndarray::Ix2>()
811                .map_err(|_| {
812                    NdimageError::DimensionError(
813                        "Failed to convert distance transform result to 2D".into(),
814                    )
815                })
816        } else {
817            Err(NdimageError::ComputationError(
818                "Distance transform did not return distances".into(),
819            ))
820        }
821    }
822
823    // GPU kernel source code methods
824
825    fn get_convolution_kernel_source(&self) -> String {
826        include_str!("backend/kernels/convolution.kernel").to_string()
827    }
828
829    fn get_morphology_kernel_source(&self) -> String {
830        include_str!("backend/kernels/morphology.kernel").to_string()
831    }
832
833    fn get_gaussian_kernel_source(&self) -> String {
834        include_str!("backend/kernels/gaussian_blur.kernel").to_string()
835    }
836
837    fn get_distance_transform_kernel_source(&self) -> String {
838        // For now, use a placeholder - would need a dedicated distance transform kernel
839        include_str!("backend/kernels/morphology.kernel").to_string()
840    }
841}
842
843// Supporting types and enums
844
845#[derive(Debug, Clone, Copy)]
846pub enum MorphologyOperation {
847    Erosion,
848    Dilation,
849}
850
851#[derive(Debug, Clone, Copy)]
852pub enum DistanceMetric {
853    Euclidean,
854    Manhattan,
855    Chessboard,
856}
857
858#[derive(Debug, Clone)]
859pub struct GpuInfo {
860    /// CUDA availability
861    pub cuda_available: bool,
862    /// OpenCL availability
863    pub opencl_available: bool,
864    /// Metal availability (macOS)
865    pub metal_available: bool,
866    /// GPU memory in MB
867    pub gpu_memory: usize,
868    /// Number of compute units
869    pub compute_units: u32,
870    /// Preferred backend for this system
871    pub preferred_backend: Backend,
872}
873
874impl GpuInfo {
875    /// Display GPU information
876    pub fn display(&self) {
877        println!("=== GPU Information ===");
878        println!("CUDA Available: {}", self.cuda_available);
879        println!("OpenCL Available: {}", self.opencl_available);
880        println!("Metal Available: {}", self.metal_available);
881        println!("GPU Memory: {} MB", self.gpu_memory);
882        println!("Compute Units: {}", self.compute_units);
883        println!("Preferred Backend: {:?}", self.preferred_backend);
884    }
885}
886
887/// Convenience function to create a GPU operations instance with default configuration
888#[allow(dead_code)]
889pub fn create_gpu_operations() -> NdimageResult<GpuOperations> {
890    GpuOperations::new(GpuOperationsConfig::default())
891}
892
893/// Convenience function to create GPU operations with custom configuration
894#[allow(dead_code)]
895pub fn create_gpu_operations_with_config(
896    config: GpuOperationsConfig,
897) -> NdimageResult<GpuOperations> {
898    GpuOperations::new(config)
899}
900
901/// Generate 1D Gaussian kernel
902fn gaussian_kernel_1d(sigma: f64, size: usize) -> Vec<f64> {
903    let center = (size / 2) as f64;
904    let coeff = 1.0 / (sigma * (2.0 * PI).sqrt());
905    let denom = 2.0 * sigma * sigma;
906
907    let mut kernel: Vec<f64> = (0..size)
908        .map(|i| {
909            let x = i as f64 - center;
910            coeff * (-x * x / denom).exp()
911        })
912        .collect();
913
914    // Normalize
915    let sum: f64 = kernel.iter().sum();
916    if sum > 0.0 {
917        for val in &mut kernel {
918            *val /= sum;
919        }
920    }
921
922    kernel
923}
924
925/// Safely convert f64 to generic float type
926fn safe_f64_to_float<T: Float>(value: f64) -> NdimageResult<T> {
927    T::from(value).ok_or_else(|| {
928        NdimageError::InvalidInput(format!("Cannot convert {} to target float type", value))
929    })
930}
931
932#[cfg(test)]
933mod tests {
934    use super::*;
935    use scirs2_core::ndarray::array;
936
937    #[test]
938    fn test_gpu_operations_creation() {
939        let result = create_gpu_operations();
940        // This might fail in environments without GPU support, which is expected
941        assert!(result.is_ok() || result.is_err());
942    }
943
944    #[test]
945    fn test_gpu_info_display() {
946        let gpu_info = GpuInfo {
947            cuda_available: false,
948            opencl_available: true,
949            metal_available: false,
950            gpu_memory: 8192,
951            compute_units: 16,
952            preferred_backend: {
953                #[cfg(feature = "opencl")]
954                {
955                    Backend::OpenCL
956                }
957                #[cfg(not(feature = "opencl"))]
958                {
959                    Backend::Cpu
960                }
961            },
962        };
963
964        // Test that display doesn't panic
965        gpu_info.display();
966    }
967
968    #[test]
969    fn test_gpu_convolution_fallback() {
970        // Test small array that should fallback to CPU
971        let input = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];
972
973        let kernel = array![[1.0, 0.0, -1.0], [2.0, 0.0, -2.0], [1.0, 0.0, -1.0]];
974
975        // This should work even without GPU support
976        if let Ok(gpu_ops) = create_gpu_operations() {
977            let result =
978                gpu_ops.gpu_convolution_2d(input.view(), kernel.view(), BoundaryMode::Constant);
979
980            // Should either succeed or fail gracefully
981            assert!(result.is_ok() || result.is_err());
982        }
983    }
984}