Skip to main content

scirs2_ndimage/
gpu_chunked.rs

1//! GPU-accelerated chunked image processing
2//!
3//! This module provides GPU-accelerated versions of chunked image processing
4//! operations. It integrates with the backend module to automatically select
5//! the best available GPU backend (CUDA, OpenCL, Metal).
6//!
7//! # Features
8//!
9//! - **Automatic GPU detection**: Selects the best available GPU backend
10//! - **Hybrid processing**: Falls back to CPU for small chunks
11//! - **Memory management**: Efficient GPU memory allocation and pooling
12//! - **Async transfers**: Overlapping compute and data transfer
13//!
14//! # Example
15//!
16//! ```rust,no_run
17//! use scirs2_ndimage::gpu_chunked::{GpuChunkedProcessor, GpuChunkingConfig};
18//! use scirs2_core::ndarray::Array2;
19//!
20//! let config = GpuChunkingConfig::default();
21//! let processor = GpuChunkedProcessor::new(config).unwrap();
22//!
23//! let large_image = Array2::<f64>::zeros((10000, 10000));
24//!
25//! // Process on GPU if available, with automatic fallback
26//! let result = processor.gaussian_filter(&large_image.view(), 2.0).unwrap();
27//! ```
28
29use std::fmt::Debug;
30use std::marker::PhantomData;
31use std::sync::atomic::{AtomicUsize, Ordering};
32use std::sync::Arc;
33
34use scirs2_core::ndarray::{self as ndarray, Array2, ArrayView2, Axis};
35use scirs2_core::numeric::{Float, FromPrimitive, NumCast, Zero};
36
37use crate::backend::{Backend, DeviceManager};
38use crate::chunked_processing::{
39    ChunkOperation, ChunkRegion, ChunkRegionIterator, ChunkedImageProcessor, ChunkingConfig,
40};
41use crate::error::{NdimageError, NdimageResult};
42use crate::filters::BorderMode;
43use crate::morphology::MorphBorderMode;
44
45// ============================================================================
46// Configuration
47// ============================================================================
48
49/// Configuration for GPU-accelerated chunked processing
50#[derive(Debug, Clone)]
51pub struct GpuChunkingConfig {
52    /// Maximum chunk size in bytes for GPU processing
53    pub max_gpu_chunk_bytes: usize,
54
55    /// Minimum chunk size for GPU processing (smaller uses CPU)
56    pub min_gpu_chunk_elements: usize,
57
58    /// Maximum overlap in pixels
59    pub max_overlap_pixels: usize,
60
61    /// Enable automatic fallback to CPU
62    pub auto_fallback: bool,
63
64    /// Preferred backend (None = auto-detect)
65    pub preferred_backend: Option<Backend>,
66
67    /// GPU memory fraction to use (0.0 - 1.0)
68    pub gpu_memory_fraction: f64,
69
70    /// Number of streams for async processing
71    pub num_streams: usize,
72
73    /// Enable profiling
74    pub enable_profiling: bool,
75}
76
77impl Default for GpuChunkingConfig {
78    fn default() -> Self {
79        Self {
80            max_gpu_chunk_bytes: 256 * 1024 * 1024, // 256 MB per chunk
81            min_gpu_chunk_elements: 256 * 256,      // 64K elements minimum
82            max_overlap_pixels: 64,
83            auto_fallback: true,
84            preferred_backend: None,
85            gpu_memory_fraction: 0.7,
86            num_streams: 2,
87            enable_profiling: false,
88        }
89    }
90}
91
92// ============================================================================
93// GPU Capabilities
94// ============================================================================
95
96/// GPU capabilities for a specific device
97#[derive(Debug, Clone)]
98pub struct GpuCapabilities {
99    /// Backend type
100    pub backend: Backend,
101
102    /// Device name
103    pub device_name: String,
104
105    /// Available memory in bytes
106    pub memory_bytes: usize,
107
108    /// Number of compute units
109    pub compute_units: u32,
110
111    /// Maximum work group size
112    pub max_work_group_size: usize,
113
114    /// Whether the device supports double precision
115    pub supports_double: bool,
116}
117
118impl GpuCapabilities {
119    /// Check if capable of processing an array of given size
120    pub fn can_process(&self, array_bytes: usize, element_size: usize) -> bool {
121        // Need at least 2x array size for input and output
122        let required_memory = array_bytes * 2;
123        required_memory < self.memory_bytes
124    }
125}
126
127// ============================================================================
128// GPU Chunked Processor
129// ============================================================================
130
131/// GPU-accelerated chunked image processor
132pub struct GpuChunkedProcessor {
133    config: GpuChunkingConfig,
134    device_manager: DeviceManager,
135    capabilities: Option<GpuCapabilities>,
136    cpu_fallback: ChunkedImageProcessor,
137    stats: GpuProcessingStats,
138}
139
140/// Statistics about GPU processing
141#[derive(Debug, Default)]
142pub struct GpuProcessingStats {
143    /// Chunks processed on GPU
144    pub gpu_chunks: AtomicUsize,
145
146    /// Chunks processed on CPU (fallback)
147    pub cpu_chunks: AtomicUsize,
148
149    /// Total bytes transferred to GPU
150    pub bytes_to_gpu: AtomicUsize,
151
152    /// Total bytes transferred from GPU
153    pub bytes_from_gpu: AtomicUsize,
154
155    /// Total GPU compute time in milliseconds
156    pub gpu_compute_ms: AtomicUsize,
157}
158
159impl GpuChunkedProcessor {
160    /// Create a new GPU chunked processor
161    pub fn new(config: GpuChunkingConfig) -> NdimageResult<Self> {
162        let device_manager = DeviceManager::new()?;
163        let capabilities = Self::detect_capabilities(&device_manager, &config)?;
164
165        let cpu_config = ChunkingConfig {
166            max_chunk_bytes: config.max_gpu_chunk_bytes,
167            max_overlap_pixels: config.max_overlap_pixels,
168            enable_parallel: true,
169            ..Default::default()
170        };
171
172        Ok(Self {
173            config,
174            device_manager,
175            capabilities,
176            cpu_fallback: ChunkedImageProcessor::new(cpu_config),
177            stats: GpuProcessingStats::default(),
178        })
179    }
180
181    /// Create with default configuration
182    pub fn with_defaults() -> NdimageResult<Self> {
183        Self::new(GpuChunkingConfig::default())
184    }
185
186    /// Detect GPU capabilities
187    fn detect_capabilities(
188        device_manager: &DeviceManager,
189        config: &GpuChunkingConfig,
190    ) -> NdimageResult<Option<GpuCapabilities>> {
191        let dev_caps = device_manager.get_capabilities();
192
193        if !dev_caps.gpu_available {
194            return Ok(None);
195        }
196
197        // Select best backend
198        let backend = if let Some(preferred) = config.preferred_backend {
199            preferred
200        } else if dev_caps.cuda_available {
201            #[cfg(feature = "cuda")]
202            {
203                Backend::Cuda
204            }
205            #[cfg(not(feature = "cuda"))]
206            {
207                return Ok(None);
208            }
209        } else if dev_caps.opencl_available {
210            #[cfg(feature = "opencl")]
211            {
212                Backend::OpenCL
213            }
214            #[cfg(not(feature = "opencl"))]
215            {
216                return Ok(None);
217            }
218        } else if dev_caps.metal_available {
219            #[cfg(all(target_os = "macos", feature = "metal"))]
220            {
221                Backend::Metal
222            }
223            #[cfg(not(all(target_os = "macos", feature = "metal")))]
224            {
225                return Ok(None);
226            }
227        } else {
228            return Ok(None);
229        };
230
231        Ok(Some(GpuCapabilities {
232            backend,
233            device_name: "GPU Device".to_string(),
234            memory_bytes: dev_caps.gpu_memory_mb * 1024 * 1024,
235            compute_units: dev_caps.compute_units,
236            max_work_group_size: 256,
237            supports_double: true,
238        }))
239    }
240
241    /// Check if GPU processing is available
242    pub fn gpu_available(&self) -> bool {
243        self.capabilities.is_some()
244    }
245
246    /// Get GPU capabilities if available
247    pub fn capabilities(&self) -> Option<&GpuCapabilities> {
248        self.capabilities.as_ref()
249    }
250
251    /// Should we use GPU for a given chunk size?
252    fn should_use_gpu(&self, chunk_elements: usize, element_size: usize) -> bool {
253        if !self.gpu_available() {
254            return false;
255        }
256
257        if chunk_elements < self.config.min_gpu_chunk_elements {
258            return false;
259        }
260
261        if let Some(caps) = &self.capabilities {
262            let chunk_bytes = chunk_elements * element_size;
263            caps.can_process(chunk_bytes, element_size)
264        } else {
265            false
266        }
267    }
268
269    /// Process an image with GPU acceleration
270    pub fn process<T, Op>(&self, input: &ArrayView2<T>, operation: &Op) -> NdimageResult<Array2<T>>
271    where
272        T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static,
273        Op: GpuChunkOperation<T>,
274    {
275        let element_size = std::mem::size_of::<T>();
276        let image_shape = (input.nrows(), input.ncols());
277
278        // Calculate chunk size
279        let chunk_size = self.calculate_chunk_size(element_size);
280        let overlap = operation.required_overlap();
281
282        // Create output array
283        let mut output = Array2::zeros(image_shape);
284
285        // Create chunk iterator
286        let chunk_iter = ChunkRegionIterator::new(image_shape, chunk_size, overlap);
287
288        // Process chunks
289        for region in chunk_iter {
290            // Extract chunk
291            let rows = region.padded_start.0..region.padded_end.0;
292            let cols = region.padded_start.1..region.padded_end.1;
293            let chunk = input.slice(ndarray::s![rows, cols]).to_owned();
294
295            // Decide GPU vs CPU
296            let result = if self.should_use_gpu(chunk.len(), element_size)
297                && GpuChunkOperation::supports_gpu(operation)
298            {
299                // GPU processing
300                self.stats.gpu_chunks.fetch_add(1, Ordering::Relaxed);
301                let byte_count = chunk.len() * std::mem::size_of::<T>();
302                self.stats
303                    .bytes_to_gpu
304                    .fetch_add(byte_count, Ordering::Relaxed);
305
306                let gpu_result = GpuChunkOperation::apply_gpu(
307                    operation,
308                    &chunk.view(),
309                    self.capabilities.as_ref(),
310                )?;
311
312                self.stats.bytes_from_gpu.fetch_add(
313                    gpu_result.len() * std::mem::size_of::<T>(),
314                    Ordering::Relaxed,
315                );
316                gpu_result
317            } else {
318                // CPU fallback
319                self.stats.cpu_chunks.fetch_add(1, Ordering::Relaxed);
320                operation.apply(&chunk.view())?
321            };
322
323            // Insert result
324            self.insert_chunk_result(&mut output, &result.view(), &region)?;
325        }
326
327        Ok(output)
328    }
329
330    /// Calculate optimal chunk size for GPU
331    fn calculate_chunk_size(&self, element_size: usize) -> (usize, usize) {
332        let target_elements = self.config.max_gpu_chunk_bytes / element_size;
333        let base_size = ((target_elements as f64).sqrt() as usize).max(32);
334        (base_size, base_size)
335    }
336
337    /// Insert a processed chunk into the output array
338    fn insert_chunk_result<T: Float + Clone>(
339        &self,
340        output: &mut Array2<T>,
341        chunk: &ArrayView2<T>,
342        region: &ChunkRegion,
343    ) -> NdimageResult<()> {
344        let overlap = region.overlap();
345        let core_start_row = overlap.0 .0;
346        let core_start_col = overlap.0 .1;
347        let core_end_row = chunk.nrows() - overlap.1 .0;
348        let core_end_col = chunk.ncols() - overlap.1 .1;
349
350        let core_slice = chunk.slice(ndarray::s![
351            core_start_row..core_end_row,
352            core_start_col..core_end_col
353        ]);
354
355        output
356            .slice_mut(ndarray::s![
357                region.start.0..region.end.0,
358                region.start.1..region.end.1
359            ])
360            .assign(&core_slice);
361
362        Ok(())
363    }
364
365    /// Get processing statistics
366    pub fn stats(&self) -> &GpuProcessingStats {
367        &self.stats
368    }
369
370    /// Reset statistics
371    pub fn reset_stats(&self) {
372        self.stats.gpu_chunks.store(0, Ordering::Relaxed);
373        self.stats.cpu_chunks.store(0, Ordering::Relaxed);
374        self.stats.bytes_to_gpu.store(0, Ordering::Relaxed);
375        self.stats.bytes_from_gpu.store(0, Ordering::Relaxed);
376        self.stats.gpu_compute_ms.store(0, Ordering::Relaxed);
377    }
378
379    // ========================================================================
380    // Convenience methods
381    // ========================================================================
382
383    /// Apply Gaussian filter with GPU acceleration
384    pub fn gaussian_filter<T>(&self, input: &ArrayView2<T>, sigma: f64) -> NdimageResult<Array2<T>>
385    where
386        T: Float
387            + FromPrimitive
388            + Debug
389            + Clone
390            + Send
391            + Sync
392            + Zero
393            + 'static
394            + std::ops::AddAssign
395            + std::ops::DivAssign,
396    {
397        let operation = GpuGaussianFilter::new(sigma, BorderMode::Reflect);
398        self.process(input, &operation)
399    }
400
401    /// Apply uniform filter with GPU acceleration
402    pub fn uniform_filter<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
403    where
404        T: Float
405            + FromPrimitive
406            + Debug
407            + Clone
408            + Send
409            + Sync
410            + Zero
411            + 'static
412            + std::ops::AddAssign
413            + std::ops::DivAssign,
414    {
415        let operation = GpuUniformFilter::new(size, BorderMode::Reflect);
416        self.process(input, &operation)
417    }
418
419    /// Apply morphological erosion with GPU acceleration
420    pub fn grey_erosion<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
421    where
422        T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
423    {
424        let operation = GpuGreyErosion::new(size, MorphBorderMode::Constant);
425        self.process(input, &operation)
426    }
427
428    /// Apply morphological dilation with GPU acceleration
429    pub fn grey_dilation<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
430    where
431        T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
432    {
433        let operation = GpuGreyDilation::new(size, MorphBorderMode::Constant);
434        self.process(input, &operation)
435    }
436}
437
438// ============================================================================
439// GPU Chunk Operation Trait
440// ============================================================================
441
442/// Trait for operations that can be GPU-accelerated
443pub trait GpuChunkOperation<T>: ChunkOperation<T>
444where
445    T: Float + FromPrimitive + Debug + Clone + Send + Sync,
446{
447    /// Whether this operation supports GPU acceleration
448    fn supports_gpu(&self) -> bool {
449        true
450    }
451
452    /// Apply the operation on GPU
453    fn apply_gpu(
454        &self,
455        chunk: &ArrayView2<T>,
456        capabilities: Option<&GpuCapabilities>,
457    ) -> NdimageResult<Array2<T>> {
458        // Default: fall back to CPU implementation
459        self.apply(chunk)
460    }
461
462    /// Estimate GPU memory requirements
463    fn gpu_memory_estimate(&self, input_elements: usize, element_size: usize) -> usize {
464        // Default: input + output + working memory
465        input_elements * element_size * 3
466    }
467}
468
469// ============================================================================
470// GPU-Accelerated Operations
471// ============================================================================
472
473/// GPU-accelerated Gaussian filter
474pub struct GpuGaussianFilter {
475    sigma: f64,
476    border_mode: BorderMode,
477    kernel_radius: usize,
478}
479
480impl GpuGaussianFilter {
481    pub fn new(sigma: f64, border_mode: BorderMode) -> Self {
482        let kernel_radius = (sigma * 4.0).ceil() as usize;
483        Self {
484            sigma,
485            border_mode,
486            kernel_radius,
487        }
488    }
489}
490
491impl<T> ChunkOperation<T> for GpuGaussianFilter
492where
493    T: Float
494        + FromPrimitive
495        + Debug
496        + Clone
497        + Send
498        + Sync
499        + Zero
500        + 'static
501        + std::ops::AddAssign
502        + std::ops::DivAssign,
503{
504    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
505        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
506        let result =
507            crate::filters::gaussian_filter(&chunk_f64, self.sigma, Some(self.border_mode), None)?;
508        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
509    }
510
511    fn required_overlap(&self) -> usize {
512        self.kernel_radius
513    }
514
515    fn name(&self) -> &str {
516        "gpu_gaussian_filter"
517    }
518}
519
520impl<T> GpuChunkOperation<T> for GpuGaussianFilter
521where
522    T: Float
523        + FromPrimitive
524        + Debug
525        + Clone
526        + Send
527        + Sync
528        + Zero
529        + 'static
530        + std::ops::AddAssign
531        + std::ops::DivAssign,
532{
533    fn supports_gpu(&self) -> bool {
534        true
535    }
536
537    fn apply_gpu(
538        &self,
539        chunk: &ArrayView2<T>,
540        capabilities: Option<&GpuCapabilities>,
541    ) -> NdimageResult<Array2<T>> {
542        // In a full implementation, this would use the GPU backend
543        // For now, we use the CPU implementation with potential future GPU extension
544        #[cfg(feature = "gpu")]
545        if let Some(caps) = capabilities {
546            // TODO: Implement actual GPU kernel execution
547            // For now, fall back to CPU
548            return self.apply(chunk);
549        }
550
551        self.apply(chunk)
552    }
553
554    fn gpu_memory_estimate(&self, input_elements: usize, element_size: usize) -> usize {
555        // Gaussian filter needs: input + output + kernel
556        let kernel_size = (self.kernel_radius * 2 + 1).pow(2);
557        (input_elements * 2 + kernel_size) * element_size
558    }
559}
560
561/// GPU-accelerated uniform filter
562pub struct GpuUniformFilter {
563    size: usize,
564    border_mode: BorderMode,
565}
566
567impl GpuUniformFilter {
568    pub fn new(size: usize, border_mode: BorderMode) -> Self {
569        Self { size, border_mode }
570    }
571}
572
573impl<T> ChunkOperation<T> for GpuUniformFilter
574where
575    T: Float
576        + FromPrimitive
577        + Debug
578        + Clone
579        + Send
580        + Sync
581        + Zero
582        + 'static
583        + std::ops::AddAssign
584        + std::ops::DivAssign,
585{
586    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
587        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
588        let result = crate::filters::uniform_filter(
589            &chunk_f64,
590            &[self.size, self.size],
591            Some(self.border_mode),
592            None,
593        )?;
594        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
595    }
596
597    fn required_overlap(&self) -> usize {
598        self.size / 2 + 1
599    }
600
601    fn name(&self) -> &str {
602        "gpu_uniform_filter"
603    }
604}
605
606impl<T> GpuChunkOperation<T> for GpuUniformFilter
607where
608    T: Float
609        + FromPrimitive
610        + Debug
611        + Clone
612        + Send
613        + Sync
614        + Zero
615        + 'static
616        + std::ops::AddAssign
617        + std::ops::DivAssign,
618{
619    fn supports_gpu(&self) -> bool {
620        true
621    }
622}
623
624/// GPU-accelerated grey erosion
625pub struct GpuGreyErosion {
626    size: usize,
627    border_mode: MorphBorderMode,
628}
629
630impl GpuGreyErosion {
631    pub fn new(size: usize, border_mode: MorphBorderMode) -> Self {
632        Self { size, border_mode }
633    }
634}
635
636impl<T> ChunkOperation<T> for GpuGreyErosion
637where
638    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
639{
640    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
641        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
642        let chunk_owned = chunk_f64.to_owned();
643
644        let result = crate::morphology::grey_erosion(
645            &chunk_owned,
646            Some(&[self.size, self.size]),
647            None,
648            Some(self.border_mode),
649            None,
650            None,
651        )?;
652
653        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
654    }
655
656    fn required_overlap(&self) -> usize {
657        self.size / 2 + 1
658    }
659
660    fn name(&self) -> &str {
661        "gpu_grey_erosion"
662    }
663}
664
665impl<T> GpuChunkOperation<T> for GpuGreyErosion
666where
667    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
668{
669    fn supports_gpu(&self) -> bool {
670        true
671    }
672}
673
674/// GPU-accelerated grey dilation
675pub struct GpuGreyDilation {
676    size: usize,
677    border_mode: MorphBorderMode,
678}
679
680impl GpuGreyDilation {
681    pub fn new(size: usize, border_mode: MorphBorderMode) -> Self {
682        Self { size, border_mode }
683    }
684}
685
686impl<T> ChunkOperation<T> for GpuGreyDilation
687where
688    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
689{
690    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
691        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
692        let chunk_owned = chunk_f64.to_owned();
693
694        let result = crate::morphology::grey_dilation(
695            &chunk_owned,
696            Some(&[self.size, self.size]),
697            None,
698            Some(self.border_mode),
699            None,
700            None,
701        )?;
702
703        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
704    }
705
706    fn required_overlap(&self) -> usize {
707        self.size / 2 + 1
708    }
709
710    fn name(&self) -> &str {
711        "gpu_grey_dilation"
712    }
713}
714
715impl<T> GpuChunkOperation<T> for GpuGreyDilation
716where
717    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
718{
719    fn supports_gpu(&self) -> bool {
720        true
721    }
722}
723
724// ============================================================================
725// GPU Memory Pool
726// ============================================================================
727
728/// Simple GPU memory pool for efficient allocation
729#[cfg(feature = "gpu")]
730pub struct GpuMemoryPool {
731    allocations: Vec<GpuAllocation>,
732    total_allocated: usize,
733    max_memory: usize,
734}
735
736#[cfg(feature = "gpu")]
737struct GpuAllocation {
738    ptr: usize,
739    size: usize,
740    in_use: bool,
741}
742
743#[cfg(feature = "gpu")]
744impl GpuMemoryPool {
745    /// Create a new memory pool with given max size
746    pub fn new(max_memory: usize) -> Self {
747        Self {
748            allocations: Vec::new(),
749            total_allocated: 0,
750            max_memory,
751        }
752    }
753
754    /// Allocate memory from the pool
755    pub fn allocate(&mut self, size: usize) -> NdimageResult<usize> {
756        // Look for an existing free allocation of sufficient size
757        for alloc in &mut self.allocations {
758            if !alloc.in_use && alloc.size >= size {
759                alloc.in_use = true;
760                return Ok(alloc.ptr);
761            }
762        }
763
764        // Need new allocation
765        if self.total_allocated + size > self.max_memory {
766            return Err(NdimageError::MemoryError(
767                "GPU memory pool exhausted".into(),
768            ));
769        }
770
771        // In a real implementation, this would call the GPU allocation API
772        let ptr = self.total_allocated;
773        self.allocations.push(GpuAllocation {
774            ptr,
775            size,
776            in_use: true,
777        });
778        self.total_allocated += size;
779
780        Ok(ptr)
781    }
782
783    /// Free memory back to the pool
784    pub fn free(&mut self, ptr: usize) {
785        for alloc in &mut self.allocations {
786            if alloc.ptr == ptr {
787                alloc.in_use = false;
788                return;
789            }
790        }
791    }
792
793    /// Get available memory
794    pub fn available(&self) -> usize {
795        self.max_memory - self.total_allocated
796    }
797}
798
799// ============================================================================
800// Tests
801// ============================================================================
802
803#[cfg(test)]
804mod tests {
805    use super::*;
806    use scirs2_core::ndarray::Array2;
807
808    #[test]
809    fn test_gpu_config_default() {
810        let config = GpuChunkingConfig::default();
811        assert!(config.max_gpu_chunk_bytes > 0);
812        assert!(config.auto_fallback);
813    }
814
815    #[test]
816    fn test_gpu_processor_creation() {
817        let result = GpuChunkedProcessor::with_defaults();
818        // May or may not have GPU available
819        assert!(result.is_ok());
820    }
821
822    #[test]
823    fn test_gpu_gaussian_filter() {
824        if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
825            let input = Array2::<f64>::ones((50, 50));
826            let result = processor.gaussian_filter(&input.view(), 1.0);
827
828            assert!(result.is_ok());
829            let output = result.expect("Should succeed");
830            assert_eq!(output.shape(), input.shape());
831        }
832    }
833
834    #[test]
835    fn test_gpu_uniform_filter() {
836        if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
837            let input = Array2::<f64>::ones((50, 50));
838            let result = processor.uniform_filter(&input.view(), 3);
839
840            assert!(result.is_ok());
841            let output = result.expect("Should succeed");
842            assert_eq!(output.shape(), input.shape());
843        }
844    }
845
846    #[test]
847    fn test_gpu_morphology() {
848        if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
849            let input = Array2::<f64>::ones((50, 50));
850
851            let eroded = processor.grey_erosion(&input.view(), 3);
852            assert!(eroded.is_ok());
853
854            let dilated = processor.grey_dilation(&input.view(), 3);
855            assert!(dilated.is_ok());
856        }
857    }
858
859    #[test]
860    fn test_gpu_stats() {
861        if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
862            processor.reset_stats();
863
864            let input = Array2::<f64>::ones((100, 100));
865            let _ = processor.gaussian_filter(&input.view(), 1.0);
866
867            let stats = processor.stats();
868            let total =
869                stats.gpu_chunks.load(Ordering::Relaxed) + stats.cpu_chunks.load(Ordering::Relaxed);
870            assert!(total > 0);
871        }
872    }
873
874    #[test]
875    fn test_gpu_capabilities() {
876        if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
877            if let Some(caps) = processor.capabilities() {
878                assert!(caps.memory_bytes > 0);
879                assert!(caps.compute_units > 0);
880            }
881        }
882    }
883
884    #[test]
885    fn test_gpu_operation_trait() {
886        let op: GpuGaussianFilter = GpuGaussianFilter::new(1.0, BorderMode::Reflect);
887        assert!(GpuChunkOperation::<f64>::supports_gpu(&op));
888        assert!(ChunkOperation::<f64>::required_overlap(&op) > 0);
889        assert_eq!(ChunkOperation::<f64>::name(&op), "gpu_gaussian_filter");
890    }
891
892    #[cfg(feature = "gpu")]
893    #[test]
894    fn test_gpu_memory_pool() {
895        let mut pool = GpuMemoryPool::new(1024 * 1024);
896
897        let alloc1 = pool.allocate(1024);
898        assert!(alloc1.is_ok());
899
900        let alloc2 = pool.allocate(2048);
901        assert!(alloc2.is_ok());
902
903        pool.free(alloc1.expect("Should have ptr"));
904        assert!(pool.available() < 1024 * 1024);
905    }
906}