1use std::fmt::Debug;
30use std::marker::PhantomData;
31use std::sync::atomic::{AtomicUsize, Ordering};
32use std::sync::Arc;
33
34use scirs2_core::ndarray::{self as ndarray, Array2, ArrayView2, Axis};
35use scirs2_core::numeric::{Float, FromPrimitive, NumCast, Zero};
36
37use crate::backend::{Backend, DeviceManager};
38use crate::chunked_processing::{
39 ChunkOperation, ChunkRegion, ChunkRegionIterator, ChunkedImageProcessor, ChunkingConfig,
40};
41use crate::error::{NdimageError, NdimageResult};
42use crate::filters::BorderMode;
43use crate::morphology::MorphBorderMode;
44
45#[derive(Debug, Clone)]
51pub struct GpuChunkingConfig {
52 pub max_gpu_chunk_bytes: usize,
54
55 pub min_gpu_chunk_elements: usize,
57
58 pub max_overlap_pixels: usize,
60
61 pub auto_fallback: bool,
63
64 pub preferred_backend: Option<Backend>,
66
67 pub gpu_memory_fraction: f64,
69
70 pub num_streams: usize,
72
73 pub enable_profiling: bool,
75}
76
77impl Default for GpuChunkingConfig {
78 fn default() -> Self {
79 Self {
80 max_gpu_chunk_bytes: 256 * 1024 * 1024, min_gpu_chunk_elements: 256 * 256, max_overlap_pixels: 64,
83 auto_fallback: true,
84 preferred_backend: None,
85 gpu_memory_fraction: 0.7,
86 num_streams: 2,
87 enable_profiling: false,
88 }
89 }
90}
91
92#[derive(Debug, Clone)]
98pub struct GpuCapabilities {
99 pub backend: Backend,
101
102 pub device_name: String,
104
105 pub memory_bytes: usize,
107
108 pub compute_units: u32,
110
111 pub max_work_group_size: usize,
113
114 pub supports_double: bool,
116}
117
118impl GpuCapabilities {
119 pub fn can_process(&self, array_bytes: usize, element_size: usize) -> bool {
121 let required_memory = array_bytes * 2;
123 required_memory < self.memory_bytes
124 }
125}
126
127pub struct GpuChunkedProcessor {
133 config: GpuChunkingConfig,
134 device_manager: DeviceManager,
135 capabilities: Option<GpuCapabilities>,
136 cpu_fallback: ChunkedImageProcessor,
137 stats: GpuProcessingStats,
138}
139
140#[derive(Debug, Default)]
142pub struct GpuProcessingStats {
143 pub gpu_chunks: AtomicUsize,
145
146 pub cpu_chunks: AtomicUsize,
148
149 pub bytes_to_gpu: AtomicUsize,
151
152 pub bytes_from_gpu: AtomicUsize,
154
155 pub gpu_compute_ms: AtomicUsize,
157}
158
159impl GpuChunkedProcessor {
160 pub fn new(config: GpuChunkingConfig) -> NdimageResult<Self> {
162 let device_manager = DeviceManager::new()?;
163 let capabilities = Self::detect_capabilities(&device_manager, &config)?;
164
165 let cpu_config = ChunkingConfig {
166 max_chunk_bytes: config.max_gpu_chunk_bytes,
167 max_overlap_pixels: config.max_overlap_pixels,
168 enable_parallel: true,
169 ..Default::default()
170 };
171
172 Ok(Self {
173 config,
174 device_manager,
175 capabilities,
176 cpu_fallback: ChunkedImageProcessor::new(cpu_config),
177 stats: GpuProcessingStats::default(),
178 })
179 }
180
181 pub fn with_defaults() -> NdimageResult<Self> {
183 Self::new(GpuChunkingConfig::default())
184 }
185
186 fn detect_capabilities(
188 device_manager: &DeviceManager,
189 config: &GpuChunkingConfig,
190 ) -> NdimageResult<Option<GpuCapabilities>> {
191 let dev_caps = device_manager.get_capabilities();
192
193 if !dev_caps.gpu_available {
194 return Ok(None);
195 }
196
197 let backend = if let Some(preferred) = config.preferred_backend {
199 preferred
200 } else if dev_caps.cuda_available {
201 #[cfg(feature = "cuda")]
202 {
203 Backend::Cuda
204 }
205 #[cfg(not(feature = "cuda"))]
206 {
207 return Ok(None);
208 }
209 } else if dev_caps.opencl_available {
210 #[cfg(feature = "opencl")]
211 {
212 Backend::OpenCL
213 }
214 #[cfg(not(feature = "opencl"))]
215 {
216 return Ok(None);
217 }
218 } else if dev_caps.metal_available {
219 #[cfg(all(target_os = "macos", feature = "metal"))]
220 {
221 Backend::Metal
222 }
223 #[cfg(not(all(target_os = "macos", feature = "metal")))]
224 {
225 return Ok(None);
226 }
227 } else {
228 return Ok(None);
229 };
230
231 Ok(Some(GpuCapabilities {
232 backend,
233 device_name: "GPU Device".to_string(),
234 memory_bytes: dev_caps.gpu_memory_mb * 1024 * 1024,
235 compute_units: dev_caps.compute_units,
236 max_work_group_size: 256,
237 supports_double: true,
238 }))
239 }
240
241 pub fn gpu_available(&self) -> bool {
243 self.capabilities.is_some()
244 }
245
246 pub fn capabilities(&self) -> Option<&GpuCapabilities> {
248 self.capabilities.as_ref()
249 }
250
251 fn should_use_gpu(&self, chunk_elements: usize, element_size: usize) -> bool {
253 if !self.gpu_available() {
254 return false;
255 }
256
257 if chunk_elements < self.config.min_gpu_chunk_elements {
258 return false;
259 }
260
261 if let Some(caps) = &self.capabilities {
262 let chunk_bytes = chunk_elements * element_size;
263 caps.can_process(chunk_bytes, element_size)
264 } else {
265 false
266 }
267 }
268
269 pub fn process<T, Op>(&self, input: &ArrayView2<T>, operation: &Op) -> NdimageResult<Array2<T>>
271 where
272 T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static,
273 Op: GpuChunkOperation<T>,
274 {
275 let element_size = std::mem::size_of::<T>();
276 let image_shape = (input.nrows(), input.ncols());
277
278 let chunk_size = self.calculate_chunk_size(element_size);
280 let overlap = operation.required_overlap();
281
282 let mut output = Array2::zeros(image_shape);
284
285 let chunk_iter = ChunkRegionIterator::new(image_shape, chunk_size, overlap);
287
288 for region in chunk_iter {
290 let rows = region.padded_start.0..region.padded_end.0;
292 let cols = region.padded_start.1..region.padded_end.1;
293 let chunk = input.slice(ndarray::s![rows, cols]).to_owned();
294
295 let result = if self.should_use_gpu(chunk.len(), element_size)
297 && GpuChunkOperation::supports_gpu(operation)
298 {
299 self.stats.gpu_chunks.fetch_add(1, Ordering::Relaxed);
301 let byte_count = chunk.len() * std::mem::size_of::<T>();
302 self.stats
303 .bytes_to_gpu
304 .fetch_add(byte_count, Ordering::Relaxed);
305
306 let gpu_result = GpuChunkOperation::apply_gpu(
307 operation,
308 &chunk.view(),
309 self.capabilities.as_ref(),
310 )?;
311
312 self.stats.bytes_from_gpu.fetch_add(
313 gpu_result.len() * std::mem::size_of::<T>(),
314 Ordering::Relaxed,
315 );
316 gpu_result
317 } else {
318 self.stats.cpu_chunks.fetch_add(1, Ordering::Relaxed);
320 operation.apply(&chunk.view())?
321 };
322
323 self.insert_chunk_result(&mut output, &result.view(), ®ion)?;
325 }
326
327 Ok(output)
328 }
329
330 fn calculate_chunk_size(&self, element_size: usize) -> (usize, usize) {
332 let target_elements = self.config.max_gpu_chunk_bytes / element_size;
333 let base_size = ((target_elements as f64).sqrt() as usize).max(32);
334 (base_size, base_size)
335 }
336
337 fn insert_chunk_result<T: Float + Clone>(
339 &self,
340 output: &mut Array2<T>,
341 chunk: &ArrayView2<T>,
342 region: &ChunkRegion,
343 ) -> NdimageResult<()> {
344 let overlap = region.overlap();
345 let core_start_row = overlap.0 .0;
346 let core_start_col = overlap.0 .1;
347 let core_end_row = chunk.nrows() - overlap.1 .0;
348 let core_end_col = chunk.ncols() - overlap.1 .1;
349
350 let core_slice = chunk.slice(ndarray::s![
351 core_start_row..core_end_row,
352 core_start_col..core_end_col
353 ]);
354
355 output
356 .slice_mut(ndarray::s![
357 region.start.0..region.end.0,
358 region.start.1..region.end.1
359 ])
360 .assign(&core_slice);
361
362 Ok(())
363 }
364
365 pub fn stats(&self) -> &GpuProcessingStats {
367 &self.stats
368 }
369
370 pub fn reset_stats(&self) {
372 self.stats.gpu_chunks.store(0, Ordering::Relaxed);
373 self.stats.cpu_chunks.store(0, Ordering::Relaxed);
374 self.stats.bytes_to_gpu.store(0, Ordering::Relaxed);
375 self.stats.bytes_from_gpu.store(0, Ordering::Relaxed);
376 self.stats.gpu_compute_ms.store(0, Ordering::Relaxed);
377 }
378
379 pub fn gaussian_filter<T>(&self, input: &ArrayView2<T>, sigma: f64) -> NdimageResult<Array2<T>>
385 where
386 T: Float
387 + FromPrimitive
388 + Debug
389 + Clone
390 + Send
391 + Sync
392 + Zero
393 + 'static
394 + std::ops::AddAssign
395 + std::ops::DivAssign,
396 {
397 let operation = GpuGaussianFilter::new(sigma, BorderMode::Reflect);
398 self.process(input, &operation)
399 }
400
401 pub fn uniform_filter<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
403 where
404 T: Float
405 + FromPrimitive
406 + Debug
407 + Clone
408 + Send
409 + Sync
410 + Zero
411 + 'static
412 + std::ops::AddAssign
413 + std::ops::DivAssign,
414 {
415 let operation = GpuUniformFilter::new(size, BorderMode::Reflect);
416 self.process(input, &operation)
417 }
418
419 pub fn grey_erosion<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
421 where
422 T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
423 {
424 let operation = GpuGreyErosion::new(size, MorphBorderMode::Constant);
425 self.process(input, &operation)
426 }
427
428 pub fn grey_dilation<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
430 where
431 T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
432 {
433 let operation = GpuGreyDilation::new(size, MorphBorderMode::Constant);
434 self.process(input, &operation)
435 }
436}
437
438pub trait GpuChunkOperation<T>: ChunkOperation<T>
444where
445 T: Float + FromPrimitive + Debug + Clone + Send + Sync,
446{
447 fn supports_gpu(&self) -> bool {
449 true
450 }
451
452 fn apply_gpu(
454 &self,
455 chunk: &ArrayView2<T>,
456 capabilities: Option<&GpuCapabilities>,
457 ) -> NdimageResult<Array2<T>> {
458 self.apply(chunk)
460 }
461
462 fn gpu_memory_estimate(&self, input_elements: usize, element_size: usize) -> usize {
464 input_elements * element_size * 3
466 }
467}
468
469pub struct GpuGaussianFilter {
475 sigma: f64,
476 border_mode: BorderMode,
477 kernel_radius: usize,
478}
479
480impl GpuGaussianFilter {
481 pub fn new(sigma: f64, border_mode: BorderMode) -> Self {
482 let kernel_radius = (sigma * 4.0).ceil() as usize;
483 Self {
484 sigma,
485 border_mode,
486 kernel_radius,
487 }
488 }
489}
490
491impl<T> ChunkOperation<T> for GpuGaussianFilter
492where
493 T: Float
494 + FromPrimitive
495 + Debug
496 + Clone
497 + Send
498 + Sync
499 + Zero
500 + 'static
501 + std::ops::AddAssign
502 + std::ops::DivAssign,
503{
504 fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
505 let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
506 let result =
507 crate::filters::gaussian_filter(&chunk_f64, self.sigma, Some(self.border_mode), None)?;
508 Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
509 }
510
511 fn required_overlap(&self) -> usize {
512 self.kernel_radius
513 }
514
515 fn name(&self) -> &str {
516 "gpu_gaussian_filter"
517 }
518}
519
520impl<T> GpuChunkOperation<T> for GpuGaussianFilter
521where
522 T: Float
523 + FromPrimitive
524 + Debug
525 + Clone
526 + Send
527 + Sync
528 + Zero
529 + 'static
530 + std::ops::AddAssign
531 + std::ops::DivAssign,
532{
533 fn supports_gpu(&self) -> bool {
534 true
535 }
536
537 fn apply_gpu(
538 &self,
539 chunk: &ArrayView2<T>,
540 capabilities: Option<&GpuCapabilities>,
541 ) -> NdimageResult<Array2<T>> {
542 #[cfg(feature = "gpu")]
545 if let Some(caps) = capabilities {
546 return self.apply(chunk);
549 }
550
551 self.apply(chunk)
552 }
553
554 fn gpu_memory_estimate(&self, input_elements: usize, element_size: usize) -> usize {
555 let kernel_size = (self.kernel_radius * 2 + 1).pow(2);
557 (input_elements * 2 + kernel_size) * element_size
558 }
559}
560
561pub struct GpuUniformFilter {
563 size: usize,
564 border_mode: BorderMode,
565}
566
567impl GpuUniformFilter {
568 pub fn new(size: usize, border_mode: BorderMode) -> Self {
569 Self { size, border_mode }
570 }
571}
572
573impl<T> ChunkOperation<T> for GpuUniformFilter
574where
575 T: Float
576 + FromPrimitive
577 + Debug
578 + Clone
579 + Send
580 + Sync
581 + Zero
582 + 'static
583 + std::ops::AddAssign
584 + std::ops::DivAssign,
585{
586 fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
587 let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
588 let result = crate::filters::uniform_filter(
589 &chunk_f64,
590 &[self.size, self.size],
591 Some(self.border_mode),
592 None,
593 )?;
594 Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
595 }
596
597 fn required_overlap(&self) -> usize {
598 self.size / 2 + 1
599 }
600
601 fn name(&self) -> &str {
602 "gpu_uniform_filter"
603 }
604}
605
606impl<T> GpuChunkOperation<T> for GpuUniformFilter
607where
608 T: Float
609 + FromPrimitive
610 + Debug
611 + Clone
612 + Send
613 + Sync
614 + Zero
615 + 'static
616 + std::ops::AddAssign
617 + std::ops::DivAssign,
618{
619 fn supports_gpu(&self) -> bool {
620 true
621 }
622}
623
624pub struct GpuGreyErosion {
626 size: usize,
627 border_mode: MorphBorderMode,
628}
629
630impl GpuGreyErosion {
631 pub fn new(size: usize, border_mode: MorphBorderMode) -> Self {
632 Self { size, border_mode }
633 }
634}
635
636impl<T> ChunkOperation<T> for GpuGreyErosion
637where
638 T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
639{
640 fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
641 let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
642 let chunk_owned = chunk_f64.to_owned();
643
644 let result = crate::morphology::grey_erosion(
645 &chunk_owned,
646 Some(&[self.size, self.size]),
647 None,
648 Some(self.border_mode),
649 None,
650 None,
651 )?;
652
653 Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
654 }
655
656 fn required_overlap(&self) -> usize {
657 self.size / 2 + 1
658 }
659
660 fn name(&self) -> &str {
661 "gpu_grey_erosion"
662 }
663}
664
665impl<T> GpuChunkOperation<T> for GpuGreyErosion
666where
667 T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
668{
669 fn supports_gpu(&self) -> bool {
670 true
671 }
672}
673
674pub struct GpuGreyDilation {
676 size: usize,
677 border_mode: MorphBorderMode,
678}
679
680impl GpuGreyDilation {
681 pub fn new(size: usize, border_mode: MorphBorderMode) -> Self {
682 Self { size, border_mode }
683 }
684}
685
686impl<T> ChunkOperation<T> for GpuGreyDilation
687where
688 T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
689{
690 fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
691 let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
692 let chunk_owned = chunk_f64.to_owned();
693
694 let result = crate::morphology::grey_dilation(
695 &chunk_owned,
696 Some(&[self.size, self.size]),
697 None,
698 Some(self.border_mode),
699 None,
700 None,
701 )?;
702
703 Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
704 }
705
706 fn required_overlap(&self) -> usize {
707 self.size / 2 + 1
708 }
709
710 fn name(&self) -> &str {
711 "gpu_grey_dilation"
712 }
713}
714
715impl<T> GpuChunkOperation<T> for GpuGreyDilation
716where
717 T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
718{
719 fn supports_gpu(&self) -> bool {
720 true
721 }
722}
723
724#[cfg(feature = "gpu")]
730pub struct GpuMemoryPool {
731 allocations: Vec<GpuAllocation>,
732 total_allocated: usize,
733 max_memory: usize,
734}
735
736#[cfg(feature = "gpu")]
737struct GpuAllocation {
738 ptr: usize,
739 size: usize,
740 in_use: bool,
741}
742
743#[cfg(feature = "gpu")]
744impl GpuMemoryPool {
745 pub fn new(max_memory: usize) -> Self {
747 Self {
748 allocations: Vec::new(),
749 total_allocated: 0,
750 max_memory,
751 }
752 }
753
754 pub fn allocate(&mut self, size: usize) -> NdimageResult<usize> {
756 for alloc in &mut self.allocations {
758 if !alloc.in_use && alloc.size >= size {
759 alloc.in_use = true;
760 return Ok(alloc.ptr);
761 }
762 }
763
764 if self.total_allocated + size > self.max_memory {
766 return Err(NdimageError::MemoryError(
767 "GPU memory pool exhausted".into(),
768 ));
769 }
770
771 let ptr = self.total_allocated;
773 self.allocations.push(GpuAllocation {
774 ptr,
775 size,
776 in_use: true,
777 });
778 self.total_allocated += size;
779
780 Ok(ptr)
781 }
782
783 pub fn free(&mut self, ptr: usize) {
785 for alloc in &mut self.allocations {
786 if alloc.ptr == ptr {
787 alloc.in_use = false;
788 return;
789 }
790 }
791 }
792
793 pub fn available(&self) -> usize {
795 self.max_memory - self.total_allocated
796 }
797}
798
799#[cfg(test)]
804mod tests {
805 use super::*;
806 use scirs2_core::ndarray::Array2;
807
808 #[test]
809 fn test_gpu_config_default() {
810 let config = GpuChunkingConfig::default();
811 assert!(config.max_gpu_chunk_bytes > 0);
812 assert!(config.auto_fallback);
813 }
814
815 #[test]
816 fn test_gpu_processor_creation() {
817 let result = GpuChunkedProcessor::with_defaults();
818 assert!(result.is_ok());
820 }
821
822 #[test]
823 fn test_gpu_gaussian_filter() {
824 if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
825 let input = Array2::<f64>::ones((50, 50));
826 let result = processor.gaussian_filter(&input.view(), 1.0);
827
828 assert!(result.is_ok());
829 let output = result.expect("Should succeed");
830 assert_eq!(output.shape(), input.shape());
831 }
832 }
833
834 #[test]
835 fn test_gpu_uniform_filter() {
836 if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
837 let input = Array2::<f64>::ones((50, 50));
838 let result = processor.uniform_filter(&input.view(), 3);
839
840 assert!(result.is_ok());
841 let output = result.expect("Should succeed");
842 assert_eq!(output.shape(), input.shape());
843 }
844 }
845
846 #[test]
847 fn test_gpu_morphology() {
848 if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
849 let input = Array2::<f64>::ones((50, 50));
850
851 let eroded = processor.grey_erosion(&input.view(), 3);
852 assert!(eroded.is_ok());
853
854 let dilated = processor.grey_dilation(&input.view(), 3);
855 assert!(dilated.is_ok());
856 }
857 }
858
859 #[test]
860 fn test_gpu_stats() {
861 if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
862 processor.reset_stats();
863
864 let input = Array2::<f64>::ones((100, 100));
865 let _ = processor.gaussian_filter(&input.view(), 1.0);
866
867 let stats = processor.stats();
868 let total =
869 stats.gpu_chunks.load(Ordering::Relaxed) + stats.cpu_chunks.load(Ordering::Relaxed);
870 assert!(total > 0);
871 }
872 }
873
874 #[test]
875 fn test_gpu_capabilities() {
876 if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
877 if let Some(caps) = processor.capabilities() {
878 assert!(caps.memory_bytes > 0);
879 assert!(caps.compute_units > 0);
880 }
881 }
882 }
883
884 #[test]
885 fn test_gpu_operation_trait() {
886 let op: GpuGaussianFilter = GpuGaussianFilter::new(1.0, BorderMode::Reflect);
887 assert!(GpuChunkOperation::<f64>::supports_gpu(&op));
888 assert!(ChunkOperation::<f64>::required_overlap(&op) > 0);
889 assert_eq!(ChunkOperation::<f64>::name(&op), "gpu_gaussian_filter");
890 }
891
892 #[cfg(feature = "gpu")]
893 #[test]
894 fn test_gpu_memory_pool() {
895 let mut pool = GpuMemoryPool::new(1024 * 1024);
896
897 let alloc1 = pool.allocate(1024);
898 assert!(alloc1.is_ok());
899
900 let alloc2 = pool.allocate(2048);
901 assert!(alloc2.is_ok());
902
903 pool.free(alloc1.expect("Should have ptr"));
904 assert!(pool.available() < 1024 * 1024);
905 }
906}