use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use scirs2_core::ndarray::{self as ndarray, Array2, ArrayView2, Axis};
use scirs2_core::numeric::{Float, FromPrimitive, NumCast, Zero};
use crate::backend::{Backend, DeviceManager};
use crate::chunked_processing::{
ChunkOperation, ChunkRegion, ChunkRegionIterator, ChunkedImageProcessor, ChunkingConfig,
};
use crate::error::{NdimageError, NdimageResult};
use crate::filters::BorderMode;
use crate::morphology::MorphBorderMode;
/// Tuning knobs for [`GpuChunkedProcessor`].
#[derive(Debug, Clone)]
pub struct GpuChunkingConfig {
/// Byte budget used to size chunks; also forwarded to the CPU fallback as `max_chunk_bytes`.
pub max_gpu_chunk_bytes: usize,
/// Chunks with fewer elements than this are never sent down the GPU path.
pub min_gpu_chunk_elements: usize,
/// Forwarded to the CPU fallback as `max_overlap_pixels`.
pub max_overlap_pixels: usize,
/// Presumably enables automatic fallback to CPU processing; TODO confirm where it is honoured.
pub auto_fallback: bool,
/// Backend to use when a GPU is reported available; `None` lets detection choose one.
pub preferred_backend: Option<Backend>,
/// Presumably the fraction of GPU memory the processor may use; TODO confirm.
pub gpu_memory_fraction: f64,
/// Presumably the number of concurrent GPU streams; TODO confirm.
pub num_streams: usize,
/// Presumably toggles profiling; TODO confirm.
pub enable_profiling: bool,
}
impl Default for GpuChunkingConfig {
fn default() -> Self {
Self {
max_gpu_chunk_bytes: 256 * 1024 * 1024, min_gpu_chunk_elements: 256 * 256, max_overlap_pixels: 64,
auto_fallback: true,
preferred_backend: None,
gpu_memory_fraction: 0.7,
num_streams: 2,
enable_profiling: false,
}
}
}
/// Description of the GPU selected by capability detection.
#[derive(Debug, Clone)]
pub struct GpuCapabilities {
/// Backend chosen for this device.
pub backend: Backend,
/// Device label; detection in this file fills in a fixed placeholder string.
pub device_name: String,
/// Device memory in bytes; `can_process` compares its working-set estimate against this.
pub memory_bytes: usize,
/// Compute-unit count copied from the device manager's report.
pub compute_units: u32,
/// Work-group size limit; detection in this file sets a fixed value rather than querying it.
pub max_work_group_size: usize,
/// Whether double precision is supported; detection in this file always sets `true`.
pub supports_double: bool,
}
impl GpuCapabilities {
    /// Reports whether a buffer of `array_bytes` bytes fits on this device.
    ///
    /// The working set is budgeted at twice the buffer size (presumably input plus
    /// output) and must be strictly smaller than `memory_bytes`. A request so large
    /// that doubling it overflows `usize` is reported as not fitting instead of
    /// panicking (debug) or wrapping around to a small number (release).
    ///
    /// `element_size` is accepted for interface compatibility and does not affect
    /// the result.
    pub fn can_process(&self, array_bytes: usize, element_size: usize) -> bool {
        let _ = element_size;
        array_bytes
            .checked_mul(2)
            .map_or(false, |required_memory| required_memory < self.memory_bytes)
    }
}
/// Processes 2-D images chunk by chunk, routing each chunk to the GPU path or the CPU path.
pub struct GpuChunkedProcessor {
/// Settings supplied at construction.
config: GpuChunkingConfig,
/// Device manager queried during construction for capabilities.
device_manager: DeviceManager,
/// Detected GPU description; `None` when detection found no usable GPU.
capabilities: Option<GpuCapabilities>,
/// CPU chunked processor built from the same byte and overlap limits.
cpu_fallback: ChunkedImageProcessor,
/// Counters updated while processing.
stats: GpuProcessingStats,
}
/// Atomic counters describing how chunks were processed.
#[derive(Debug, Default)]
pub struct GpuProcessingStats {
/// Number of chunks sent down the GPU path.
pub gpu_chunks: AtomicUsize,
/// Number of chunks processed on the CPU path.
pub cpu_chunks: AtomicUsize,
/// Bytes of chunk data handed to the GPU path.
pub bytes_to_gpu: AtomicUsize,
/// Bytes of result data returned by the GPU path.
pub bytes_from_gpu: AtomicUsize,
/// Presumably accumulated GPU compute time in milliseconds; this file only resets it — TODO confirm who updates it.
pub gpu_compute_ms: AtomicUsize,
}
impl GpuChunkedProcessor {
/// Builds a processor from `config`.
///
/// A device manager is created, GPU capabilities are detected once, and a CPU
/// chunked processor is prepared with the same byte budget and overlap limit
/// (with parallelism enabled).
///
/// # Errors
/// Propagates failures from `DeviceManager::new` and from capability detection.
pub fn new(config: GpuChunkingConfig) -> NdimageResult<Self> {
    let device_manager = DeviceManager::new()?;
    let capabilities = Self::detect_capabilities(&device_manager, &config)?;
    let cpu_fallback = ChunkedImageProcessor::new(ChunkingConfig {
        max_chunk_bytes: config.max_gpu_chunk_bytes,
        max_overlap_pixels: config.max_overlap_pixels,
        enable_parallel: true,
        ..Default::default()
    });
    let stats = GpuProcessingStats::default();
    Ok(Self {
        config,
        device_manager,
        capabilities,
        cpu_fallback,
        stats,
    })
}
/// Builds a processor using `GpuChunkingConfig::default()`.
///
/// # Errors
/// Same as [`GpuChunkedProcessor::new`].
pub fn with_defaults() -> NdimageResult<Self> {
    let config = GpuChunkingConfig::default();
    Self::new(config)
}
/// Works out which GPU backend, if any, the processor should use.
///
/// Returns `Ok(None)` when the device manager reports no GPU, or when the only
/// available backend was not compiled in (the matching Cargo feature is off).
/// A `preferred_backend` from `config` is taken as-is once any GPU is reported;
/// otherwise CUDA, then OpenCL, then Metal are tried in that order.
///
/// `device_name`, `max_work_group_size` and `supports_double` are filled with
/// fixed placeholder values here rather than queried from the device.
fn detect_capabilities(
    device_manager: &DeviceManager,
    config: &GpuChunkingConfig,
) -> NdimageResult<Option<GpuCapabilities>> {
    const BYTES_PER_MIB: usize = 1024 * 1024;
    let dev_caps = device_manager.get_capabilities();
    if !dev_caps.gpu_available {
        return Ok(None);
    }
    let backend = if let Some(preferred) = config.preferred_backend {
        preferred
    } else if dev_caps.cuda_available {
        #[cfg(feature = "cuda")]
        {
            Backend::Cuda
        }
        #[cfg(not(feature = "cuda"))]
        {
            return Ok(None);
        }
    } else if dev_caps.opencl_available {
        #[cfg(feature = "opencl")]
        {
            Backend::OpenCL
        }
        #[cfg(not(feature = "opencl"))]
        {
            return Ok(None);
        }
    } else if dev_caps.metal_available {
        #[cfg(all(target_os = "macos", feature = "metal"))]
        {
            Backend::Metal
        }
        #[cfg(not(all(target_os = "macos", feature = "metal")))]
        {
            return Ok(None);
        }
    } else {
        return Ok(None);
    };
    Ok(Some(GpuCapabilities {
        backend,
        device_name: "GPU Device".to_string(),
        // Saturate instead of overflowing `usize` on targets where the byte count
        // of a large device does not fit (e.g. 4 GiB on a 32-bit target).
        memory_bytes: dev_caps.gpu_memory_mb.saturating_mul(BYTES_PER_MIB),
        compute_units: dev_caps.compute_units,
        max_work_group_size: 256,
        supports_double: true,
    }))
}
/// Returns `true` when capability detection found a usable GPU.
pub fn gpu_available(&self) -> bool {
    self.capabilities().is_some()
}
/// Returns the detected GPU description, or `None` when no usable GPU was detected.
pub fn capabilities(&self) -> Option<&GpuCapabilities> {
self.capabilities.as_ref()
}
/// Decides whether a chunk of `chunk_elements` elements should take the GPU path.
///
/// The answer is `false` when no GPU was detected or when the chunk is smaller
/// than `min_gpu_chunk_elements`; otherwise it is whatever the device's
/// `can_process` reports for the chunk's byte size.
fn should_use_gpu(&self, chunk_elements: usize, element_size: usize) -> bool {
    match &self.capabilities {
        Some(caps) if chunk_elements >= self.config.min_gpu_chunk_elements => {
            caps.can_process(chunk_elements * element_size, element_size)
        }
        _ => false,
    }
}
/// Applies `operation` to `input` chunk by chunk and stitches the results together.
///
/// The image is split into regions padded by `operation.required_overlap()`.
/// Each padded chunk is copied out, processed on the GPU path when the chunk is
/// eligible and the operation supports it, otherwise on the CPU path, and the
/// un-padded core of the result is written into the output image.
///
/// When `config.auto_fallback` is set, a chunk whose GPU path returns an error
/// is retried on the CPU path instead of aborting the whole image. Without the
/// flag the GPU error is returned to the caller.
///
/// Statistics: `bytes_to_gpu` counts every chunk handed to the GPU path;
/// `gpu_chunks` and `bytes_from_gpu` count only chunks the GPU path completed;
/// `cpu_chunks` counts chunks processed on the CPU, including fallbacks.
///
/// # Errors
/// Returns the operation's error if CPU processing fails, the GPU error when
/// fallback is disabled, or any error from inserting a chunk result.
pub fn process<T, Op>(&self, input: &ArrayView2<T>, operation: &Op) -> NdimageResult<Array2<T>>
where
    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static,
    Op: GpuChunkOperation<T>,
{
    let element_size = std::mem::size_of::<T>();
    let image_shape = (input.nrows(), input.ncols());
    let chunk_size = self.calculate_chunk_size(element_size);
    let overlap = operation.required_overlap();
    let mut output = Array2::zeros(image_shape);
    for region in ChunkRegionIterator::new(image_shape, chunk_size, overlap) {
        let rows = region.padded_start.0..region.padded_end.0;
        let cols = region.padded_start.1..region.padded_end.1;
        let chunk = input.slice(ndarray::s![rows, cols]).to_owned();

        let use_gpu = self.should_use_gpu(chunk.len(), element_size)
            && GpuChunkOperation::supports_gpu(operation);

        // `None` means "process this chunk on the CPU": either the GPU path was
        // not eligible, or it failed and automatic fallback is enabled.
        let gpu_result = if use_gpu {
            self.stats
                .bytes_to_gpu
                .fetch_add(chunk.len() * element_size, Ordering::Relaxed);
            match GpuChunkOperation::apply_gpu(
                operation,
                &chunk.view(),
                self.capabilities.as_ref(),
            ) {
                Ok(processed) => {
                    self.stats.gpu_chunks.fetch_add(1, Ordering::Relaxed);
                    self.stats
                        .bytes_from_gpu
                        .fetch_add(processed.len() * element_size, Ordering::Relaxed);
                    Some(processed)
                }
                Err(_) if self.config.auto_fallback => None,
                Err(err) => return Err(err),
            }
        } else {
            None
        };

        let result = match gpu_result {
            Some(processed) => processed,
            None => {
                self.stats.cpu_chunks.fetch_add(1, Ordering::Relaxed);
                operation.apply(&chunk.view())?
            }
        };
        self.insert_chunk_result(&mut output, &result.view(), &region)?;
    }
    Ok(output)
}
/// Picks a square chunk shape whose element count fits `max_gpu_chunk_bytes`.
///
/// The side length is the integer square root of the element budget, but never
/// less than 32.
///
/// # Panics
/// Panics on division by zero if `element_size` is 0.
fn calculate_chunk_size(&self, element_size: usize) -> (usize, usize) {
    const MIN_SIDE: usize = 32;
    let budget_elements = self.config.max_gpu_chunk_bytes / element_size;
    let side = (budget_elements as f64).sqrt() as usize;
    let side = std::cmp::max(side, MIN_SIDE);
    (side, side)
}
/// Copies the un-padded core of a processed chunk into its place in `output`.
///
/// `region.overlap()` gives the padding at the leading edge (`.0`) and trailing
/// edge (`.1`) of the chunk, each as a `(rows, cols)` pair; that padding is
/// trimmed and the remainder is assigned to `region.start..region.end`.
///
/// # Panics
/// Panics if the trailing padding exceeds the chunk size in debug builds, or if
/// the trimmed chunk and the destination window are out of bounds or differ in
/// shape (ndarray slicing / assignment rules).
fn insert_chunk_result<T: Float + Clone>(
    &self,
    output: &mut Array2<T>,
    chunk: &ArrayView2<T>,
    region: &ChunkRegion,
) -> NdimageResult<()> {
    let halo = region.overlap();
    let first_row = halo.0 .0;
    let first_col = halo.0 .1;
    let last_row = chunk.nrows() - halo.1 .0;
    let last_col = chunk.ncols() - halo.1 .1;
    let core_rows = first_row..last_row;
    let core_cols = first_col..last_col;
    let core = chunk.slice(ndarray::s![core_rows, core_cols]);
    let mut target = output.slice_mut(ndarray::s![
        region.start.0..region.end.0,
        region.start.1..region.end.1
    ]);
    target.assign(&core);
    Ok(())
}
/// Returns the processing counters accumulated so far.
pub fn stats(&self) -> &GpuProcessingStats {
&self.stats
}
/// Sets every processing counter back to zero (relaxed ordering).
pub fn reset_stats(&self) {
    let stats = &self.stats;
    for counter in [
        &stats.gpu_chunks,
        &stats.cpu_chunks,
        &stats.bytes_to_gpu,
        &stats.bytes_from_gpu,
        &stats.gpu_compute_ms,
    ] {
        counter.store(0, Ordering::Relaxed);
    }
}
/// Runs a chunked Gaussian filter with standard deviation `sigma` and
/// reflecting borders over `input`.
///
/// # Errors
/// Returns any error reported by `process` for the underlying operation.
pub fn gaussian_filter<T>(&self, input: &ArrayView2<T>, sigma: f64) -> NdimageResult<Array2<T>>
where
    T: Float
        + FromPrimitive
        + Debug
        + Clone
        + Send
        + Sync
        + Zero
        + 'static
        + std::ops::AddAssign
        + std::ops::DivAssign,
{
    self.process(input, &GpuGaussianFilter::new(sigma, BorderMode::Reflect))
}
/// Runs a chunked uniform (box) filter with a `size` x `size` window and
/// reflecting borders over `input`.
///
/// # Errors
/// Returns any error reported by `process` for the underlying operation.
pub fn uniform_filter<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
where
    T: Float
        + FromPrimitive
        + Debug
        + Clone
        + Send
        + Sync
        + Zero
        + 'static
        + std::ops::AddAssign
        + std::ops::DivAssign,
{
    self.process(input, &GpuUniformFilter::new(size, BorderMode::Reflect))
}
/// Runs a chunked grey-scale erosion with a `size` x `size` window and
/// constant border handling over `input`.
///
/// # Errors
/// Returns any error reported by `process` for the underlying operation.
pub fn grey_erosion<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
where
    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
{
    self.process(input, &GpuGreyErosion::new(size, MorphBorderMode::Constant))
}
/// Runs a chunked grey-scale dilation with a `size` x `size` window and
/// constant border handling over `input`.
///
/// # Errors
/// Returns any error reported by `process` for the underlying operation.
pub fn grey_dilation<T>(&self, input: &ArrayView2<T>, size: usize) -> NdimageResult<Array2<T>>
where
    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
{
    self.process(input, &GpuGreyDilation::new(size, MorphBorderMode::Constant))
}
}
/// A [`ChunkOperation`] that can optionally be dispatched to a GPU path.
///
/// Every method has a default body, so an implementor may supply an empty `impl`.
pub trait GpuChunkOperation<T>: ChunkOperation<T>
where
T: Float + FromPrimitive + Debug + Clone + Send + Sync,
{
/// Whether this operation may be sent down the GPU path; defaults to `true`.
fn supports_gpu(&self) -> bool {
true
}
/// Runs the operation on the GPU path.
///
/// The default ignores `capabilities` and delegates to the CPU `apply`.
fn apply_gpu(
&self,
chunk: &ArrayView2<T>,
capabilities: Option<&GpuCapabilities>,
) -> NdimageResult<Array2<T>> {
self.apply(chunk)
}
/// Estimated bytes needed to process `input_elements` elements of `element_size` bytes each.
///
/// The default is three times the input size.
fn gpu_memory_estimate(&self, input_elements: usize, element_size: usize) -> usize {
input_elements * element_size * 3
}
}
/// Gaussian smoothing packaged as a chunk operation.
pub struct GpuGaussianFilter {
/// Standard deviation passed to the underlying Gaussian filter.
sigma: f64,
/// Border handling passed to the underlying Gaussian filter.
border_mode: BorderMode,
/// Kernel half-width derived from `sigma`; reported as the required chunk overlap.
kernel_radius: usize,
}
impl GpuGaussianFilter {
    /// Creates a Gaussian filter operation.
    ///
    /// The kernel radius is `ceil(4 * sigma)` converted to `usize`; the
    /// conversion saturates, so non-positive or NaN values of `sigma` give a
    /// radius of 0.
    pub fn new(sigma: f64, border_mode: BorderMode) -> Self {
        const TRUNCATE: f64 = 4.0;
        let support = (sigma * TRUNCATE).ceil();
        Self {
            sigma,
            border_mode,
            kernel_radius: support as usize,
        }
    }
}
impl<T> ChunkOperation<T> for GpuGaussianFilter
where
    T: Float
        + FromPrimitive
        + Debug
        + Clone
        + Send
        + Sync
        + Zero
        + 'static
        + std::ops::AddAssign
        + std::ops::DivAssign,
{
    /// Filters one chunk on the CPU.
    ///
    /// Values are widened to `f64`, filtered with `crate::filters::gaussian_filter`,
    /// and converted back to `T`. A value that cannot be converted in either
    /// direction becomes zero.
    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
        let widened = chunk.mapv(|value| value.to_f64().unwrap_or(0.0));
        let filtered =
            crate::filters::gaussian_filter(&widened, self.sigma, Some(self.border_mode), None)?;
        let narrowed = filtered.mapv(|value| T::from_f64(value).unwrap_or_else(T::zero));
        Ok(narrowed)
    }

    /// Pixels of padding needed around a chunk: the kernel radius.
    fn required_overlap(&self) -> usize {
        self.kernel_radius
    }

    /// Identifier of this operation.
    fn name(&self) -> &str {
        "gpu_gaussian_filter"
    }
}
impl<T> GpuChunkOperation<T> for GpuGaussianFilter
where
    T: Float
        + FromPrimitive
        + Debug
        + Clone
        + Send
        + Sync
        + Zero
        + 'static
        + std::ops::AddAssign
        + std::ops::DivAssign,
{
    /// The Gaussian filter may be dispatched to the GPU path.
    fn supports_gpu(&self) -> bool {
        true
    }

    /// Runs the filter for the GPU path.
    ///
    /// No device kernel is wired in here: with or without reported
    /// capabilities the chunk is processed by the CPU `apply`, so the result
    /// is identical to the CPU path.
    fn apply_gpu(
        &self,
        chunk: &ArrayView2<T>,
        capabilities: Option<&GpuCapabilities>,
    ) -> NdimageResult<Array2<T>> {
        // Kept in the signature for the trait; nothing device-specific happens yet.
        let _ = capabilities;
        self.apply(chunk)
    }

    /// Estimated bytes for input, output and a square `(2r + 1)^2` kernel.
    ///
    /// All arithmetic saturates so an extreme kernel radius yields
    /// `usize::MAX` instead of overflowing.
    fn gpu_memory_estimate(&self, input_elements: usize, element_size: usize) -> usize {
        let kernel_side = self.kernel_radius.saturating_mul(2).saturating_add(1);
        let kernel_size = kernel_side.saturating_mul(kernel_side);
        input_elements
            .saturating_mul(2)
            .saturating_add(kernel_size)
            .saturating_mul(element_size)
    }
}
/// Uniform (box) filtering packaged as a chunk operation.
pub struct GpuUniformFilter {
/// Window side length; the same value is used for both axes.
size: usize,
/// Border handling passed to the underlying uniform filter.
border_mode: BorderMode,
}
impl GpuUniformFilter {
/// Creates a uniform filter operation with the given window `size` and `border_mode`.
pub fn new(size: usize, border_mode: BorderMode) -> Self {
Self { size, border_mode }
}
}
impl<T> ChunkOperation<T> for GpuUniformFilter
where
    T: Float
        + FromPrimitive
        + Debug
        + Clone
        + Send
        + Sync
        + Zero
        + 'static
        + std::ops::AddAssign
        + std::ops::DivAssign,
{
    /// Filters one chunk on the CPU.
    ///
    /// Values are widened to `f64`, filtered with `crate::filters::uniform_filter`
    /// using a `size` x `size` window, and converted back to `T`. A value that
    /// cannot be converted in either direction becomes zero.
    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
        let window = [self.size, self.size];
        let widened = chunk.mapv(|value| value.to_f64().unwrap_or(0.0));
        let averaged =
            crate::filters::uniform_filter(&widened, &window, Some(self.border_mode), None)?;
        let narrowed = averaged.mapv(|value| T::from_f64(value).unwrap_or_else(T::zero));
        Ok(narrowed)
    }

    /// Pixels of padding needed around a chunk: half the window plus one.
    fn required_overlap(&self) -> usize {
        let half_window = self.size / 2;
        half_window + 1
    }

    /// Identifier of this operation.
    fn name(&self) -> &str {
        "gpu_uniform_filter"
    }
}
/// GPU dispatch for the uniform filter.
///
/// Only `supports_gpu` is overridden; `apply_gpu` and `gpu_memory_estimate`
/// come from the trait's default bodies.
impl<T> GpuChunkOperation<T> for GpuUniformFilter
where
T: Float
+ FromPrimitive
+ Debug
+ Clone
+ Send
+ Sync
+ Zero
+ 'static
+ std::ops::AddAssign
+ std::ops::DivAssign,
{
/// The uniform filter may be dispatched to the GPU path.
fn supports_gpu(&self) -> bool {
true
}
}
/// Grey-scale erosion packaged as a chunk operation.
pub struct GpuGreyErosion {
/// Structuring-window side length; the same value is used for both axes.
size: usize,
/// Border handling passed to the underlying erosion routine.
border_mode: MorphBorderMode,
}
impl GpuGreyErosion {
/// Creates a grey-erosion operation with the given window `size` and `border_mode`.
pub fn new(size: usize, border_mode: MorphBorderMode) -> Self {
Self { size, border_mode }
}
}
impl<T> ChunkOperation<T> for GpuGreyErosion
where
    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
{
    /// Erodes one chunk on the CPU.
    ///
    /// Values are widened to `f64`, eroded with `crate::morphology::grey_erosion`
    /// using a `size` x `size` window, and converted back to `T`. A value that
    /// cannot be converted in either direction becomes zero.
    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
        // `mapv` already returns an owned array, so it is passed on directly
        // without a second full copy.
        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
        let result = crate::morphology::grey_erosion(
            &chunk_f64,
            Some(&[self.size, self.size]),
            None,
            Some(self.border_mode),
            None,
            None,
        )?;
        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
    }

    /// Pixels of padding needed around a chunk: half the window plus one.
    fn required_overlap(&self) -> usize {
        self.size / 2 + 1
    }

    /// Identifier of this operation.
    fn name(&self) -> &str {
        "gpu_grey_erosion"
    }
}
/// GPU dispatch for grey erosion.
///
/// Only `supports_gpu` is overridden; `apply_gpu` and `gpu_memory_estimate`
/// come from the trait's default bodies.
impl<T> GpuChunkOperation<T> for GpuGreyErosion
where
T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
{
/// Grey erosion may be dispatched to the GPU path.
fn supports_gpu(&self) -> bool {
true
}
}
/// Grey-scale dilation packaged as a chunk operation.
pub struct GpuGreyDilation {
/// Structuring-window side length; the same value is used for both axes.
size: usize,
/// Border handling passed to the underlying dilation routine.
border_mode: MorphBorderMode,
}
impl GpuGreyDilation {
/// Creates a grey-dilation operation with the given window `size` and `border_mode`.
pub fn new(size: usize, border_mode: MorphBorderMode) -> Self {
Self { size, border_mode }
}
}
impl<T> ChunkOperation<T> for GpuGreyDilation
where
    T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
{
    /// Dilates one chunk on the CPU.
    ///
    /// Values are widened to `f64`, dilated with `crate::morphology::grey_dilation`
    /// using a `size` x `size` window, and converted back to `T`. A value that
    /// cannot be converted in either direction becomes zero.
    fn apply(&self, chunk: &ArrayView2<T>) -> NdimageResult<Array2<T>> {
        // `mapv` already returns an owned array, so it is passed on directly
        // without a second full copy.
        let chunk_f64 = chunk.mapv(|x| x.to_f64().unwrap_or(0.0));
        let result = crate::morphology::grey_dilation(
            &chunk_f64,
            Some(&[self.size, self.size]),
            None,
            Some(self.border_mode),
            None,
            None,
        )?;
        Ok(result.mapv(|x| T::from_f64(x).unwrap_or_else(T::zero)))
    }

    /// Pixels of padding needed around a chunk: half the window plus one.
    fn required_overlap(&self) -> usize {
        self.size / 2 + 1
    }

    /// Identifier of this operation.
    fn name(&self) -> &str {
        "gpu_grey_dilation"
    }
}
/// GPU dispatch for grey dilation.
///
/// Only `supports_gpu` is overridden; `apply_gpu` and `gpu_memory_estimate`
/// come from the trait's default bodies.
impl<T> GpuChunkOperation<T> for GpuGreyDilation
where
T: Float + FromPrimitive + Debug + Clone + Send + Sync + Zero + 'static + PartialOrd,
{
/// Grey dilation may be dispatched to the GPU path.
fn supports_gpu(&self) -> bool {
true
}
}
/// Bookkeeping for a fixed-capacity pool of reusable blocks (compiled only with the `gpu` feature).
#[cfg(feature = "gpu")]
pub struct GpuMemoryPool {
/// Every block ever handed out, whether currently in use or free for reuse.
allocations: Vec<GpuAllocation>,
/// Sum of the sizes of all blocks in `allocations`.
total_allocated: usize,
/// Upper bound on `total_allocated`.
max_memory: usize,
}
/// One block tracked by `GpuMemoryPool`.
#[cfg(feature = "gpu")]
struct GpuAllocation {
/// Handle returned to the caller; the pool assigns it from its running byte total.
ptr: usize,
/// Size of the block in bytes.
size: usize,
/// `true` while the block is handed out, `false` once freed and reusable.
in_use: bool,
}
#[cfg(feature = "gpu")]
impl GpuMemoryPool {
    /// Creates an empty pool that will hand out at most `max_memory` bytes.
    pub fn new(max_memory: usize) -> Self {
        Self {
            allocations: Vec::new(),
            total_allocated: 0,
            max_memory,
        }
    }

    /// Reserves a block of at least `size` bytes and returns its handle.
    ///
    /// A previously freed block that is large enough is reused first (first
    /// fit). Otherwise a new block is appended; its handle is the pool's
    /// running byte total before the block was added.
    ///
    /// # Errors
    /// Returns `NdimageError::MemoryError` when no free block fits and the
    /// request exceeds the remaining capacity. Requests close to `usize::MAX`
    /// are rejected the same way instead of overflowing.
    pub fn allocate(&mut self, size: usize) -> NdimageResult<usize> {
        if let Some(slot) = self
            .allocations
            .iter_mut()
            .find(|slot| !slot.in_use && slot.size >= size)
        {
            slot.in_use = true;
            return Ok(slot.ptr);
        }
        // Compare against the remaining capacity rather than adding to the
        // running total, so a huge `size` cannot overflow.
        if size > self.available() {
            return Err(NdimageError::MemoryError(
                "GPU memory pool exhausted".into(),
            ));
        }
        let ptr = self.total_allocated;
        self.allocations.push(GpuAllocation {
            ptr,
            size,
            in_use: true,
        });
        self.total_allocated += size;
        Ok(ptr)
    }

    /// Marks the block with handle `ptr` as free for reuse.
    ///
    /// Unknown or already-free handles are ignored. Only blocks that are
    /// currently in use are considered, so when a zero-sized block shares its
    /// handle with the next block, repeated calls release each of them in turn.
    pub fn free(&mut self, ptr: usize) {
        if let Some(slot) = self
            .allocations
            .iter_mut()
            .find(|slot| slot.in_use && slot.ptr == ptr)
        {
            slot.in_use = false;
        }
    }

    /// Bytes that can still be claimed by new blocks.
    ///
    /// Freed blocks are not counted here; they remain reserved for reuse.
    pub fn available(&self) -> usize {
        self.max_memory.saturating_sub(self.total_allocated)
    }
}
// Unit tests for the chunked processor. Most tests only run their assertions when a
// processor can be constructed, so they pass silently if construction fails.
#[cfg(test)]
mod tests {
use super::*;
use scirs2_core::ndarray::Array2;
// The default configuration has a non-zero chunk budget and fallback enabled.
#[test]
fn test_gpu_config_default() {
let config = GpuChunkingConfig::default();
assert!(config.max_gpu_chunk_bytes > 0);
assert!(config.auto_fallback);
}
// Constructing a processor with defaults must succeed.
#[test]
fn test_gpu_processor_creation() {
let result = GpuChunkedProcessor::with_defaults();
assert!(result.is_ok());
}
// The Gaussian filter succeeds and preserves the input shape.
#[test]
fn test_gpu_gaussian_filter() {
if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
let input = Array2::<f64>::ones((50, 50));
let result = processor.gaussian_filter(&input.view(), 1.0);
assert!(result.is_ok());
let output = result.expect("Should succeed");
assert_eq!(output.shape(), input.shape());
}
}
// The uniform filter succeeds and preserves the input shape.
#[test]
fn test_gpu_uniform_filter() {
if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
let input = Array2::<f64>::ones((50, 50));
let result = processor.uniform_filter(&input.view(), 3);
assert!(result.is_ok());
let output = result.expect("Should succeed");
assert_eq!(output.shape(), input.shape());
}
}
// Grey erosion and dilation both succeed on a constant image.
#[test]
fn test_gpu_morphology() {
if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
let input = Array2::<f64>::ones((50, 50));
let eroded = processor.grey_erosion(&input.view(), 3);
assert!(eroded.is_ok());
let dilated = processor.grey_dilation(&input.view(), 3);
assert!(dilated.is_ok());
}
}
// After processing, at least one chunk is counted on either the GPU or the CPU path.
#[test]
fn test_gpu_stats() {
if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
processor.reset_stats();
let input = Array2::<f64>::ones((100, 100));
let _ = processor.gaussian_filter(&input.view(), 1.0);
let stats = processor.stats();
let total =
stats.gpu_chunks.load(Ordering::Relaxed) + stats.cpu_chunks.load(Ordering::Relaxed);
assert!(total > 0);
}
}
// When a GPU is detected, its reported memory and compute units are non-zero.
#[test]
fn test_gpu_capabilities() {
if let Ok(processor) = GpuChunkedProcessor::with_defaults() {
if let Some(caps) = processor.capabilities() {
assert!(caps.memory_bytes > 0);
assert!(caps.compute_units > 0);
}
}
}
// The Gaussian operation reports GPU support, a positive overlap and its name.
#[test]
fn test_gpu_operation_trait() {
let op: GpuGaussianFilter = GpuGaussianFilter::new(1.0, BorderMode::Reflect);
assert!(GpuChunkOperation::<f64>::supports_gpu(&op));
assert!(ChunkOperation::<f64>::required_overlap(&op) > 0);
assert_eq!(ChunkOperation::<f64>::name(&op), "gpu_gaussian_filter");
}
// Two allocations succeed, and freeing one does not return capacity to `available()`.
#[cfg(feature = "gpu")]
#[test]
fn test_gpu_memory_pool() {
let mut pool = GpuMemoryPool::new(1024 * 1024);
let alloc1 = pool.allocate(1024);
assert!(alloc1.is_ok());
let alloc2 = pool.allocate(2048);
assert!(alloc2.is_ok());
pool.free(alloc1.expect("Should have ptr"));
assert!(pool.available() < 1024 * 1024);
}
}