use scirs2_core::ndarray::{Array, ArrayView, ArrayView2, Dimension};
use scirs2_core::numeric::{Float, FromPrimitive};
use std::fmt::Debug;
use crate::error::{NdimageError, NdimageResult};
use crate::utils::safe_f64_to_float;
#[allow(dead_code)]
fn safe_usize_to_float<T: Float + FromPrimitive>(value: usize) -> NdimageResult<T> {
T::from_usize(value).ok_or_else(|| {
NdimageError::ComputationError(format!("Failed to convert usize {} to float type", value))
})
}
#[allow(dead_code)]
fn safe_as_slice<'a, T, D: Dimension>(array: &'a ArrayView<T, D>) -> NdimageResult<&'a [T]> {
array.as_slice().ok_or_else(|| {
NdimageError::ComputationError("Failed to convert _array to contiguous slice".to_string())
})
}
const GAUSSIAN_BLUR_KERNEL: &str = include_str!("kernels/gaussian_blur.kernel");
const CONVOLUTION_KERNEL: &str = include_str!("kernels/convolution.kernel");
const MEDIAN_FILTER_KERNEL: &str = include_str!("kernels/median_filter.kernel");
const MORPHOLOGY_KERNEL: &str = include_str!("kernels/morphology.kernel");
const ADVANCED_MORPHOLOGY_KERNEL: &str = include_str!("kernels/advanced_morphology.kernel");
const ADVANCED_EDGE_DETECTION_KERNEL: &str = include_str!("kernels/advanced_edge_detection.kernel");
const TEXTURE_ANALYSIS_KERNEL: &str = include_str!("kernels/texture_analysis.kernel");
const ADVANCED_SEGMENTATION_KERNEL: &str = include_str!("kernels/advanced_segmentation.kernel");
const SEPARABLE_GAUSSIAN_KERNEL: &str = r#"
__kernel void separable_gaussian_1d(
__global const float* input__global float* output__global const float* weights,
const int size,
const int radius,
const int direction, // 0 for horizontal, 1 for vertical
const int width,
const int height
) {
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= width || y >= height) return;
float sum = 0.0f;
if (direction == 0) {
// Horizontal pass
for (int i = -radius; i <= radius; i++) {
int px = clamp(x + i, 0, width - 1);
sum += input[y * width + px] * weights[i + radius];
}
} else {
// Vertical pass
for (int i = -radius; i <= radius; i++) {
int py = clamp(y + i, 0, height - 1);
sum += input[py * width + x] * weights[i + radius];
}
}
output[y * width + x] = sum;
}
"#;
const BILATERAL_FILTER_KERNEL: &str = r#"
__kernel void bilateral_filter_2d(
__global const float* input__global float* output,
const float sigma_spatial,
const float sigma_intensity,
const int radius,
const int width,
const int height
) {
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= width || y >= height) return;
float center_val = input[y * width + x];
float sum = 0.0f;
float weight_sum = 0.0f;
float spatial_coeff = -0.5f / (sigma_spatial * sigma_spatial);
float intensity_coeff = -0.5f / (sigma_intensity * sigma_intensity);
for (int dy = -radius; dy <= radius; dy++) {
for (int dx = -radius; dx <= radius; dx++) {
int px = clamp(x + dx, 0, width - 1);
int py = clamp(y + dy, 0, height - 1);
float pixel_val = input[py * width + px];
// Spatial weight
float spatial_dist = dx * dx + dy * dy;
float spatial_weight = exp(spatial_dist * spatial_coeff);
// Intensity weight
float intensity_dist = (pixel_val - center_val) * (pixel_val - center_val);
float intensity_weight = exp(intensity_dist * intensity_coeff);
float total_weight = spatial_weight * intensity_weight;
sum += pixel_val * total_weight;
weight_sum += total_weight;
}
}
output[y * width + x] = sum / weight_sum;
}
"#;
const SOBEL_FILTER_KERNEL: &str = r#"
__kernel void sobel_filter_2d(
__global const float* input__global float* output_x__global float* output_y__global float* magnitude,
const int width,
const int height
) {
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= width || y >= height) return;
// Sobel X kernel: [-1, 0, 1; -2, 0, 2; -1, 0, 1]
float gx = 0.0f;
gx += -1.0f * input[clamp(y-1, 0, height-1) * width + clamp(x-1, 0, width-1)];
gx += -2.0f * input[y * width + clamp(x-1, 0, width-1)];
gx += -1.0f * input[clamp(y+1, 0, height-1) * width + clamp(x-1, 0, width-1)];
gx += 1.0f * input[clamp(y-1, 0, height-1) * width + clamp(x+1, 0, width-1)];
gx += 2.0f * input[y * width + clamp(x+1, 0, width-1)];
gx += 1.0f * input[clamp(y+1, 0, height-1) * width + clamp(x+1, 0, width-1)];
// Sobel Y kernel: [-1, -2, -1; 0, 0, 0; 1, 2, 1]
float gy = 0.0f;
gy += -1.0f * input[clamp(y-1, 0, height-1) * width + clamp(x-1, 0, width-1)];
gy += -2.0f * input[clamp(y-1, 0, height-1) * width + x];
gy += -1.0f * input[clamp(y-1, 0, height-1) * width + clamp(x+1, 0, width-1)];
gy += 1.0f * input[clamp(y+1, 0, height-1) * width + clamp(x-1, 0, width-1)];
gy += 2.0f * input[clamp(y+1, 0, height-1) * width + x];
gy += 1.0f * input[clamp(y+1, 0, height-1) * width + clamp(x+1, 0, width-1)];
output_x[y * width + x] = gx;
output_y[y * width + x] = gy;
magnitude[y * width + x] = sqrt(gx * gx + gy * gy);
}
"#;
const HISTOGRAM_KERNEL: &str = r#"
__kernel void compute_histogram(
__global const float* input__global int* histogram,
const float min_val,
const float max_val,
const int num_bins,
const int total_pixels
) {
int idx = get_global_id(0);
if (idx >= total_pixels) return;
float val = input[idx];
if (val >= min_val && val <= max_val) {
float normalized = (val - min_val) / (max_val - min_val);
int bin = (int)(normalized * (num_bins - 1));
bin = clamp(bin, 0, num_bins - 1);
atomic_inc(&histogram[bin]);
}
}
"#;
const MORPHOLOGY_EROSION_KERNEL: &str = r#"
__kernel void morphology_erosion_2d(
__global const float* input__global float* output__global const int* structure,
const int struct_width,
const int struct_height,
const int width,
const int height
) {
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= width || y >= height) return;
float min_val = INFINITY;
int struct_center_x = struct_width / 2;
int struct_center_y = struct_height / 2;
for (int sy = 0; sy < struct_height; sy++) {
for (int sx = 0; sx < struct_width; sx++) {
if (structure[sy * struct_width + sx]) {
int px = x + sx - struct_center_x;
int py = y + sy - struct_center_y;
px = clamp(px, 0, width - 1);
py = clamp(py, 0, height - 1);
float val = input[py * width + px];
min_val = fmin(min_val, val);
}
}
}
output[y * width + x] = min_val;
}
"#;
const WAVELET_TRANSFORM_KERNEL: &str = r#"
__kernel void wavelet_transform_2d(
__global const float* input__global float* ll_output__global float* lh_output__global float* hl_output__global float* hh_output__global const float* low_filter__global const float* high_filter,
const int filter_length,
const int width,
const int height
) {
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= width/2 || y >= height/2) return;
int filter_radius = filter_length / 2;
// Apply row filters first
float ll_row = 0.0f, lh_row = 0.0f;
for (int i = 0; i < filter_length; i++) {
int px = clamp(x * 2 + i - filter_radius, 0, width - 1);
ll_row += input[y * 2 * width + px] * low_filter[i];
lh_row += input[y * 2 * width + px] * high_filter[i];
}
// Apply column filters to row results
float ll = 0.0f, lh = 0.0f, hl = 0.0f, hh = 0.0f;
for (int i = 0; i < filter_length; i++) {
int py = clamp(y * 2 + i - filter_radius, 0, height - 1);
// This is simplified - full implementation would need temporary storage
ll += ll_row * low_filter[i];
lh += lh_row * low_filter[i];
hl += ll_row * high_filter[i];
hh += lh_row * high_filter[i];
}
ll_output[y * (width/2) + x] = ll;
lh_output[y * (width/2) + x] = lh;
hl_output[y * (width/2) + x] = hl;
hh_output[y * (width/2) + x] = hh;
}
"#;
const TEMPLATE_MATCHING_KERNEL: &str = r#"
__kernel void template_matching_ncc(
__global const float* image__global const float* template__global float* output,
const int image_width,
const int image_height,
const int template_width,
const int template_height,
const float template_mean,
const float template_norm
) {
int x = get_global_id(0);
int y = get_global_id(1);
int output_width = image_width - template_width + 1;
int output_height = image_height - template_height + 1;
if (x >= output_width || y >= output_height) return;
// Compute image patch mean
float image_sum = 0.0f;
for (int ty = 0; ty < template_height; ty++) {
for (int tx = 0; tx < template_width; tx++) {
image_sum += image[(y + ty) * image_width + (x + tx)];
}
}
float image_mean = image_sum / (template_width * template_height);
// Compute normalized cross-correlation
float correlation = 0.0f;
float image_variance = 0.0f;
for (int ty = 0; ty < template_height; ty++) {
for (int tx = 0; tx < template_width; tx++) {
float image_val = image[(y + ty) * image_width + (x + tx)];
float template_val = template[ty * template_width + tx];
float image_centered = image_val - image_mean;
float template_centered = template_val - template_mean;
correlation += image_centered * template_centered;
image_variance += image_centered * image_centered;
}
}
float image_norm = sqrt(image_variance);
float ncc = (image_norm > 0.0f) ? correlation / (image_norm * template_norm) : 0.0f;
output[y * output_width + x] = ncc;
}
"#;
const DISTANCE_TRANSFORM_KERNEL: &str = r#"
__kernel void distance_transform_edt(
__global const int* binary_image__global float* distance,
const int width,
const int height
) {
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= width || y >= height) return;
int idx = y * width + x;
if (binary_image[idx] == 0) {
distance[idx] = 0.0f;
return;
}
float min_dist = INFINITY;
// Brute force approach - can be optimized with separable algorithm
for (int py = 0; py < height; py++) {
for (int px = 0; px < width; px++) {
if (binary_image[py * width + px] == 0) {
float dx = px - x;
float dy = py - y;
float dist = sqrt(dx * dx + dy * dy);
min_dist = fmin(min_dist, dist);
}
}
}
distance[idx] = min_dist;
}
"#;
const GABOR_FILTER_KERNEL: &str = r#"
__kernel void gabor_filter_2d(
__global const float* input__global float* output,
const float sigma_x,
const float sigma_y,
const float theta,
const float lambda,
const float gamma,
const float psi,
const int kernel_size,
const int width,
const int height
) {
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= width || y >= height) return;
float sum = 0.0f;
int half_size = kernel_size / 2;
float cos_theta = cos(theta);
float sin_theta = sin(theta);
for (int ky = -half_size; ky <= half_size; ky++) {
for (int kx = -half_size; kx <= half_size; kx++) {
// Rotate coordinates
float x_rot = kx * cos_theta + ky * sin_theta;
float y_rot = -kx * sin_theta + ky * cos_theta;
// Gabor function
float envelope = exp(-0.5f * (x_rot * x_rot / (sigma_x * sigma_x) +
gamma * gamma * y_rot * y_rot / (sigma_y * sigma_y)));
float wave = cos(2.0f * M_PI * x_rot / lambda + psi);
float gabor_val = envelope * wave;
// Apply to image
int px = clamp(x + kx, 0, width - 1);
int py = clamp(y + ky, 0, height - 1);
sum += input[py * width + px] * gabor_val;
}
}
output[y * width + x] = sum;
}
"#;
const LAPLACIAN_KERNEL: &str = r#"
__kernel void laplacian_filter_2d(
__global const float* input__global float* output,
const int width,
const int height,
const int connectivity // 4 or 8
) {
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= width || y >= height) return;
float center = input[y * width + x];
float sum = 0.0f;
if (connectivity == 4) {
// 4-connected Laplacian: [0, -1, 0; -1, 4, -1; 0, -1, 0]
sum += 4.0f * center;
sum += -1.0f * input[clamp(y-1, 0, height-1) * width + x]; // top
sum += -1.0f * input[clamp(y+1, 0, height-1) * width + x]; // bottom
sum += -1.0f * input[y * width + clamp(x-1, 0, width-1)]; // left
sum += -1.0f * input[y * width + clamp(x+1, 0, width-1)]; // right
} else {
// 8-connected Laplacian: [-1, -1, -1; -1, 8, -1; -1, -1, -1]
sum += 8.0f * center;
for (int dy = -1; dy <= 1; dy++) {
for (int dx = -1; dx <= 1; dx++) {
if (dx == 0 && dy == 0) continue;
int px = clamp(x + dx, 0, width - 1);
int py = clamp(y + dy, 0, height - 1);
sum += -1.0f * input[py * width + px];
}
}
}
output[y * width + x] = sum;
}
"#;
pub struct KernelRegistry {
kernels: std::collections::HashMap<String, KernelInfo>,
}
pub struct KernelInfo {
pub name: String,
pub source: String,
pub entry_point: String,
pub work_dimensions: usize,
}
impl KernelRegistry {
pub fn new() -> Self {
let mut registry = Self {
kernels: std::collections::HashMap::new(),
};
registry.register_builtin_kernels();
registry
}
fn register_builtin_kernels(&mut self) {
self.register_kernel(
"gaussian_blur_2d",
GAUSSIAN_BLUR_KERNEL,
"gaussian_blur_2d",
2,
);
self.register_kernel("convolution_2d", CONVOLUTION_KERNEL, "convolution_2d", 2);
self.register_kernel(
"median_filter_2d",
MEDIAN_FILTER_KERNEL,
"median_filter_2d",
2,
);
self.register_kernel("morphology_erosion", MORPHOLOGY_KERNEL, "erosion_2d", 2);
self.register_kernel("morphology_dilation", MORPHOLOGY_KERNEL, "dilation_2d", 2);
self.register_kernel(
"separable_gaussian_1d",
SEPARABLE_GAUSSIAN_KERNEL,
"separable_gaussian_1d",
2,
);
self.register_kernel(
"bilateral_filter_2d",
BILATERAL_FILTER_KERNEL,
"bilateral_filter_2d",
2,
);
self.register_kernel("sobel_filter_2d", SOBEL_FILTER_KERNEL, "sobel_filter_2d", 2);
self.register_kernel(
"laplacian_filter_2d",
LAPLACIAN_KERNEL,
"laplacian_filter_2d",
2,
);
self.register_kernel(
"hit_or_miss_2d",
ADVANCED_MORPHOLOGY_KERNEL,
"hit_or_miss_2d",
2,
);
self.register_kernel(
"morphological_gradient_2d",
ADVANCED_MORPHOLOGY_KERNEL,
"morphological_gradient_2d",
2,
);
self.register_kernel(
"tophat_transform_2d",
ADVANCED_MORPHOLOGY_KERNEL,
"tophat_transform_2d",
2,
);
self.register_kernel("skeleton_2d", ADVANCED_MORPHOLOGY_KERNEL, "skeleton_2d", 2);
self.register_kernel(
"distance_transform_chamfer_2d",
ADVANCED_MORPHOLOGY_KERNEL,
"distance_transform_chamfer_2d",
2,
);
self.register_kernel(
"canny_gradient_2d",
ADVANCED_EDGE_DETECTION_KERNEL,
"canny_gradient_2d",
2,
);
self.register_kernel(
"canny_non_maximum_suppression_2d",
ADVANCED_EDGE_DETECTION_KERNEL,
"canny_non_maximum_suppression_2d",
2,
);
self.register_kernel(
"canny_double_threshold_2d",
ADVANCED_EDGE_DETECTION_KERNEL,
"canny_double_threshold_2d",
2,
);
self.register_kernel(
"canny_edge_tracking_2d",
ADVANCED_EDGE_DETECTION_KERNEL,
"canny_edge_tracking_2d",
2,
);
self.register_kernel(
"laplacian_of_gaussian_2d",
ADVANCED_EDGE_DETECTION_KERNEL,
"laplacian_of_gaussian_2d",
2,
);
self.register_kernel(
"zero_crossing_2d",
ADVANCED_EDGE_DETECTION_KERNEL,
"zero_crossing_2d",
2,
);
self.register_kernel(
"harris_corner_response_2d",
ADVANCED_EDGE_DETECTION_KERNEL,
"harris_corner_response_2d",
2,
);
self.register_kernel(
"oriented_fast_keypoints_2d",
ADVANCED_EDGE_DETECTION_KERNEL,
"oriented_fast_keypoints_2d",
2,
);
self.register_kernel(
"local_binary_pattern_2d",
TEXTURE_ANALYSIS_KERNEL,
"local_binary_pattern_2d",
2,
);
self.register_kernel(
"uniform_local_binary_pattern_2d",
TEXTURE_ANALYSIS_KERNEL,
"uniform_local_binary_pattern_2d",
2,
);
self.register_kernel(
"glcm_cooccurrence_2d",
TEXTURE_ANALYSIS_KERNEL,
"glcm_cooccurrence_2d",
2,
);
self.register_kernel(
"glcmfeatures_2d",
TEXTURE_ANALYSIS_KERNEL,
"glcmfeatures_2d",
1,
);
self.register_kernel(
"gabor_filter_2d",
TEXTURE_ANALYSIS_KERNEL,
"gabor_filter_2d",
2,
);
self.register_kernel(
"lawstexture_energy_2d",
TEXTURE_ANALYSIS_KERNEL,
"lawstexture_energy_2d",
2,
);
self.register_kernel(
"fractal_dimension_2d",
TEXTURE_ANALYSIS_KERNEL,
"fractal_dimension_2d",
2,
);
self.register_kernel(
"watershed_labels_init_2d",
ADVANCED_SEGMENTATION_KERNEL,
"watershed_labels_init_2d",
2,
);
self.register_kernel(
"watershed_propagation_2d",
ADVANCED_SEGMENTATION_KERNEL,
"watershed_propagation_2d",
2,
);
self.register_kernel(
"region_growing_2d",
ADVANCED_SEGMENTATION_KERNEL,
"region_growing_2d",
2,
);
self.register_kernel(
"mean_shift_2d",
ADVANCED_SEGMENTATION_KERNEL,
"mean_shift_2d",
2,
);
self.register_kernel(
"level_set_evolution_2d",
ADVANCED_SEGMENTATION_KERNEL,
"level_set_evolution_2d",
2,
);
self.register_kernel(
"chan_vese_energy_2d",
ADVANCED_SEGMENTATION_KERNEL,
"chan_vese_energy_2d",
2,
);
self.register_kernel(
"active_contour_evolution_2d",
ADVANCED_SEGMENTATION_KERNEL,
"active_contour_evolution_2d",
1,
);
self.register_kernel(
"superpixel_slic_2d",
ADVANCED_SEGMENTATION_KERNEL,
"superpixel_slic_2d",
2,
);
}
pub fn register_kernel(&mut self, name: &str, source: &str, entrypoint: &str, dims: usize) {
self.kernels.insert(
name.to_string(),
KernelInfo {
name: name.to_string(),
source: source.to_string(),
entry_point: entrypoint.to_string(),
work_dimensions: dims,
},
);
}
pub fn get_kernel(&self, name: &str) -> Option<&KernelInfo> {
self.kernels.get(name)
}
}
pub trait GpuBuffer<T>: Send + Sync {
fn size(&self) -> usize;
fn copy_from_host(&mut self, data: &[T]) -> NdimageResult<()>;
fn copy_to_host(&self, data: &mut [T]) -> NdimageResult<()>;
fn as_any(&self) -> &dyn std::any::Any;
fn as_any_mut(&mut self) -> &mut dyn std::any::Any;
}
pub trait GpuKernelExecutor<T>: Send + Sync {
fn execute_kernel(
&self,
kernel: &KernelInfo,
inputs: &[&dyn GpuBuffer<T>],
outputs: &[&mut dyn GpuBuffer<T>],
work_size: &[usize],
params: &[T],
) -> NdimageResult<()>;
}
pub struct CpuFallbackBuffer<T> {
data: Vec<T>,
}
impl<T> CpuFallbackBuffer<T>
where
T: Clone + Default,
{
pub fn from_slice(data: &[T]) -> NdimageResult<Self> {
Ok(Self {
data: data.to_vec(),
})
}
pub fn empty(size: usize) -> NdimageResult<Self> {
Ok(Self {
data: vec![T::default(); size],
})
}
}
impl<T> GpuBuffer<T> for CpuFallbackBuffer<T>
where
T: Clone + Default + Send + Sync + Copy + 'static,
{
fn size(&self) -> usize {
self.data.len()
}
fn copy_from_host(&mut self, data: &[T]) -> NdimageResult<()> {
if data.len() != self.data.len() {
return Err(NdimageError::InvalidInput(format!(
"Data size mismatch: expected {}, got {}",
self.data.len(),
data.len()
)));
}
self.data.copy_from_slice(data);
Ok(())
}
fn copy_to_host(&self, data: &mut [T]) -> NdimageResult<()> {
if data.len() != self.data.len() {
return Err(NdimageError::InvalidInput(format!(
"Data size mismatch: expected {}, got {}",
self.data.len(),
data.len()
)));
}
data.copy_from_slice(&self.data);
Ok(())
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
self
}
}
pub struct CpuFallbackExecutor;
impl CpuFallbackExecutor {
pub fn new() -> Self {
Self
}
}
impl<T> GpuKernelExecutor<T> for CpuFallbackExecutor
where
T: Float + FromPrimitive + Clone + Send + Sync + 'static,
{
fn execute_kernel(
&self,
kernel: &KernelInfo,
inputs: &[&dyn GpuBuffer<T>],
outputs: &[&mut dyn GpuBuffer<T>],
work_size: &[usize],
_params: &[T],
) -> NdimageResult<()> {
Err(NdimageError::NotImplementedError(format!(
"CPU fallback execution for GPU kernel '{}' is not fully implemented. Work _size: {:?}, {} inputs, {} outputs",
kernel.name,
work_size,
inputs.len(),
outputs.len()
)))
}
}
#[allow(dead_code)]
pub fn gpu_gaussian_filter_2d<T>(
input: &ArrayView2<T>,
sigma: [T; 2],
executor: &dyn GpuKernelExecutor<T>,
) -> NdimageResult<Array<T, scirs2_core::ndarray::Ix2>>
where
T: Float + FromPrimitive + Debug + Clone + Default + Send + Sync + 'static,
{
let (h, w) = input.dim();
let input_buffer = allocate_gpu_buffer(safe_as_slice(input)?)?;
let mut output_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let params = vec![
sigma[0],
sigma[1],
safe_usize_to_float::<T>(h)?,
safe_usize_to_float::<T>(w)?,
];
let registry = KernelRegistry::new();
let kernel = registry
.get_kernel("gaussian_blur_2d")
.ok_or_else(|| NdimageError::ComputationError("Kernel not found".into()))?;
executor.execute_kernel(
kernel,
&[input_buffer.as_ref()],
&[output_buffer.as_mut()],
&[h, w],
¶ms,
)?;
let mut output_data = vec![T::zero(); h * w];
output_buffer.copy_to_host(&mut output_data)?;
Ok(Array::from_shape_vec((h, w), output_data)?)
}
#[allow(dead_code)]
pub fn gpu_convolve_2d<T>(
input: &ArrayView2<T>,
kernel: &ArrayView2<T>,
executor: &dyn GpuKernelExecutor<T>,
) -> NdimageResult<Array<T, scirs2_core::ndarray::Ix2>>
where
T: Float + FromPrimitive + Debug + Clone + Default + Send + Sync + 'static,
{
let (ih, iw) = input.dim();
let (kh, kw) = kernel.dim();
let input_buffer = allocate_gpu_buffer(safe_as_slice(input)?)?;
let kernel_buffer = allocate_gpu_buffer(safe_as_slice(kernel)?)?;
let mut output_buffer = allocate_gpu_buffer_empty::<T>(ih * iw)?;
let params = vec![
safe_usize_to_float::<T>(ih)?,
safe_usize_to_float::<T>(iw)?,
safe_usize_to_float::<T>(kh)?,
safe_usize_to_float::<T>(kw)?,
];
let registry = KernelRegistry::new();
let gpu_kernel = registry
.get_kernel("convolution_2d")
.ok_or_else(|| NdimageError::ComputationError("Kernel not found".into()))?;
executor.execute_kernel(
gpu_kernel,
&[input_buffer.as_ref(), kernel_buffer.as_ref()],
&[output_buffer.as_mut()],
&[ih, iw],
¶ms,
)?;
let mut output_data = vec![T::zero(); ih * iw];
output_buffer.copy_to_host(&mut output_data)?;
Ok(Array::from_shape_vec((ih, iw), output_data)?)
}
#[allow(dead_code)]
pub fn gpu_median_filter_2d<T>(
input: &ArrayView2<T>,
size: [usize; 2],
executor: &dyn GpuKernelExecutor<T>,
) -> NdimageResult<Array<T, scirs2_core::ndarray::Ix2>>
where
T: Float + FromPrimitive + Debug + Clone + Default + Send + Sync + 'static,
{
let (h, w) = input.dim();
let input_buffer = allocate_gpu_buffer(safe_as_slice(input)?)?;
let mut output_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let params = vec![
safe_usize_to_float::<T>(h)?,
safe_usize_to_float::<T>(w)?,
safe_usize_to_float::<T>(size[0])?,
safe_usize_to_float::<T>(size[1])?,
];
let registry = KernelRegistry::new();
let kernel = registry
.get_kernel("median_filter_2d")
.ok_or_else(|| NdimageError::ComputationError("Kernel not found".into()))?;
executor.execute_kernel(
kernel,
&[input_buffer.as_ref()],
&[output_buffer.as_mut()],
&[h, w],
¶ms,
)?;
let mut output_data = vec![T::zero(); h * w];
output_buffer.copy_to_host(&mut output_data)?;
Ok(Array::from_shape_vec((h, w), output_data)?)
}
#[allow(dead_code)]
pub fn gpu_erosion_2d<T>(
input: &ArrayView2<T>,
structure: &ArrayView2<bool>,
executor: &dyn GpuKernelExecutor<T>,
) -> NdimageResult<Array<T, scirs2_core::ndarray::Ix2>>
where
T: Float + FromPrimitive + Debug + Clone + Default + Send + Sync + 'static,
{
let (h, w) = input.dim();
let (sh, sw) = structure.dim();
let structure_t: Vec<T> = structure
.iter()
.map(|&b| if b { T::one() } else { T::zero() })
.collect();
let input_buffer = allocate_gpu_buffer(safe_as_slice(input)?)?;
let structure_buffer = allocate_gpu_buffer(&structure_t)?;
let mut output_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let params = vec![
safe_usize_to_float::<T>(h)?,
safe_usize_to_float::<T>(w)?,
safe_usize_to_float::<T>(sh)?,
safe_usize_to_float::<T>(sw)?,
];
let registry = KernelRegistry::new();
let kernel = registry
.get_kernel("morphology_erosion")
.ok_or_else(|| NdimageError::ComputationError("Kernel not found".into()))?;
executor.execute_kernel(
kernel,
&[input_buffer.as_ref(), structure_buffer.as_ref()],
&[output_buffer.as_mut()],
&[h, w],
¶ms,
)?;
let mut output_data = vec![T::zero(); h * w];
output_buffer.copy_to_host(&mut output_data)?;
Ok(Array::from_shape_vec((h, w), output_data)?)
}
#[allow(dead_code)]
pub fn gpu_separable_gaussian_filter_2d<T>(
input: &ArrayView2<T>,
sigma: [T; 2],
executor: &dyn GpuKernelExecutor<T>,
) -> NdimageResult<Array<T, scirs2_core::ndarray::Ix2>>
where
T: Float + FromPrimitive + Debug + Clone + Default + Send + Sync + 'static,
{
let (h, w) = input.dim();
let radius_x = (safe_f64_to_float::<T>(3.0)? * sigma[0])
.to_usize()
.ok_or_else(|| {
NdimageError::ComputationError("Failed to convert radius_x to usize".to_string())
})?;
let radius_y = (safe_f64_to_float::<T>(3.0)? * sigma[1])
.to_usize()
.ok_or_else(|| {
NdimageError::ComputationError("Failed to convert radius_y to usize".to_string())
})?;
let max_radius = radius_x.max(radius_y);
let weights_size = 2 * max_radius + 1;
let weights_x: Result<Vec<T>, NdimageError> = (0..weights_size)
.map(|i| -> NdimageResult<T> {
let offset = safe_usize_to_float::<T>(i)? - safe_usize_to_float::<T>(max_radius)?;
let exp_arg = -safe_f64_to_float::<T>(0.5)? * offset * offset / (sigma[0] * sigma[0]);
Ok(exp_arg.exp())
})
.collect();
let weights_x = weights_x?;
let weights_y: Result<Vec<T>, NdimageError> = (0..weights_size)
.map(|i| -> NdimageResult<T> {
let offset = safe_usize_to_float::<T>(i)? - safe_usize_to_float::<T>(max_radius)?;
let exp_arg = -safe_f64_to_float::<T>(0.5)? * offset * offset / (sigma[1] * sigma[1]);
Ok(exp_arg.exp())
})
.collect();
let weights_y = weights_y?;
let input_buffer = allocate_gpu_buffer(safe_as_slice(input)?)?;
let weights_x_buffer = allocate_gpu_buffer(&weights_x)?;
let weights_y_buffer = allocate_gpu_buffer(&weights_y)?;
let mut temp_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let mut output_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let registry = KernelRegistry::new();
let kernel = registry
.get_kernel("separable_gaussian_1d")
.ok_or_else(|| NdimageError::ComputationError("Kernel not found".into()))?;
let params_h = vec![
safe_usize_to_float::<T>(h * w)?,
safe_usize_to_float::<T>(radius_x)?,
T::zero(), safe_usize_to_float::<T>(w)?,
safe_usize_to_float::<T>(h)?,
];
executor.execute_kernel(
kernel,
&[input_buffer.as_ref(), weights_x_buffer.as_ref()],
&[temp_buffer.as_mut()],
&[h, w],
¶ms_h,
)?;
let params_v = vec![
safe_usize_to_float::<T>(h * w)?,
safe_usize_to_float::<T>(radius_y)?,
T::one(), safe_usize_to_float::<T>(w)?,
safe_usize_to_float::<T>(h)?,
];
executor.execute_kernel(
kernel,
&[temp_buffer.as_ref(), weights_y_buffer.as_ref()],
&[output_buffer.as_mut()],
&[h, w],
¶ms_v,
)?;
let mut output_data = vec![T::zero(); h * w];
output_buffer.copy_to_host(&mut output_data)?;
Ok(Array::from_shape_vec((h, w), output_data)?)
}
#[allow(dead_code)]
pub fn gpu_bilateral_filter_2d<T>(
input: &ArrayView2<T>,
sigma_spatial: T,
sigma_intensity: T,
radius: usize,
executor: &dyn GpuKernelExecutor<T>,
) -> NdimageResult<Array<T, scirs2_core::ndarray::Ix2>>
where
T: Float + FromPrimitive + Debug + Clone + Default + Send + Sync + 'static,
{
let (h, w) = input.dim();
let input_buffer = allocate_gpu_buffer(safe_as_slice(input)?)?;
let mut output_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let params = vec![
sigma_spatial,
sigma_intensity,
safe_usize_to_float::<T>(radius)?,
safe_usize_to_float::<T>(w)?,
safe_usize_to_float::<T>(h)?,
];
let registry = KernelRegistry::new();
let kernel = registry
.get_kernel("bilateral_filter_2d")
.ok_or_else(|| NdimageError::ComputationError("Kernel not found".into()))?;
executor.execute_kernel(
kernel,
&[input_buffer.as_ref()],
&[output_buffer.as_mut()],
&[h, w],
¶ms,
)?;
let mut output_data = vec![T::zero(); h * w];
output_buffer.copy_to_host(&mut output_data)?;
Ok(Array::from_shape_vec((h, w), output_data)?)
}
#[allow(dead_code)]
pub fn gpu_sobel_filter_2d<T>(
input: &ArrayView2<T>,
executor: &dyn GpuKernelExecutor<T>,
) -> NdimageResult<(
Array<T, scirs2_core::ndarray::Ix2>,
Array<T, scirs2_core::ndarray::Ix2>,
Array<T, scirs2_core::ndarray::Ix2>,
)>
where
T: Float + FromPrimitive + Debug + Clone + Default + Send + Sync + 'static,
{
let (h, w) = input.dim();
let input_buffer = allocate_gpu_buffer(safe_as_slice(input)?)?;
let mut output_x_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let mut output_y_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let mut magnitude_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let params = vec![safe_usize_to_float::<T>(w)?, safe_usize_to_float::<T>(h)?];
let registry = KernelRegistry::new();
let kernel = registry
.get_kernel("sobel_filter_2d")
.ok_or_else(|| NdimageError::ComputationError("Kernel not found".into()))?;
executor.execute_kernel(
kernel,
&[input_buffer.as_ref()],
&[
output_x_buffer.as_mut(),
output_y_buffer.as_mut(),
magnitude_buffer.as_mut(),
],
&[h, w],
¶ms,
)?;
let mut output_x_data = vec![T::zero(); h * w];
let mut output_y_data = vec![T::zero(); h * w];
let mut magnitude_data = vec![T::zero(); h * w];
output_x_buffer.copy_to_host(&mut output_x_data)?;
output_y_buffer.copy_to_host(&mut output_y_data)?;
magnitude_buffer.copy_to_host(&mut magnitude_data)?;
Ok((
Array::from_shape_vec((h, w), output_x_data)?,
Array::from_shape_vec((h, w), output_y_data)?,
Array::from_shape_vec((h, w), magnitude_data)?,
))
}
#[allow(dead_code)]
pub fn gpu_laplacian_filter_2d<T>(
input: &ArrayView2<T>,
connectivity: usize,
executor: &dyn GpuKernelExecutor<T>,
) -> NdimageResult<Array<T, scirs2_core::ndarray::Ix2>>
where
T: Float + FromPrimitive + Debug + Clone + Default + Send + Sync + 'static,
{
let (h, w) = input.dim();
let input_buffer = allocate_gpu_buffer(safe_as_slice(input)?)?;
let mut output_buffer = allocate_gpu_buffer_empty::<T>(h * w)?;
let params = vec![
safe_usize_to_float::<T>(w)?,
safe_usize_to_float::<T>(h)?,
safe_usize_to_float::<T>(connectivity)?,
];
let registry = KernelRegistry::new();
let kernel = registry
.get_kernel("laplacian_filter_2d")
.ok_or_else(|| NdimageError::ComputationError("Kernel not found".into()))?;
executor.execute_kernel(
kernel,
&[input_buffer.as_ref()],
&[output_buffer.as_mut()],
&[h, w],
¶ms,
)?;
let mut output_data = vec![T::zero(); h * w];
output_buffer.copy_to_host(&mut output_data)?;
Ok(Array::from_shape_vec((h, w), output_data)?)
}
#[allow(dead_code)]
fn allocate_gpu_buffer<T>(data: &[T]) -> NdimageResult<Box<dyn GpuBuffer<T>>>
where
T: Clone + Default + Send + Sync + Copy + 'static,
{
#[cfg(feature = "cuda")]
{
return crate::backend::cuda::allocate_gpu_buffer(data);
}
#[cfg(not(feature = "cuda"))]
{
Ok(Box::new(CpuFallbackBuffer::from_slice(data)?))
}
}
#[allow(dead_code)]
fn allocate_gpu_buffer_empty<T>(size: usize) -> NdimageResult<Box<dyn GpuBuffer<T>>>
where
T: Clone + Default + Send + Sync + Copy + 'static,
{
#[cfg(feature = "cuda")]
{
return crate::backend::cuda::allocate_gpu_buffer_empty(size);
}
#[cfg(not(feature = "cuda"))]
{
Ok(Box::new(CpuFallbackBuffer::empty(size)?))
}
}