scirs2-ndimage 0.4.2

// Enhanced Gaussian blur kernel for GPU execution
// This kernel can be compiled for CUDA, OpenCL, or Metal
// Optimized with shared memory and boundary handling options

#define MAX_RADIUS 16
#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16
#define TILE_SIZE_X (BLOCK_SIZE_X + 2 * MAX_RADIUS)
#define TILE_SIZE_Y (BLOCK_SIZE_Y + 2 * MAX_RADIUS)

__kernel void gaussian_blur_2d(
    __global const float* input,
    __global float* output,
    const float sigma_x,
    const float sigma_y,
    const int height,
    const int width
) {
    int x = get_global_id(0);
    int y = get_global_id(1);
    
    if (x >= width || y >= height) return;
    
    // Calculate kernel radius based on sigma (clamped to MAX_RADIUS)
    int radius_x = min((int)(3.0f * sigma_x), MAX_RADIUS);
    int radius_y = min((int)(3.0f * sigma_y), MAX_RADIUS);
    
    float sum = 0.0f;
    float weight_sum = 0.0f;
    
    // Precompute sigma reciprocals for optimization
    float sigma_x_inv_sq = 1.0f / (sigma_x * sigma_x);
    float sigma_y_inv_sq = 1.0f / (sigma_y * sigma_y);
    
    // Apply Gaussian filter
    for (int dy = -radius_y; dy <= radius_y; dy++) {
        for (int dx = -radius_x; dx <= radius_x; dx++) {
            // Clamp to image boundaries
            int px = clamp(x + dx, 0, width - 1);
            int py = clamp(y + dy, 0, height - 1);
            
            // Calculate Gaussian weight with optimized computation
            float dist_sq = (dx * dx) * sigma_x_inv_sq + (dy * dy) * sigma_y_inv_sq;
            float weight = exp(-0.5f * dist_sq);
            
            // Accumulate weighted sum
            sum += input[py * width + px] * weight;
            weight_sum += weight;
        }
    }
    
    // Normalize and write output
    output[y * width + x] = sum / weight_sum;
}

// Optimized separable Gaussian blur kernel
__kernel void gaussian_blur_separable_horizontal(
    __global const float* input,
    __global float* output,
    __global const float* weights,
    const int radius,
    const int height,
    const int width
) {
    int x = get_global_id(0);
    int y = get_global_id(1);
    
    if (x >= width || y >= height) return;
    
    float sum = 0.0f;
    
    // Horizontal convolution
    for (int dx = -radius; dx <= radius; dx++) {
        int px = clamp(x + dx, 0, width - 1);
        sum += input[y * width + px] * weights[dx + radius];
    }
    
    output[y * width + x] = sum;
}

__kernel void gaussian_blur_separable_vertical(
    __global const float* input,
    __global float* output,
    __global const float* weights,
    const int radius,
    const int height,
    const int width
) {
    int x = get_global_id(0);
    int y = get_global_id(1);
    
    if (x >= width || y >= height) return;
    
    float sum = 0.0f;
    
    // Vertical convolution
    for (int dy = -radius; dy <= radius; dy++) {
        int py = clamp(y + dy, 0, height - 1);
        sum += input[py * width + x] * weights[dy + radius];
    }
    
    output[y * width + x] = sum;
}

// Shared memory optimized version for large kernels
__kernel void gaussian_blur_2d_shared(
    __global const float* input,
    __global float* output,
    const float sigma_x,
    const float sigma_y,
    const int height,
    const int width
) {
    __local float tile[TILE_SIZE_Y][TILE_SIZE_X];
    
    int local_x = get_local_id(0);
    int local_y = get_local_id(1);
    int group_x = get_group_id(0);
    int group_y = get_group_id(1);
    
    int global_x = group_x * BLOCK_SIZE_X + local_x;
    int global_y = group_y * BLOCK_SIZE_Y + local_y;
    
    int radius_x = min((int)(3.0f * sigma_x), MAX_RADIUS);
    int radius_y = min((int)(3.0f * sigma_y), MAX_RADIUS);
    
    // Load data into shared memory with halo
    for (int ty = local_y; ty < TILE_SIZE_Y; ty += BLOCK_SIZE_Y) {
        for (int tx = local_x; tx < TILE_SIZE_X; tx += BLOCK_SIZE_X) {
            int src_x = group_x * BLOCK_SIZE_X + tx - radius_x;
            int src_y = group_y * BLOCK_SIZE_Y + ty - radius_y;
            
            src_x = clamp(src_x, 0, width - 1);
            src_y = clamp(src_y, 0, height - 1);
            
            tile[ty][tx] = input[src_y * width + src_x];
        }
    }
    
    barrier(CLK_LOCAL_MEM_FENCE);
    
    if (global_x >= width || global_y >= height) return;
    
    float sum = 0.0f;
    float weight_sum = 0.0f;
    
    float sigma_x_inv_sq = 1.0f / (sigma_x * sigma_x);
    float sigma_y_inv_sq = 1.0f / (sigma_y * sigma_y);
    
    int tile_center_x = local_x + radius_x;
    int tile_center_y = local_y + radius_y;
    
    // Apply Gaussian filter using shared memory
    for (int dy = -radius_y; dy <= radius_y; dy++) {
        for (int dx = -radius_x; dx <= radius_x; dx++) {
            float dist_sq = (dx * dx) * sigma_x_inv_sq + (dy * dy) * sigma_y_inv_sq;
            float weight = exp(-0.5f * dist_sq);
            
            sum += tile[tile_center_y + dy][tile_center_x + dx] * weight;
            weight_sum += weight;
        }
    }
    
    output[global_y * width + global_x] = sum / weight_sum;
}