pub fn gpu_batchnorm_forward(
_input: &CudaBuffer<f32>,
_weight: &CudaBuffer<f32>,
_bias: &CudaBuffer<f32>,
_running_mean: &mut CudaBuffer<f32>,
_running_var: &mut CudaBuffer<f32>,
_channels: usize,
_spatial: usize,
_eps: f32,
_momentum: f32,
_training: bool,
device: &GpuDevice,
) -> GpuResult<(CudaBuffer<f32>, CudaBuffer<f32>, CudaBuffer<f32>)>
BatchNorm2d forward pass on the GPU (placeholder — the kernel's pass-1 indexing still needs refinement). Currently this function only validates that the kernel compiles and then returns an error, so that callers fall back to the CPU path.