pub fn depthwise_conv_3x3_simd( input: &[f32], kernel: &[f32], output: &mut [f32], h: usize, w: usize, c: usize, stride: usize, padding: usize, )
SIMD-accelerated depthwise 3x3 convolution over `f32` slices: takes `input` and `kernel` as read-only slices and `output` as a mutable slice.