pub fn relu_out(input: &Tensor, output: &mut Tensor)
Applies ReLU to `input` and writes the result into the pre-allocated `output` tensor, so the operation has zero allocation overhead.