pub fn sync_gradient(gradient: &mut Tensor<f32>, pg: &ProcessGroup)
Synchronizes a single gradient tensor (`gradient`) across the process group `pg`, updating the tensor in place.