boostr 0.1.0

ML framework built on numr - attention, quantization, model architectures
//! Gradient bucket manager for overlapping allreduce with backward pass
//!
//! Groups model parameters into fixed-size buckets and fires allreduce
//! on each bucket as soon as all its gradients are ready, enabling
//! communication/computation overlap during the backward pass.

use std::collections::HashMap;
use std::sync::Arc;

use crate::distributed::comm_utils::all_reduce_tensor;
use crate::error::{Error, Result};
use numr::autograd::{GradStore, Var};
use numr::dtype::DType;
use numr::ops::{ScalarOps, TensorOps};
use numr::runtime::{Communicator, ReduceOp, Runtime, RuntimeClient};
use numr::tensor::{Tensor, TensorId};

/// A bucket of parameters whose gradients are allreduced together.
struct Bucket<R: Runtime> {
    /// Parameter IDs in this bucket
    param_ids: Vec<TensorId>,
    /// Number of elements per parameter
    param_numels: Vec<usize>,
    /// Original shapes for each parameter's gradient
    param_shapes: Vec<Vec<usize>>,
    /// DType for the flat buffer (used to validate dtype consistency)
    dtype: DType,
    /// Received gradients (stored as we get hook notifications)
    received_grads: HashMap<TensorId, Tensor<R>>,
    /// Flat contiguous buffer for allreduce
    flat_buffer: Option<Tensor<R>>,
    /// Whether allreduce has been launched for this bucket
    allreduce_launched: bool,
    /// Completion event handle (set when using overlapped mode)
    completion_event: Option<u64>,
}

/// Manages gradient buckets and fires allreduce during backward.
///
/// Parameters are grouped into buckets of approximately `bucket_size_bytes`.
/// When all gradients in a bucket are ready, they are flattened into a
/// contiguous buffer and allreduced. After backward completes, call
/// [`GradientBucketManager::wait_and_unflatten`] to sync pending allreduce ops and scatter
/// the averaged gradients back into the grad store.
///
/// # Event-Based Compute-Communication Overlap
///
/// When `compute_stream_handle` is provided and the communicator supports
/// [`StreamSyncOps`](numr::runtime::StreamSyncOps), allreduce operations are
/// issued on a dedicated communication stream using CUDA event synchronization.
/// This allows gradient communication to overlap with continued backward
/// computation on the compute stream, yielding 30-40% throughput improvement
/// (the same technique used by PyTorch DDP).
///
/// On CPU or when the communicator lacks stream support, the manager falls
/// back to blocking allreduce during the backward pass.
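///
/// # Example
///
/// A minimal sketch of the per-step lifecycle, assuming `client`, `comm`,
/// a `GradStore` named `grads`, and a `param_info` list of
/// `(TensorId, usize, DType)` tuples are already available; how gradients
/// are intercepted during backward depends on the training loop and is
/// only indicated here:
///
/// ```ignore
/// let mut manager = GradientBucketManager::new(
///     &param_info,
///     comm.clone(),
///     25 * 1024 * 1024, // ~25 MiB buckets
///     compute_stream,   // Option<u64>; None falls back to blocking allreduce
/// );
///
/// // During backward, as each parameter's gradient is produced:
/// //     manager.mark_grad_ready(param_id, &grad, &client)?;
///
/// // After backward: wait for pending allreduces and scatter averaged grads.
/// manager.wait_and_unflatten(&client, &mut grads)?;
///
/// // Before the next training step:
/// manager.reset();
/// ```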
pub struct GradientBucketManager<R: Runtime> {
    buckets: Vec<Bucket<R>>,
    /// Maps parameter ID → bucket index
    param_to_bucket: HashMap<TensorId, usize>,
    comm: Arc<dyn Communicator>,
    /// Compute stream handle for event-based overlap (None = fallback to blocking sync)
    compute_stream_handle: Option<u64>,
}

impl<R: Runtime<DType = DType>> GradientBucketManager<R> {
    /// Create a new bucket manager.
    ///
    /// # Arguments
    ///
    /// * `param_info` - `(id, numel, dtype)` tuples ordered so that the
    ///   gradients produced earliest in the backward pass come first
    ///   (last layers first; see [`param_order_from_graph`]). This lets the
    ///   leading buckets fill and launch allreduce early, maximizing overlap.
    /// * `comm` - The communicator for allreduce operations.
    /// * `bucket_size_bytes` - Target bucket size in bytes (default: 25 MiB).
    /// * `compute_stream_handle` - Optional compute stream handle from
    ///   `RuntimeClient::compute_stream_handle()`. When both this and
    ///   `comm.as_stream_sync()` are available, enables event-based
    ///   compute-communication overlap for 30-40% throughput improvement.
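    ///
    /// # Bucketing example
    ///
    /// A worked illustration of the splitting rule (sizes chosen for clarity,
    /// not taken from any real model): with `bucket_size_bytes` of 25 MiB and
    /// three F32 parameters of 10 MiB each, the first two share a bucket
    /// (20 MiB fits) while the third starts a new one, so `num_buckets()`
    /// reports 2. A dtype change between adjacent parameters also forces a
    /// new bucket regardless of size.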
    pub fn new(
        param_info: &[(TensorId, usize, DType)],
        comm: Arc<dyn Communicator>,
        bucket_size_bytes: usize,
        compute_stream_handle: Option<u64>,
    ) -> Self {
        let mut buckets = Vec::new();
        let mut param_to_bucket = HashMap::new();
        let mut current_ids = Vec::new();
        let mut current_numels = Vec::new();
        let mut current_bytes = 0usize;
        let mut current_dtype = DType::F32;

        for &(id, numel, dtype) in param_info {
            let elem_bytes = dtype.size_in_bytes();
            let param_bytes = numel * elem_bytes;

            // Start a new bucket if adding this param would exceed the limit
            // or if dtype changes (all params in a bucket must share dtype)
            if !current_ids.is_empty()
                && (current_bytes + param_bytes > bucket_size_bytes || dtype != current_dtype)
            {
                let n = current_ids.len();
                for &cid in &current_ids {
                    param_to_bucket.insert(cid, buckets.len());
                }
                buckets.push(Bucket {
                    param_ids: std::mem::take(&mut current_ids),
                    param_numels: std::mem::take(&mut current_numels),
                    param_shapes: Vec::with_capacity(n),
                    dtype: current_dtype,
                    received_grads: HashMap::new(),
                    flat_buffer: None,
                    allreduce_launched: false,
                    completion_event: None,
                });
                current_bytes = 0;
            }

            current_ids.push(id);
            current_numels.push(numel);
            current_bytes += param_bytes;
            current_dtype = dtype;
        }

        // Flush remaining params into a final bucket
        if !current_ids.is_empty() {
            let n = current_ids.len();
            for &cid in &current_ids {
                param_to_bucket.insert(cid, buckets.len());
            }
            buckets.push(Bucket {
                param_ids: current_ids,
                param_numels: current_numels,
                param_shapes: Vec::with_capacity(n),
                dtype: current_dtype,
                received_grads: HashMap::new(),
                flat_buffer: None,
                allreduce_launched: false,
                completion_event: None,
            });
        }

        // Enable overlapped mode only if both stream sync and compute stream are available.
        // When the communicator lacks stream support, silently fall back to blocking allreduce.
        let overlap_handle = if comm.as_stream_sync().is_some() {
            compute_stream_handle
        } else {
            // Communicator does not support StreamSyncOps; event-based overlap unavailable.
            None
        };

        Self {
            buckets,
            param_to_bucket,
            comm,
            compute_stream_handle: overlap_handle,
        }
    }

    /// Mark a gradient as ready. When all grads in a bucket are ready,
    /// flatten them into a contiguous buffer and launch allreduce.
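    ///
    /// A minimal sketch of driving this from the backward pass, assuming the
    /// surrounding training loop can observe each leaf gradient as it is
    /// produced (the hook mechanism itself is not part of this type):
    ///
    /// ```ignore
    /// // For every (param_id, grad) pair the backward pass reports:
    /// manager.mark_grad_ready(param_id, &grad, &client)?;
    /// // Gradients for parameters the manager does not track are ignored.
    /// ```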
    pub fn mark_grad_ready<C>(&mut self, id: TensorId, grad: &Tensor<R>, client: &C) -> Result<()>
    where
        C: RuntimeClient<R> + TensorOps<R>,
    {
        let bucket_idx = match self.param_to_bucket.get(&id) {
            Some(&idx) => idx,
            None => return Ok(()), // Not a tracked parameter
        };

        let bucket = &mut self.buckets[bucket_idx];
        if bucket.allreduce_launched {
            return Ok(()); // Already launched
        }

        // Clone required: the hook borrows grad from the backward pass, but we
        // need to own it until flatten_and_allreduce runs. Temporary 2x memory
        // per gradient until the bucket is flattened.
        bucket.received_grads.insert(id, grad.clone());

        // Check if all grads in this bucket are ready
        if bucket.received_grads.len() < bucket.param_ids.len() {
            return Ok(());
        }

        // All grads ready — flatten into contiguous buffer and launch allreduce
        self.flatten_and_allreduce(bucket_idx, client)
    }

    /// Flatten all gradients in a bucket into a contiguous buffer and launch allreduce.
    fn flatten_and_allreduce<C>(&mut self, bucket_idx: usize, client: &C) -> Result<()>
    where
        C: RuntimeClient<R> + TensorOps<R>,
    {
        let bucket = &mut self.buckets[bucket_idx];

        // Validate dtype consistency
        for &pid in &bucket.param_ids {
            if let Some(g) = bucket.received_grads.get(&pid) {
                if g.dtype() != bucket.dtype {
                    return Err(Error::DistributedError {
                        reason: format!(
                            "dtype mismatch in bucket {bucket_idx}: expected {:?}, got {:?}",
                            bucket.dtype,
                            g.dtype()
                        ),
                    });
                }
            }
        }

        // Save original shapes and collect flattened gradient tensors
        bucket.param_shapes.clear();
        let mut flat_grads: Vec<Tensor<R>> = Vec::with_capacity(bucket.param_ids.len());
        for &pid in &bucket.param_ids {
            let g = bucket
                .received_grads
                .get(&pid)
                .ok_or_else(|| Error::DistributedError {
                    reason: format!("gradient missing for param in bucket {bucket_idx}"),
                })?;
            bucket.param_shapes.push(g.shape().to_vec());
            let flat = g.flatten().map_err(|e| Error::DistributedError {
                reason: format!("flatten gradient failed: {e}"),
            })?;
            flat_grads.push(flat);
        }

        // Concatenate into one contiguous buffer
        let refs: Vec<&Tensor<R>> = flat_grads.iter().collect();
        let flat_buffer = client.cat(&refs, 0).map_err(|e| Error::DistributedError {
            reason: format!("cat gradients failed: {e}"),
        })?;

        // Launch allreduce — with event-based overlap if available
        if let Some(compute_stream) = self.compute_stream_handle {
            let sync = self
                .comm
                .as_stream_sync()
                .expect("compute_stream_handle is Some only when as_stream_sync() is Some");

            // 1. Record event on compute stream (gradient data is ready)
            let ready_event = sync.create_event().map_err(|e| Error::DistributedError {
                reason: format!("create ready event failed: {e}"),
            })?;

            // Use a closure to ensure ready_event cleanup on any error path.
            let overlap_result = (|| -> Result<u64> {
                sync.record_on_stream(ready_event, compute_stream)
                    .map_err(|e| Error::DistributedError {
                        reason: format!("record ready event failed: {e}"),
                    })?;

                // 2. Make comm stream wait for gradient data to be ready.
                // After comm_stream_wait_event returns, the CUDA driver has captured
                // the event dependency in the comm stream's work queue. The event
                // handle is safe to destroy: CUDA events are reference-counted
                // internally and the driver keeps the dependency alive until the
                // stream has executed past the wait point.
                sync.comm_stream_wait_event(ready_event)
                    .map_err(|e| Error::DistributedError {
                        reason: format!("comm stream wait for ready event failed: {e}"),
                    })?;

                // 3. Launch allreduce (runs on comm stream, non-blocking to compute)
                all_reduce_tensor(self.comm.as_ref(), &flat_buffer, ReduceOp::Sum)?;

                // 4. Create and record completion event on comm stream
                let completion_event =
                    sync.create_event().map_err(|e| Error::DistributedError {
                        reason: format!("create completion event failed: {e}"),
                    })?;

                if let Err(e) = sync.record_on_comm_stream(completion_event) {
                    let _ = sync.destroy_event(completion_event);
                    return Err(Error::DistributedError {
                        reason: format!("record completion event failed: {e}"),
                    });
                }

                Ok(completion_event)
            })();

            // Always destroy the ready event — safe because CUDA events are
            // reference-counted; the driver holds the dependency until the comm
            // stream executes past its wait point.
            let _ = sync.destroy_event(ready_event);

            bucket.completion_event = Some(overlap_result?);
        } else {
            // Fallback: blocking allreduce (no overlap)
            all_reduce_tensor(self.comm.as_ref(), &flat_buffer, ReduceOp::Sum)?;
        }

        bucket.flat_buffer = Some(flat_buffer);
        bucket.allreduce_launched = true;

        Ok(())
    }

    /// After backward completes: sync all pending allreduce ops, unflatten
    /// buffers back into individual gradients, and divide by world_size.
    ///
    /// Writes the averaged gradients into the provided `GradStore`.
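    ///
    /// A sketch of the post-backward sequence, assuming `grads` is the
    /// `GradStore` produced by the backward pass (what consumes the averaged
    /// gradients afterwards, e.g. an optimizer step, is up to the caller):
    ///
    /// ```ignore
    /// manager.wait_and_unflatten(&client, &mut grads)?;
    /// // `grads` now holds gradients summed across ranks and divided by
    /// // world_size for every bucketed parameter.
    /// manager.reset(); // prepare buckets for the next backward pass
    /// ```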
    pub fn wait_and_unflatten<C>(&mut self, client: &C, grads: &mut GradStore<R>) -> Result<()>
    where
        C: RuntimeClient<R> + TensorOps<R> + ScalarOps<R>,
    {
        let world_size = self.comm.world_size();
        let scale = 1.0 / world_size as f64;

        if let Some(compute_stream) = self.compute_stream_handle {
            // Event-based: make compute stream wait on each bucket's completion event
            let sync = self
                .comm
                .as_stream_sync()
                .expect("compute_stream_handle is Some only when as_stream_sync() is Some");
            for bucket in &mut self.buckets {
                if let Some(event) = bucket.completion_event.take() {
                    if let Err(e) = sync.stream_wait_event(compute_stream, event) {
                        let _ = sync.destroy_event(event);
                        return Err(Error::DistributedError {
                            reason: format!("compute stream wait for completion event failed: {e}"),
                        });
                    }
                    let _ = sync.destroy_event(event);
                }
            }
        } else {
            // Fallback: blocking sync
            self.comm.sync().map_err(|e| Error::DistributedError {
                reason: format!("sync after allreduce failed: {e}"),
            })?;
        }

        // Unflatten each bucket's flat buffer back into individual gradients
        for bucket in &mut self.buckets {
            let flat_buffer = match bucket.flat_buffer.take() {
                Some(buf) => buf,
                None => continue,
            };

            // Slice the flat buffer to extract each param's gradient
            let mut offset = 0usize;
            for (i, &pid) in bucket.param_ids.iter().enumerate() {
                let numel = bucket.param_numels[i];
                let shape = &bucket.param_shapes[i];

                // Extract this param's slice from the flat buffer
                let flat_grad =
                    flat_buffer
                        .narrow(0, offset, numel)
                        .map_err(|e| Error::DistributedError {
                            reason: format!("narrow failed during unflatten: {e}"),
                        })?;

                // Reshape to match original gradient shape
                let reshaped = flat_grad
                    .reshape(shape)
                    .map_err(|e| Error::DistributedError {
                        reason: format!("reshape failed during unflatten: {e}"),
                    })?;

                // Scale by 1/world_size to average
                let averaged = if world_size > 1 {
                    client.mul_scalar(&reshaped, scale)?
                } else {
                    reshaped
                };

                grads.insert(pid, averaged);
                offset += numel;
            }
        }

        Ok(())
    }

    /// Reset all buckets for a new backward pass.
    pub fn reset(&mut self) {
        let sync = self.comm.as_stream_sync();
        for bucket in &mut self.buckets {
            bucket.received_grads.clear();
            bucket.allreduce_launched = false;
            bucket.flat_buffer = None;
            bucket.param_shapes.clear();
            // Clean up any leaked completion events
            if let Some(event) = bucket.completion_event.take() {
                if let Some(s) = sync {
                    let _ = s.destroy_event(event);
                }
            }
        }
    }

    /// Number of buckets.
    pub fn num_buckets(&self) -> usize {
        self.buckets.len()
    }
}

/// Extract leaf parameter IDs from a computation graph in backward traversal order.
///
/// Performs a topological sort of the graph (same as backward), collects
/// leaf node IDs (those with no `grad_fn`), and returns them in the order
/// they are encountered during backward (reverse topological order).
///
/// This ordering is optimal for bucket construction: gradients computed
/// first during backward should be in the same bucket so the bucket fills
/// quickly and allreduce can start early.
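///
/// A sketch of feeding this ordering into [`GradientBucketManager::new`],
/// assuming `params` maps each trainable parameter's `TensorId` to its
/// tensor (the map is illustrative; the returned leaves may also include
/// non-parameter tensors, so filtering against a known parameter set is
/// typical):
///
/// ```ignore
/// let ordered = param_order_from_graph(&loss);
/// let param_info: Vec<(TensorId, usize, DType)> = ordered
///     .iter()
///     .filter_map(|id| {
///         params
///             .get(id)
///             .map(|t| (*id, t.shape().iter().product::<usize>(), t.dtype()))
///     })
///     .collect();
/// ```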
pub fn param_order_from_graph<R: Runtime>(loss: &Var<R>) -> Vec<TensorId> {
    use std::collections::HashSet;

    let mut topo = Vec::new();
    let mut visited = HashSet::new();

    fn dfs<R: Runtime>(
        id: TensorId,
        grad_fn: Option<Arc<dyn numr::autograd::GradFn<R>>>,
        visited: &mut HashSet<TensorId>,
        topo: &mut Vec<(TensorId, bool)>, // (id, is_leaf)
    ) {
        if visited.contains(&id) {
            return;
        }
        visited.insert(id);

        let input_ids: Vec<TensorId> = grad_fn
            .as_ref()
            .map(|gf| gf.inputs().to_vec())
            .unwrap_or_default();

        if let Some(gf) = &grad_fn {
            for (input_id, input_grad_fn) in input_ids.iter().zip(gf.input_grad_fns()) {
                dfs(*input_id, input_grad_fn, visited, topo);
            }
        }

        topo.push((id, grad_fn.is_none()));
    }

    dfs(loss.id(), loss.grad_fn().cloned(), &mut visited, &mut topo);

    // Reverse topological order, keep only leaves
    topo.into_iter()
        .rev()
        .filter(|(_, is_leaf)| *is_leaf)
        .map(|(id, _)| id)
        .collect()
}

#[cfg(test)]
#[path = "bucket_manager_tests.rs"]
mod tests;