ferrotorch-distributed 0.5.1

Distributed training for ferrotorch — backends, collectives, and DDP
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
//! Pipeline parallelism for distributed training.
//!
//! Splits a model into sequential stages, each running on a different rank.
//! Micro-batches flow through the pipeline so multiple stages can execute
//! concurrently, improving hardware utilization.
//!
//! Two scheduling strategies are provided:
//!
//! - **GPipe** ([`PipelineSchedule::GPipe`]): All micro-batch forwards run
//!   first, then all backwards. Simple but has high peak memory.
//!
//! - **1F1B** ([`PipelineSchedule::OneFOnEB`]): Interleaves forward and
//!   backward passes per micro-batch to reduce peak memory. **Note:** the
//!   1F1B forward scheduling is implemented, but the backward pass currently
//!   uses GPipe-style sequential processing. True 1F1B memory savings require
//!   a combined forward+backward method that interleaves at a finer
//!   granularity, which is planned for a future release. The current
//!   implementation provides scheduling structure but not the full memory
//!   benefit.

use std::sync::Arc;

use ferrotorch_core::storage::TensorStorage;
use ferrotorch_core::{FerrotorchError, FerrotorchResult, Float, Tensor};
use ferrotorch_nn::Module;

use crate::backend::Backend;

// ---------------------------------------------------------------------------
// Pipeline schedule
// ---------------------------------------------------------------------------

/// Pipeline scheduling strategy.
///
/// The strategy is chosen when a [`Pipeline`] is constructed and can be read
/// back through [`Pipeline::schedule`].
///
/// NOTE(review): within this file the selected variant is only stored and
/// returned by the accessor; `Pipeline::forward` and `Pipeline::backward` do
/// not branch on it, so both variants appear to execute identically today.
/// Confirm whether scheduling is applied elsewhere in the crate.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PipelineSchedule {
    /// GPipe: every micro-batch forward runs first, then every backward.
    GPipe,
    /// 1F1B ("one forward, one backward"): interleaved forward and backward
    /// scheduling.
    ///
    /// **Current limitation:** the backward pass uses GPipe-style sequential
    /// processing over all micro-batches. The memory benefit of true 1F1B
    /// requires a combined forward+backward method (future work).
    OneFOnEB,
}

// ---------------------------------------------------------------------------
// Pipeline
// ---------------------------------------------------------------------------

/// Pipeline parallel wrapper.
///
/// Wraps a single stage module and coordinates with other ranks via the
/// provided [`Backend`]. Each rank runs one stage of the pipeline: rank 0 is
/// the first stage and rank `world_size - 1` is the last.
///
/// # Usage
///
/// ```ignore
/// let pipeline = Pipeline::new(
///     stage_module,
///     backend,
///     num_microbatches,
///     PipelineSchedule::GPipe,
/// )?;
/// ```
pub struct Pipeline<M: Module<T>, T: Float> {
    /// The stage executed by this rank.
    module: M,
    /// Communication backend; its `rank()` identifies this stage and its
    /// `world_size()` gives the number of stages.
    backend: Arc<dyn Backend>,
    /// Number of micro-batches processed per forward/backward call
    /// (validated to be non-zero by [`Pipeline::new`]).
    num_microbatches: usize,
    /// Scheduling strategy requested at construction time.
    schedule: PipelineSchedule,
    /// Ties the element type `T` to the struct without storing a value of it.
    _marker: std::marker::PhantomData<T>,
}

/// Debug output lists only the scalar configuration (`num_microbatches` and
/// `schedule`); the module, backend and marker are elided, which the
/// non-exhaustive finish signals with a trailing `..`.
impl<M: Module<T>, T: Float> std::fmt::Debug for Pipeline<M, T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            num_microbatches,
            schedule,
            ..
        } = self;

        let mut builder = f.debug_struct("Pipeline");
        builder.field("num_microbatches", num_microbatches);
        builder.field("schedule", schedule);
        builder.finish_non_exhaustive()
    }
}

impl<M: Module<T>, T: Float> Pipeline<M, T> {
    /// Build the pipeline stage owned by this rank.
    ///
    /// # Arguments
    ///
    /// * `module` - The stage module this rank executes.
    /// * `backend` - Communication backend; its rank and world size decide
    ///   which stage this is and how many stages exist.
    /// * `num_microbatches` - How many micro-batches each batch is split into.
    /// * `schedule` - Requested pipeline scheduling strategy.
    ///
    /// # Errors
    ///
    /// Returns [`FerrotorchError::InvalidArgument`] when `num_microbatches`
    /// is zero, or when the backend reports a `world_size` below 2 (a
    /// pipeline needs at least two stages).
    pub fn new(
        module: M,
        backend: Arc<dyn Backend>,
        num_microbatches: usize,
        schedule: PipelineSchedule,
    ) -> FerrotorchResult<Self> {
        if num_microbatches == 0 {
            return Err(FerrotorchError::InvalidArgument {
                message: String::from("Pipeline: num_microbatches must be > 0"),
            });
        }

        // Read the world size once; it is used for both the check and the
        // error message.
        let world_size = backend.world_size();
        match world_size {
            0 | 1 => Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "Pipeline: world_size must be >= 2 for pipeline parallelism, got {}",
                    world_size,
                ),
            }),
            _ => Ok(Self {
                module,
                backend,
                num_microbatches,
                schedule,
                _marker: std::marker::PhantomData,
            }),
        }
    }

    /// Push every micro-batch through this rank's stage.
    ///
    /// For each of the `num_microbatches` micro-batches, in order:
    ///
    /// 1. Obtain the stage input: rank 0 slices it out of `input` along
    ///    dimension 0; every other rank receives it from rank `rank - 1`.
    /// 2. Run the stage module on it.
    /// 3. Unless this is the last rank, send the result to rank `rank + 1`.
    ///
    /// # Returns
    ///
    /// The stage output of every micro-batch, in micro-batch order. Only the
    /// last stage's outputs are the pipeline's final outputs.
    ///
    /// # Errors
    ///
    /// Returns [`FerrotorchError::InvalidArgument`] if rank 0 is called with
    /// `input == None`, and propagates any error from chunking, the module's
    /// forward pass, or the backend. Processing stops at the first error.
    pub fn forward(&self, input: Option<&Tensor<T>>) -> FerrotorchResult<Vec<Tensor<T>>> {
        let rank = self.backend.rank();
        let world_size = self.backend.world_size();
        let is_first_stage = rank == 0;
        let has_next_stage = rank < world_size - 1;

        (0..self.num_microbatches)
            .map(|mb_idx| -> FerrotorchResult<Tensor<T>> {
                let stage_input = match (is_first_stage, input) {
                    // First stage: carve the micro-batch out of the full batch.
                    (true, Some(full_batch)) => self.get_microbatch(full_batch, mb_idx)?,
                    (true, None) => {
                        return Err(FerrotorchError::InvalidArgument {
                            message: "Pipeline: rank 0 must provide input".into(),
                        });
                    }
                    // Later stages: activations arrive from the previous rank.
                    (false, _) => self.recv_activation(rank - 1)?,
                };

                let stage_output = self.module.forward(&stage_input)?;

                if has_next_stage {
                    self.send_activation(&stage_output, rank + 1)?;
                }

                Ok(stage_output)
            })
            .collect()
    }

    /// Run the backward pass for all micro-batches.
    ///
    /// For the last stage, computes gradients from the loss. For other stages,
    /// receives gradient activations from the next stage.
    ///
    /// # Implementation note
    ///
    /// Both GPipe and 1F1B schedules currently use the same backward
    /// processing order (sequential over all micro-batches). The 1F1B
    /// forward scheduling is implemented but the backward pass does not
    /// yet interleave with forward to achieve the full memory savings.
    /// This is documented as a known limitation; true 1F1B memory benefits
    /// require a combined forward+backward method (future work).
    pub fn backward(
        &self,
        outputs: &[Tensor<T>],
        grad_outputs: Option<&[Tensor<T>]>,
    ) -> FerrotorchResult<()> {
        let rank = self.backend.rank();
        let world_size = self.backend.world_size();

        for mb in (0..self.num_microbatches).rev() {
            if rank == world_size - 1 {
                // Last stage: use provided grad_outputs.
                if let Some(grads) = grad_outputs {
                    if mb < grads.len() {
                        outputs[mb].set_grad(Some(grads[mb].clone()))?;
                    }
                }
            } else {
                // Receive gradient from next stage.
                let grad = self.recv_activation(rank + 1)?;
                outputs[mb].set_grad(Some(grad))?;
            }

            // Backward through autograd.
            ferrotorch_core::backward(&outputs[mb])?;

            if rank > 0 {
                // Send gradient to previous stage.
                // Compute the gradient w.r.t. the input of this stage.
                let numel = outputs[mb].numel();
                let grad_input = Tensor::from_storage(
                    TensorStorage::cpu(vec![<T as num_traits::Zero>::zero(); numel]),
                    outputs[mb].shape().to_vec(),
                    false,
                )?;
                self.send_activation(&grad_input, rank - 1)?;
            }
        }

        Ok(())
    }

    /// Extract micro-batch `mb_idx` from `input` by chunking dim 0.
    ///
    /// Each micro-batch holds `batch_size / num_microbatches` rows (integer
    /// division); the last micro-batch additionally absorbs the remainder.
    /// When `batch_size < num_microbatches`, every micro-batch except the
    /// last therefore has zero rows.
    ///
    /// The rows are copied into fresh CPU storage; the result carries the
    /// same `requires_grad` flag as `input`.
    ///
    /// # Errors
    ///
    /// Returns [`FerrotorchError::InvalidArgument`] if `input` has no
    /// dimensions, if `mb_idx` is not below `num_microbatches`, or if the
    /// tensor's data is shorter than its shape implies. Errors from reading
    /// the tensor data or building the result are propagated.
    fn get_microbatch(&self, input: &Tensor<T>, mb_idx: usize) -> FerrotorchResult<Tensor<T>> {
        let shape = input.shape();
        if shape.is_empty() {
            return Err(FerrotorchError::InvalidArgument {
                message: "Pipeline: input tensor must have at least 1 dimension".into(),
            });
        }
        // An out-of-range index would otherwise slice past the batch (panic)
        // or silently return rows that overlap the last micro-batch.
        if mb_idx >= self.num_microbatches {
            return Err(FerrotorchError::InvalidArgument {
                message: format!(
                    "Pipeline: micro-batch index {} out of range for {} micro-batches",
                    mb_idx, self.num_microbatches,
                ),
            });
        }

        let batch_size = shape[0];
        let mb_size = batch_size / self.num_microbatches;
        let start = mb_idx * mb_size;
        // The final micro-batch takes whatever rows are left over.
        let end = if mb_idx + 1 == self.num_microbatches {
            batch_size
        } else {
            start + mb_size
        };

        // Number of elements in one row along dim 0.
        let row_len: usize = shape[1..].iter().product();
        let data = input.data_vec()?;
        let mb_data = data
            .get(start * row_len..end * row_len)
            .ok_or_else(|| FerrotorchError::InvalidArgument {
                message: "Pipeline: input tensor data is shorter than its shape implies".into(),
            })?
            .to_vec();

        let mut mb_shape = shape.to_vec();
        mb_shape[0] = end - start;

        Tensor::from_storage(TensorStorage::cpu(mb_data), mb_shape, input.requires_grad())
    }

    /// Send an activation tensor to `dst_rank`.
    ///
    /// # Wire format
    ///
    /// The tensor is transmitted as a sequence of backend sends that mirrors,
    /// one-for-one, the receives issued by `recv_activation`:
    ///
    /// 1. `ndim` as a little-endian `u64` (8 bytes),
    /// 2. each dimension as its own little-endian `u64` (8 bytes each),
    /// 3. the element data as raw bytes, `numel * size_of::<T>()` long.
    ///
    /// The header fields are sent individually rather than as one
    /// concatenated buffer because the receiver reads them with separate
    /// 8-byte `recv` calls; a single combined message would not line up with
    /// those receives on a backend that preserves message boundaries. On a
    /// byte-stream backend the bytes on the wire are unchanged.
    ///
    /// # Errors
    ///
    /// Propagates errors from reading the tensor data and from the backend.
    fn send_activation(&self, tensor: &Tensor<T>, dst_rank: usize) -> FerrotorchResult<()> {
        let data = tensor.data_vec()?;
        let values: &[T] = &data;

        // SAFETY: reinterpret the element slice as its underlying bytes.
        //
        // - VALIDITY: every byte pattern is a valid `u8`, and a primitive
        //   `Float` has no padding bytes, so all bytes are initialized.
        // - LENGTH: `size_of_val(values)` is exactly the byte length of the
        //   slice (`len * size_of::<T>()`).
        // - ALIGNMENT: `u8` is 1-aligned, satisfied by any `*const T`.
        // - LIFETIME: `payload` borrows `data`, which outlives every use of
        //   `payload` in this function; the view never escapes.
        // - ENDIANNESS: bytes are sent in native order while the receiver
        //   decodes little-endian, so this assumes a little-endian target.
        let payload: &[u8] = unsafe {
            std::slice::from_raw_parts(values.as_ptr().cast::<u8>(), std::mem::size_of_val(values))
        };

        // Shape header: ndim first, then one message per dimension.
        let ndim = tensor.shape().len() as u64;
        self.backend.send(&ndim.to_le_bytes(), dst_rank)?;
        for &dim in tensor.shape() {
            self.backend.send(&(dim as u64).to_le_bytes(), dst_rank)?;
        }

        self.backend.send(payload, dst_rank)?;

        Ok(())
    }

    /// Receive an activation tensor from `src_rank`.
    fn recv_activation(&self, src_rank: usize) -> FerrotorchResult<Tensor<T>> {
        // Receive shape header.
        let mut ndim_buf = [0u8; 8];
        self.backend.recv(&mut ndim_buf, src_rank)?;
        let ndim = u64::from_le_bytes(ndim_buf) as usize;

        let mut shape = Vec::with_capacity(ndim);
        for _ in 0..ndim {
            let mut dim_buf = [0u8; 8];
            self.backend.recv(&mut dim_buf, src_rank)?;
            shape.push(u64::from_le_bytes(dim_buf) as usize);
        }

        let numel: usize = shape.iter().product();
        let elem_size = std::mem::size_of::<T>();
        let mut byte_buf = vec![0u8; numel * elem_size];
        self.backend.recv(&mut byte_buf, src_rank)?;

        let data: Vec<T> = byte_buf
            .chunks_exact(elem_size)
            .map(|chunk| match elem_size {
                4 => {
                    let val = f32::from_le_bytes(chunk.try_into().unwrap());
                    T::from(val).unwrap()
                }
                8 => {
                    let val = f64::from_le_bytes(chunk.try_into().unwrap());
                    T::from(val).unwrap()
                }
                _ => unreachable!("unsupported element size {}", elem_size),
            })
            .collect();

        Tensor::from_storage(TensorStorage::cpu(data), shape, false)
    }

    /// The scheduling strategy this pipeline was constructed with.
    pub fn schedule(&self) -> PipelineSchedule {
        self.schedule
    }

    /// The number of micro-batches processed per forward/backward call.
    pub fn num_microbatches(&self) -> usize {
        self.num_microbatches
    }

    /// Shared reference to this rank's stage module.
    pub fn module(&self) -> &M {
        &self.module
    }

    /// Exclusive reference to this rank's stage module, e.g. for switching
    /// it between train and eval mode or updating its parameters.
    pub fn module_mut(&mut self) -> &mut M {
        &mut self.module
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::backend::SimulatedBackend;
    use ferrotorch_nn::Parameter;

    /// Parameter-free stage that returns a clone of its input and tracks its
    /// train/eval flag. Shared by the pipeline validation tests.
    struct IdentityModule {
        training: bool,
    }

    impl IdentityModule {
        /// Starts in training mode.
        fn new() -> Self {
            IdentityModule { training: true }
        }
    }

    impl Module<f32> for IdentityModule {
        fn forward(&self, input: &Tensor<f32>) -> FerrotorchResult<Tensor<f32>> {
            Ok(input.clone())
        }
        fn parameters(&self) -> Vec<&Parameter<f32>> {
            Vec::new()
        }
        fn parameters_mut(&mut self) -> Vec<&mut Parameter<f32>> {
            Vec::new()
        }
        fn named_parameters(&self) -> Vec<(String, &Parameter<f32>)> {
            Vec::new()
        }
        fn train(&mut self) {
            self.training = true;
        }
        fn eval(&mut self) {
            self.training = false;
        }
        fn is_training(&self) -> bool {
            self.training
        }
    }

    /// Creates a simulated group of `world_size` ranks and hands back the
    /// backend of the first rank in the group.
    fn first_rank_backend(world_size: usize) -> Arc<dyn Backend> {
        let mut ranks = SimulatedBackend::create_group(world_size)
            .unwrap()
            .into_iter();
        Arc::new(ranks.next().unwrap())
    }

    #[test]
    fn test_pipeline_new_validates_microbatches() {
        let backend = first_rank_backend(2);

        // A pipeline with zero micro-batches must be rejected.
        let outcome = Pipeline::new(IdentityModule::new(), backend, 0, PipelineSchedule::GPipe);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("num_microbatches must be > 0"));
    }

    #[test]
    fn test_pipeline_new_validates_world_size() {
        let backend = first_rank_backend(1);

        // A single-rank world cannot form a pipeline.
        let outcome = Pipeline::new(
            IdentityModule::new(),
            backend,
            2,
            PipelineSchedule::OneFOnEB,
        );
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("world_size must be >= 2"));
    }

    #[test]
    fn test_pipeline_schedule_accessors() {
        let backend = first_rank_backend(2);

        let pipeline =
            Pipeline::new(IdentityModule::new(), backend, 4, PipelineSchedule::GPipe).unwrap();
        assert_eq!(pipeline.schedule(), PipelineSchedule::GPipe);
        assert_eq!(pipeline.num_microbatches(), 4);
    }

    #[test]
    fn test_identity_module_train_eval_toggles_state() {
        let mut module = IdentityModule::new();
        assert!(module.is_training());

        module.eval();
        assert!(!module.is_training());

        module.train();
        assert!(module.is_training());
    }
}