Skip to main content

ferrotorch_distributed/
lib.rs

1//! Distributed training for ferrotorch.
2//!
3//! This crate provides the building blocks for multi-rank training:
4//!
5//! - **Backends** ([`backend`]) — Transport-agnostic communication.
6//!   [`TcpBackend`](backend::TcpBackend) for real multi-process training,
7//!   [`SimulatedBackend`](backend::SimulatedBackend) for in-process testing.
8//!
9//! - **Collectives** ([`collective`]) — [`allreduce`](collective::allreduce),
10//!   [`all_gather`](collective::all_gather),
11//!   [`reduce_scatter`](collective::reduce_scatter),
12//!   [`broadcast`](collective::broadcast), and [`barrier`](collective::barrier).
13//!
14//! - **DDP** ([`ddp`]) — [`DDP`](ddp::DDP) wraps a `Module` and
15//!   synchronizes gradients across ranks after each backward pass.
16//!
17//! - **FSDP** ([`fsdp`]) — [`FSDP`](fsdp::FSDP) wraps a `Module` and
18//!   shards parameters across ranks, all-gathering during forward and
19//!   reduce-scattering gradients during backward.
20//!
21//! - **RPC** ([`rpc`]) — Remote Procedure Call framework with
22//!   [`RpcContext`](rpc::RpcContext) for invoking functions on remote ranks,
23//!   and [`RRef`](rpc::RRef) for holding references to remote data.
24//!
25//! - **Pipeline parallelism** ([`pipeline`]) —
26//!   [`Pipeline`](pipeline::Pipeline) splits a model into sequential stages
27//!   and processes microbatches through them. It supports the
28//!   [`GPipe`](pipeline::PipelineSchedule::GPipe) and
29//!   [`Interleaved1F1B`](pipeline::PipelineSchedule::Interleaved1F1B) schedules.
30//!
31//! - **GPU collectives** ([`gpu_collective`], requires `gpu` feature) —
32//!   [`gpu_allreduce`](gpu_collective::gpu_allreduce) and
33//!   [`gpu_broadcast`](gpu_collective::gpu_broadcast) transfer GPU tensors
34//!   to CPU, run the standard TCP collective, and copy back. This is a
35//!   portable alternative to NCCL.
36//!
37//! # Quick start
38//!
39//! ```ignore
40//! use ferrotorch_distributed::backend::SimulatedBackend;
41//! use ferrotorch_distributed::collective::{allreduce, ReduceOp};
42//! use ferrotorch_distributed::ddp::DDP;
43//! use ferrotorch_distributed::fsdp::FSDP;
44//! use ferrotorch_distributed::rpc::{RpcContext, SimulatedRpcBackend};
45//! use ferrotorch_distributed::pipeline::{Pipeline, PipelineStage, PipelineSchedule};
46//! ```
47
48pub mod backend; // communication transports: `TcpBackend` and `SimulatedBackend`
49pub mod checkpoint; // checkpoint types and save/load/reshard helpers re-exported at the crate root
50pub mod collective; // collective ops: allreduce, all_gather, reduce_scatter, broadcast, barrier
51pub mod ddp; // `DDP` module wrapper
52pub mod error; // defines `DistributedError`
53pub mod fsdp; // `FSDP` module wrapper
54pub mod pipeline; // pipeline parallelism: `Pipeline` and `PipelineSchedule`
55pub mod rpc; // RPC framework
56
57#[cfg(feature = "gpu")] // compiled only when the `gpu` Cargo feature is enabled
58pub mod gpu_collective;
59
60// Re-export the key types and functions at the crate root so callers can
// write `ferrotorch_distributed::DDP` instead of spelling out the module path.
61pub use backend::{Backend, SimulatedBackend, TcpBackend};
62pub use checkpoint::{
63    AsyncCheckpointer, CheckpointFuture, DistCheckpointError, DistributedCheckpoint, ShardMetadata,
64    TensorShardSpec, flat_shard_metadata, load_distributed, reshard, save_distributed,
65};
66pub use collective::{
67    DEFAULT_COLLECTIVE_TIMEOUT, ReduceOp, all_gather, all_gather_with_timeout, allreduce,
68    allreduce_with_timeout, barrier, broadcast, reduce_scatter, reduce_scatter_with_timeout,
69};
70pub use ddp::DDP;
71pub use error::DistributedError;
72pub use fsdp::FSDP;
73pub use pipeline::{Pipeline, PipelineSchedule};
74pub use rpc::{RpcAgent, RpcError, TcpRpcBackend};
75
76#[cfg(feature = "gpu")] // GPU helpers are re-exported only when the `gpu` feature is enabled
77pub use gpu_collective::{gpu_allreduce, gpu_broadcast};