ferrotorch_distributed/lib.rs
1//! Distributed training for ferrotorch.
2//!
3//! This crate provides the building blocks for multi-rank training:
4//!
5//! - **Backends** ([`backend`]) — Transport-agnostic communication.
6//! [`TcpBackend`](backend::TcpBackend) for real multi-process training,
7//! [`SimulatedBackend`](backend::SimulatedBackend) for in-process testing.
8//!
9//! - **Collectives** ([`collective`]) — [`allreduce`](collective::allreduce),
10//! [`all_gather`](collective::all_gather),
11//! [`reduce_scatter`](collective::reduce_scatter),
12//! [`broadcast`](collective::broadcast), and [`barrier`](collective::barrier).
13//!
14//! - **DDP** ([`ddp`]) — [`DDP`](ddp::DDP) wraps a `Module` and
15//! synchronizes gradients across ranks after each backward pass.
16//!
17//! - **FSDP** ([`fsdp`]) — [`FSDP`](fsdp::FSDP) wraps a `Module` and
18//! shards parameters across ranks, all-gathering during forward and
19//! reduce-scattering gradients during backward.
20//!
21//! - **RPC** ([`rpc`]) — Remote Procedure Call framework with
22//! [`RpcContext`](rpc::RpcContext) for invoking functions on remote ranks,
23//! and [`RRef`](rpc::RRef) for holding references to remote data.
24//!
25//! - **Pipeline parallelism** ([`pipeline`]) —
26//! [`Pipeline`](pipeline::Pipeline) splits a model into sequential stages
27//! and processes microbatches through them. Supports
28//! [`GPipe`](pipeline::PipelineSchedule::GPipe) and
29//! [`Interleaved1F1B`](pipeline::PipelineSchedule::Interleaved1F1B) schedules.
30//!
31//! - **GPU collectives** ([`gpu_collective`], requires the `gpu` feature) —
32//! [`gpu_allreduce`](gpu_collective::gpu_allreduce) and
33//! [`gpu_broadcast`](gpu_collective::gpu_broadcast) transfer GPU tensors
34//! to the CPU, run the standard TCP collective, and copy the result back to
35//! the GPU. This is a portable alternative to NCCL.
36//!
37//! # Quick start
38//!
39//! ```ignore
40//! use ferrotorch_distributed::backend::SimulatedBackend;
41//! use ferrotorch_distributed::collective::{allreduce, ReduceOp};
42//! use ferrotorch_distributed::ddp::DDP;
43//! use ferrotorch_distributed::fsdp::FSDP;
44//! use ferrotorch_distributed::rpc::{RpcContext, SimulatedRpcBackend};
45//! use ferrotorch_distributed::pipeline::{Pipeline, PipelineStage, PipelineSchedule};
46//! ```
47
// Public submodules of the crate. Unless noted, each one is described in the crate-level docs.
48pub mod backend; // communication backends: `TcpBackend`, `SimulatedBackend`
49pub mod checkpoint; // source of the `save_distributed` / `load_distributed` / `reshard` re-exports; NOTE(review): not described in the crate-level docs
50pub mod collective; // `allreduce`, `all_gather`, `reduce_scatter`, `broadcast`, `barrier`
51pub mod ddp; // `DDP` wrapper: gradient synchronization across ranks
52pub mod error; // source of the `DistributedError` re-export; NOTE(review): not described in the crate-level docs
53pub mod fsdp; // `FSDP` wrapper: parameters sharded across ranks
54pub mod pipeline; // `Pipeline` stages and `PipelineSchedule`
55pub mod rpc; // remote procedure calls between ranks
56
57#[cfg(feature = "gpu")] // compiled only when the `gpu` Cargo feature is enabled
58pub mod gpu_collective; // GPU-tensor collectives: `gpu_allreduce`, `gpu_broadcast`
59
60// Re-export key types at the crate root so callers can import them without naming the submodule.
61pub use backend::{Backend, SimulatedBackend, TcpBackend};
62pub use checkpoint::{
63 AsyncCheckpointer, CheckpointFuture, DistCheckpointError, DistributedCheckpoint, ShardMetadata,
64 TensorShardSpec, flat_shard_metadata, load_distributed, reshard, save_distributed,
65};
// Collectives plus the `*_with_timeout` variants and `DEFAULT_COLLECTIVE_TIMEOUT`.
// NOTE(review): `barrier` and `broadcast` have no `*_with_timeout` re-export here — confirm
// whether such variants exist in `collective` and should be exposed too.
66pub use collective::{
67 DEFAULT_COLLECTIVE_TIMEOUT, ReduceOp, all_gather, all_gather_with_timeout, allreduce,
68 allreduce_with_timeout, barrier, broadcast, reduce_scatter, reduce_scatter_with_timeout,
69};
70pub use ddp::DDP;
71pub use error::DistributedError;
72pub use fsdp::FSDP;
73pub use pipeline::{Pipeline, PipelineSchedule};
// NOTE(review): the crate-level docs reference `rpc::RpcContext` and `rpc::RRef`, but only the
// three names below are re-exported at the root — confirm this is intentional.
74pub use rpc::{RpcAgent, RpcError, TcpRpcBackend};
75
// Gated on the same `gpu` feature as the `gpu_collective` module declaration.
76#[cfg(feature = "gpu")]
77pub use gpu_collective::{gpu_allreduce, gpu_broadcast};