Skip to main content

ferrotorch_distributed/
lib.rs

1// Lint baseline mirrors the workspace-standard pattern from
2// `ferrotorch-gpu`/`-jit`/`-cubecl`/`-xpu` lib.rs. `unsafe_code` is NOT
3// denied: this crate calls into NCCL via raw FFI (`nccl_sys`), uses
4// `dlopen`/`dlsym`/`std::mem::transmute` to load CUDA stream symbols
5// without a compile-time CUDA dependency (`nccl_backend`), and performs
6// byte-reinterpret tensor I/O (`checkpoint`, `pipeline`). Per-block
7// SAFETY substantiation lives at each `unsafe { ... }` site.
8#![warn(clippy::all, clippy::pedantic)]
9#![deny(rust_2018_idioms)]
10// `missing_docs` and `missing_debug_implementations` are held at `allow`
11// while the workspace-wide rustdoc / `Debug` pass is tracked separately
12// (matches the existing `ferrotorch-gpu`/`-core` precedent — diverging
13// unilaterally from a leaf crate would be Step 4 architectural
14// unilateralism). Several distributed types own `Mutex<NcclComm>` raw
15// FFI pointers, `Arc<dyn Backend>` trait objects, or `Box<dyn Fn>` RPC
16// handlers whose `Debug` impls require careful hand-rolling.
17#![allow(missing_docs, missing_debug_implementations)]
18// Pedantic lints we explicitly accept across this crate. Each allow names
19// a concrete reason — the alternative would be churn-for-zero-benefit or
20// a worse API. Mirrors the ferrotorch-gpu baseline; add to this list only
21// with a one-line justification.
22#![allow(
23    // # Errors / # Panics sections will be added during the workspace-wide
24    // rustdoc pass (tracked separately, not gated on this lint baseline).
25    clippy::missing_errors_doc,
26    clippy::missing_panics_doc,
27    // Distributed code casts pervasively between `usize` rank/world_size
28    // and `i32` NCCL/MPI peer indices, and between `u64` byte counters
29    // and `usize` buffer lengths; the explicit cast is more readable
30    // than try_into/unwrap or num-traits indirection.
31    clippy::cast_possible_truncation,
32    clippy::cast_possible_wrap,
33    clippy::cast_sign_loss,
34    clippy::cast_precision_loss,
35    clippy::cast_lossless,
36    // `#[must_use]` on every getter is churn for marginal value; callers
37    // in this codebase already use the returned values.
38    clippy::must_use_candidate,
39    // Builder-style methods returning `Self` document their pattern in
40    // the type signature; `#[must_use]` is noise.
41    clippy::return_self_not_must_use,
42    // Doc comments follow the standard rustdoc layout; pedantic
43    // doc-markdown rules are too aggressive for technical prose with
44    // NCCL/MPI/RPC terminology.
45    clippy::doc_markdown,
46    // Test/helper modules define small fns after `let`-bindings; the
47    // hoisting requirement is style-only.
48    clippy::items_after_statements,
49    // Long match-on-strategy/op blocks mirror the NCCL/PyTorch
50    // taxonomy 1:1; splitting reduces legibility.
51    clippy::too_many_lines,
52    // Manual `Debug` impls intentionally omit non-Debug fields like
53    // `Mutex<NcclComm>` (raw FFI pointers) and `Arc<dyn Backend>` to
54    // keep formatted output free of lock probes / opaque handles.
55    clippy::missing_fields_in_debug,
56    // `match opt { Some(x) => x, None => return }` is the natural shape
57    // when the `else` branch is non-trivial.
58    clippy::single_match_else,
59    // Methods that take `&self` for a uniform interface (e.g.,
60    // `world_size()` on backends with a single rank) are part of the
61    // public API shape and not refactor candidates.
62    clippy::unused_self,
63    // `.map(...).unwrap_or(...)` is the documented PyTorch-style
64    // fallback shape used in option-bearing collectives; clippy's
65    // suggested `map_or(..)` rewrite puts the default before the mapping.
66    clippy::map_unwrap_or,
67    // Match arms that each call out a specific reduction/op variant
68    // are intentional when the variant set is documented and the
69    // "wildcard branch" would hide future additions.
70    clippy::match_same_arms,
71    // Closures such as `|x| x.method()` keep the receiver visible at the
72    // call site; clippy's preferred `Type::method` path rewrite is a
73    // style-only change and its benefit is contested.
74    clippy::redundant_closure_for_method_calls,
75    // `for elem in vec.into_iter()` on owned `Vec`s mirrors the consumed
76    // semantics in iteration; clippy's `for elem in vec` rewrite hides
77    // that the value is consumed.
78    clippy::explicit_into_iter_loop,
79    // FFI raw-pointer casts (`*const c_void` <-> `*const T`) and
80    // `&T as *const T` are the natural shape in NCCL bindings; clippy's
81    // preferred `.cast()` / `std::ptr::from_ref` rewrites do not
82    // improve legibility in this context.
83    clippy::ptr_as_ptr,
84    clippy::ref_as_ptr,
85    // `format!("{x}")` already uses inline captures where `Display` is
86    // direct; some sites use `.to_string()` or pass `&str` for
87    // readability with structured prefixes.
88    clippy::uninlined_format_args,
89    // `HashMap<String, Tensor<T>>` parameters in checkpoint helpers
90    // mirror PyTorch's `state_dict` shape; generalising over the
91    // hasher would leak `S: BuildHasher` to every caller.
92    clippy::implicit_hasher,
93)]
94
95//! Distributed training for ferrotorch.
96//!
97//! This crate provides the building blocks for multi-rank training:
98//!
99//! - **Backends** ([`backend`]) — Transport-agnostic communication.
100//!   [`TcpBackend`](backend::TcpBackend) for real multi-process training,
101//!   [`SimulatedBackend`](backend::SimulatedBackend) for in-process testing.
102//!
103//! - **Collectives** ([`collective`]) — [`allreduce`](collective::allreduce),
104//!   [`all_gather`](collective::all_gather),
105//!   [`reduce_scatter`](collective::reduce_scatter),
106//!   [`broadcast`](collective::broadcast), and [`barrier`](collective::barrier).
107//!
108//! - **Async collectives** ([`async_collective`]) —
109//!   [`async_all_gather`](async_collective::async_all_gather) and
110//!   [`async_reduce_scatter`](async_collective::async_reduce_scatter)
111//!   return a [`PendingCollective`](async_collective::PendingCollective)
112//!   handle that can be `wait()`ed on after local compute, enabling FSDP
113//!   backward prefetch.
114//!
115//! - **DDP** ([`ddp`]) — [`DDP`](ddp::DDP) wraps a `Module` and
116//!   synchronizes gradients across ranks after each backward pass.
117//!
118//! - **FSDP** ([`fsdp`]) — [`FSDP`](fsdp::FSDP) wraps a `Module` and
119//!   shards parameters across ranks, all-gathering during forward and
120//!   reduce-scattering gradients during backward.
121//!
122//! - **RPC** ([`rpc`]) — Remote Procedure Call framework with
123//!   [`RpcContext`](rpc::RpcContext) for invoking functions on remote ranks,
124//!   and [`RRef`](rpc::RRef) for holding references to remote data.
125//!
126//! - **Pipeline parallelism** ([`pipeline`]) —
127//!   [`Pipeline`](pipeline::Pipeline) splits a model into sequential stages
128//!   and processes microbatches through them. Supports
129//!   [`GPipe`](pipeline::PipelineSchedule::GPipe) and
130//!   [`Interleaved1F1B`](pipeline::PipelineSchedule::Interleaved1F1B) schedules.
131//!
132//! - **GPU collectives** ([`gpu_collective`], requires `gpu` feature) —
133//!   [`gpu_allreduce`](gpu_collective::gpu_allreduce) and
134//!   [`gpu_broadcast`](gpu_collective::gpu_broadcast) route through NCCL
135//!   when the `nccl` feature is enabled, or through an opt-in host
136//!   round-trip when `FERROTORCH_ENABLE_GPU_FALLBACK=1` is set. Without
137//!   either, they return `Err` (PyTorch parity). See [`gpu_collective`]
138//!   for details.
139//!
140//! - **Native-Rust backends** ([`gloo_backend`] / [`mpi_backend`],
141//!   require `gloo-backend` / `mpi-native` feature respectively, both
142//!   default off) — pure-Rust TCP transport with textbook ring allreduce /
143//!   tree broadcast / ring barrier collectives. No C/C++ FFI. #1132
144//!   landed the gloo backend; #1133 landed the MPI-subset backend
145//!   delegating to the same gloo_native primitives. With the feature
146//!   off, construction returns
147//!   [`DistributedError::BackendUnavailable`](error::DistributedError::BackendUnavailable)
148//!   for source-compat with the original #459 skeleton contract. The
149//!   legacy `mpi-backend` feature name aliases to `mpi-native`.
150//!
151//! - **Skeleton backends** ([`ucc_backend`], requires `ucc-backend`
152//!   feature, default off) — API contract only. Construction returns
153//!   [`DistributedError::BackendUnavailable`](error::DistributedError::BackendUnavailable)
154//!   when the feature is off. The feature would unlock a real binding
155//!   (UCC C library) — tracked in #1134 (replacing closed #459). Use
156//!   [`is_gloo_available`](gloo_backend::is_gloo_available),
157//!   [`is_mpi_available`](mpi_backend::is_mpi_available), and
158//!   [`is_ucc_available`](ucc_backend::is_ucc_available) to discriminate
159//!   at runtime.
160//!
161//! # Quick start
162//!
163//! ```ignore
164//! use ferrotorch_distributed::backend::SimulatedBackend;
165//! use ferrotorch_distributed::collective::{allreduce, ReduceOp};
166//! use ferrotorch_distributed::ddp::DDP;
167//! use ferrotorch_distributed::fsdp::FSDP;
168//! use ferrotorch_distributed::rpc::{RpcContext, SimulatedRpcBackend};
169//! use ferrotorch_distributed::pipeline::{Pipeline, PipelineStage, PipelineSchedule};
170//! ```
171//!
172//! ## REQ status (per `.design/ferrotorch-distributed/lib.md`)
173//!
174//! | REQ | Status | Evidence |
175//! |---|---|---|
176//! | REQ-1 (lint baseline) | SHIPPED | `#![warn(clippy::all, clippy::pedantic)]` / `#![deny(rust_2018_idioms)]` at top of `lib.rs`; consumers: every other `.rs` file in the crate inherits the policy and `cargo clippy -- -D warnings` is green. |
177//! | REQ-2 (module surface) | SHIPPED | `pub mod` block in `lib.rs` declares 16 unconditional + 5 feature-gated modules; consumers: every sibling `crate::<mod>::` path. |
178//! | REQ-3 (re-exports) | SHIPPED | `pub use` block in `lib.rs` re-exports 55 symbols from 16 unconditional submodules, plus 10 symbols from 5 feature-gated (`gpu` / `nccl`) submodules; consumer `ferrotorch/src/lib.rs` (`pub use ferrotorch_distributed::*;`). |
179//! | REQ-4 (crate-level docs) | SHIPPED | `//!` block in `lib.rs` enumerates every subsystem with rustdoc links; consumer `cargo doc -p ferrotorch-distributed` landing page. |
180
181pub mod async_collective;
182pub mod backend;
183pub mod checkpoint;
184pub mod collective;
185pub mod ddp;
186pub mod device_mesh;
187pub mod dtensor;
188pub mod error;
189pub mod fsdp;
190pub mod gloo_backend;
191#[cfg(feature = "gloo-backend")]
192pub(crate) mod gloo_native;
193pub mod mpi_backend;
194pub mod p2p;
195pub mod pipeline;
196pub mod rpc;
197pub mod sync_batch_norm;
198pub mod ucc_backend;
199
200#[cfg(feature = "gpu")]
201pub mod gpu_collective;
202
203#[cfg(feature = "nccl")]
204pub mod hybrid_backend;
205#[cfg(feature = "nccl")]
206pub mod nccl_backend;
207#[cfg(feature = "nccl")]
208pub mod nccl_collective;
209#[cfg(feature = "nccl")]
210pub mod nccl_sys;
211
212// Re-export key types at crate root for convenience.
213pub use async_collective::{PendingCollective, async_all_gather, async_reduce_scatter};
214pub use backend::{Backend, SimulatedBackend, SubBackend, TcpBackend};
215pub use checkpoint::{
216    AsyncCheckpointer, CheckpointFuture, DistCheckpointError, DistributedCheckpoint, ShardMetadata,
217    TensorShardSpec, flat_shard_metadata, load_distributed, reshard, save_distributed,
218};
219pub use collective::{
220    DEFAULT_COLLECTIVE_TIMEOUT, ReduceOp, all_gather, all_gather_with_timeout, all_to_all,
221    all_to_all_single_uneven, all_to_all_with_timeout, allreduce, allreduce_with_timeout, barrier,
222    broadcast, reduce_scatter, reduce_scatter_tensor, reduce_scatter_with_timeout,
223};
224pub use ddp::DDP;
225pub use device_mesh::DeviceMesh;
226pub use dtensor::{DTensor, Placement};
227pub use error::DistributedError;
228pub use fsdp::FSDP;
229pub use gloo_backend::{GlooBackend, is_gloo_available};
230pub use mpi_backend::{MpiBackend, is_mpi_available};
231pub use p2p::{recv, recv_into, recv_into_with_timeout, recv_with_timeout, send, sendrecv};
232pub use pipeline::{Pipeline, PipelineSchedule};
233pub use rpc::{RpcAgent, RpcError, TcpRpcBackend};
234pub use sync_batch_norm::SyncBatchNorm2d;
235pub use ucc_backend::{UccBackend, is_ucc_available};
236
237#[cfg(feature = "gpu")]
238pub use gpu_collective::{gpu_allreduce, gpu_broadcast};
239
240#[cfg(feature = "nccl")]
241pub use hybrid_backend::HybridBackend;
242#[cfg(feature = "nccl")]
243pub use nccl_backend::{NcclBackend, is_nccl_available};
244#[cfg(feature = "nccl")]
245pub use nccl_collective::{nccl_all_gather, nccl_allreduce, nccl_broadcast, nccl_reduce_scatter};
246#[cfg(feature = "nccl")]
247pub use nccl_sys::NcclUniqueId;