ferrotorch_distributed/lib.rs
1// Lint baseline mirrors the workspace-standard pattern from
2// `ferrotorch-gpu`/`-jit`/`-cubecl`/`-xpu` lib.rs. `unsafe_code` is NOT
3// denied: this crate calls into NCCL via raw FFI (`nccl_sys`), uses
4// `dlopen`/`dlsym`/`std::mem::transmute` to load CUDA stream symbols
5// without a compile-time CUDA dependency (`nccl_backend`), and performs
6// byte-reinterpret tensor I/O (`checkpoint`, `pipeline`). Per-block
7// SAFETY substantiation lives at each `unsafe { ... }` site.
8#![warn(clippy::all, clippy::pedantic)]
9#![deny(rust_2018_idioms)]
10// `missing_docs` and `missing_debug_implementations` are held at `allow`
11// while the workspace-wide rustdoc / `Debug` pass is tracked separately
12// (matches the existing `ferrotorch-gpu`/`-core` precedent — diverging
13// unilaterally from a leaf crate would be Step 4 architectural
14// unilateralism). Several distributed types own `Mutex<NcclComm>` raw
15// FFI pointers, `Arc<dyn Backend>` trait objects, or `Box<dyn Fn>` RPC
16// handlers whose `Debug` impls require careful hand-rolling.
17#![allow(missing_docs, missing_debug_implementations)]
18// Pedantic lints we explicitly accept across this crate. Each allow names
19// a concrete reason — the alternative would be churn-for-zero-benefit or
20// a worse API. Mirrors the ferrotorch-gpu baseline; add to this list only
21// with a one-line justification.
22#![allow(
23 // # Errors / # Panics sections will be added during the workspace-wide
24 // rustdoc pass (tracked separately, not gated on this lint baseline).
25 clippy::missing_errors_doc,
26 clippy::missing_panics_doc,
27 // Distributed code casts pervasively between `usize` rank/world_size
28 // and `i32` NCCL/MPI peer indices, and between `u64` byte counters
29 // and `usize` buffer lengths; the explicit cast is more readable
30 // than try_into/unwrap or num-traits indirection.
31 clippy::cast_possible_truncation,
32 clippy::cast_possible_wrap,
33 clippy::cast_sign_loss,
34 clippy::cast_precision_loss,
35 clippy::cast_lossless,
36 // `#[must_use]` on every getter is churn for marginal value; callers
37 // in this codebase already use the returned values.
38 clippy::must_use_candidate,
39 // Builder-style methods returning `Self` document their pattern in
40 // the type signature; `#[must_use]` is noise.
41 clippy::return_self_not_must_use,
42 // Doc comments follow the standard rustdoc layout; pedantic
43 // doc-markdown rules are too aggressive for technical prose with
44 // NCCL/MPI/RPC terminology.
45 clippy::doc_markdown,
46 // Test/helper modules define small fns after `let`-bindings; the
47 // hoisting requirement is style-only.
48 clippy::items_after_statements,
49 // Long match-on-strategy/op blocks mirror the NCCL/PyTorch
50 // taxonomy 1:1; splitting reduces legibility.
51 clippy::too_many_lines,
52 // Manual `Debug` impls intentionally omit non-Debug fields like
53 // `Mutex<NcclComm>` (raw FFI pointers) and `Arc<dyn Backend>` to
54 // keep formatted output free of lock probes / opaque handles.
55 clippy::missing_fields_in_debug,
56 // `match { Some(x) => x, None => return }` is the natural shape
57 // when the `else` branch is non-trivial.
58 clippy::single_match_else,
59 // Methods that take `&self` for a uniform interface (e.g.,
60 // `world_size()` on backends with a single rank) are part of the
61 // public API shape and not refactor candidates.
62 clippy::unused_self,
63 // `.map(...).unwrap_or(...)` is the documented PyTorch-style
64 // fallback shape used in option-bearing collectives; rewriting
65 // to clippy's suggested `.map_or(default, f)` is lossier.
66 clippy::map_unwrap_or,
67 // Match arms that each call out a specific reduction/op variant
68 // are intentional when the variant set is documented and the
69 // "wildcard branch" would hide future additions.
70 clippy::match_same_arms,
71 // `.map(|x| x.method())` closures are the idiomatic shape here;
72 // rewriting to a method path (`.map(Type::method)`) is lossier and
73 // clippy's preference is contested.
74 clippy::redundant_closure_for_method_calls,
75 // `for elem in vec.into_iter()` on owned `Vec`s mirrors the consumed
76 // semantics in iteration; clippy's `for elem in vec` rewrite hides
77 // that the value is consumed.
78 clippy::explicit_into_iter_loop,
79 // FFI raw-pointer casts (`*const c_void` <-> `*const T`) and
80 // `&T as *const T` are the natural shape in NCCL bindings; clippy's
81 // preferred `.cast()` / `std::ptr::from_ref` rewrites do not
82 // improve legibility in this context.
83 clippy::ptr_as_ptr,
84 clippy::ref_as_ptr,
85 // `format!("{x}")` already uses inline captures where `Display` is
86 // direct; some sites use `.to_string()` or pass `&str` for
87 // readability with structured prefixes.
88 clippy::uninlined_format_args,
89 // `HashMap<String, Tensor<T>>` parameters in checkpoint helpers
90 // mirror PyTorch's `state_dict` shape; generalising over the
91 // hasher would leak `S: BuildHasher` to every caller.
92 clippy::implicit_hasher,
93)]
94
95//! Distributed training for ferrotorch.
96//!
97//! This crate provides the building blocks for multi-rank training:
98//!
99//! - **Backends** ([`backend`]) — Transport-agnostic communication.
100//! [`TcpBackend`](backend::TcpBackend) for real multi-process training,
101//! [`SimulatedBackend`](backend::SimulatedBackend) for in-process testing.
102//!
103//! - **Collectives** ([`collective`]) — [`allreduce`](collective::allreduce),
104//! [`all_gather`](collective::all_gather),
105//! [`reduce_scatter`](collective::reduce_scatter),
106//! [`broadcast`](collective::broadcast), and [`barrier`](collective::barrier).
107//!
108//! - **Async collectives** ([`async_collective`]) —
109//! [`async_all_gather`](async_collective::async_all_gather) and
110//! [`async_reduce_scatter`](async_collective::async_reduce_scatter)
111//! return a [`PendingCollective`](async_collective::PendingCollective)
112//! handle that can be `wait()`ed on after local compute, enabling FSDP
113//! backward prefetch.
114//!
115//! - **DDP** ([`ddp`]) — [`DDP`](ddp::DDP) wraps a `Module` and
116//! synchronizes gradients across ranks after each backward pass.
117//!
118//! - **FSDP** ([`fsdp`]) — [`FSDP`](fsdp::FSDP) wraps a `Module` and
119//! shards parameters across ranks, all-gathering during forward and
120//! reduce-scattering gradients during backward.
121//!
122//! - **RPC** ([`rpc`]) — Remote Procedure Call framework with
123//! [`RpcContext`](rpc::RpcContext) for invoking functions on remote ranks,
124//! and [`RRef`](rpc::RRef) for holding references to remote data.
125//!
126//! - **Pipeline parallelism** ([`pipeline`]) —
127//! [`Pipeline`](pipeline::Pipeline) splits a model into sequential stages
128//! and processes microbatches through them. Supports
129//! [`GPipe`](pipeline::PipelineSchedule::GPipe) and
130//! [`Interleaved1F1B`](pipeline::PipelineSchedule::Interleaved1F1B) schedules.
131//!
132//! - **GPU collectives** ([`gpu_collective`], requires `gpu` feature) —
133//! [`gpu_allreduce`](gpu_collective::gpu_allreduce) and
134//! [`gpu_broadcast`](gpu_collective::gpu_broadcast) route through NCCL
135//! when the `nccl` feature is enabled, or through an opt-in host
136//! round-trip when `FERROTORCH_ENABLE_GPU_FALLBACK=1` is set. Without
137//! either, they return `Err` (PyTorch parity). See [`gpu_collective`]
138//! for details.
139//!
140//! - **Native-Rust backends** ([`gloo_backend`] / [`mpi_backend`],
141//! require `gloo-backend` / `mpi-native` feature respectively, both
142//! default off) — pure-Rust TCP transport with textbook ring allreduce /
143//! tree broadcast / ring barrier collectives. No C/C++ FFI. #1132
144//! landed the gloo backend; #1133 landed the MPI-subset backend
145//! delegating to the same gloo_native primitives. With the feature
146//! off, construction returns
147//! [`DistributedError::BackendUnavailable`](error::DistributedError::BackendUnavailable)
148//! for source-compat with the original #459 skeleton contract. The
149//! legacy `mpi-backend` feature name aliases to `mpi-native`.
150//!
151//! - **Skeleton backends** ([`ucc_backend`], requires `ucc-backend`
152//! feature, default off) — API contract only. Construction returns
153//! [`DistributedError::BackendUnavailable`](error::DistributedError::BackendUnavailable)
154//! when the feature is off. The feature would unlock a real binding
155//! (UCC C library) — tracked in #1134 (replacing closed #459). Use
156//! [`is_gloo_available`](gloo_backend::is_gloo_available),
157//! [`is_mpi_available`](mpi_backend::is_mpi_available), and
158//! [`is_ucc_available`](ucc_backend::is_ucc_available) to discriminate
159//! at runtime.
160//!
161//! # Quick start
162//!
163//! ```ignore
164//! use ferrotorch_distributed::backend::SimulatedBackend;
165//! use ferrotorch_distributed::collective::{allreduce, ReduceOp};
166//! use ferrotorch_distributed::ddp::DDP;
167//! use ferrotorch_distributed::fsdp::FSDP;
168//! use ferrotorch_distributed::rpc::{RpcContext, SimulatedRpcBackend};
169//! use ferrotorch_distributed::pipeline::{Pipeline, PipelineStage, PipelineSchedule};
170//! ```
171//!
172//! ## REQ status (per `.design/ferrotorch-distributed/lib.md`)
173//!
174//! | REQ | Status | Evidence |
175//! |---|---|---|
176//! | REQ-1 (lint baseline) | SHIPPED | `#![warn(clippy::all, clippy::pedantic)]` / `#![deny(rust_2018_idioms)]` at top of `lib.rs`; consumers: every other `.rs` file in the crate inherits the policy and `cargo clippy -- -D warnings` is green. |
177//! | REQ-2 (module surface) | SHIPPED | `pub mod` block in `lib.rs` declares 16 unconditional + 5 feature-gated public modules, plus the crate-private `gloo_native` (gated on `gloo-backend`); consumers: every sibling `crate::<mod>::` path. |
178//! | REQ-3 (re-exports) | SHIPPED | `pub use` block in `lib.rs` re-exports 55 symbols from 16 unconditional submodules, plus 10 symbols from 5 feature-gated submodules (`gpu`, `nccl`); consumer `ferrotorch/src/lib.rs` (`pub use ferrotorch_distributed::*;`). |
179//! | REQ-4 (crate-level docs) | SHIPPED | `//!` block in `lib.rs` enumerates every subsystem with rustdoc links; consumer `cargo doc -p ferrotorch-distributed` landing page. |
180
// Public modules declared without a `#[cfg]` gate, so their API surface is
// present in every build. `gloo_backend`, `mpi_backend`, and `ucc_backend`
// are in this group; per the crate-level docs, their constructors return
// `DistributedError::BackendUnavailable` when the matching feature is off.
181pub mod async_collective;
182pub mod backend;
183pub mod checkpoint;
184pub mod collective;
185pub mod ddp;
186pub mod device_mesh;
187pub mod dtensor;
188pub mod error;
189pub mod fsdp;
190pub mod gloo_backend;
// Crate-private (`pub(crate)`) and compiled only with the `gloo-backend`
// feature.
// NOTE(review): the crate-level docs say the `mpi-native` backend delegates
// to these same primitives, yet this gate names only `gloo-backend` —
// confirm that `mpi-native` enables `gloo-backend` in Cargo.toml.
191#[cfg(feature = "gloo-backend")]
192pub(crate) mod gloo_native;
193pub mod mpi_backend;
194pub mod p2p;
195pub mod pipeline;
196pub mod rpc;
197pub mod sync_batch_norm;
198pub mod ucc_backend;
199
// Compiled only with the `gpu` feature.
200#[cfg(feature = "gpu")]
201pub mod gpu_collective;
202
// Modules compiled only with the `nccl` feature; each declaration carries
// its own `#[cfg]` because the attribute applies to a single item.
203#[cfg(feature = "nccl")]
204pub mod hybrid_backend;
205#[cfg(feature = "nccl")]
206pub mod nccl_backend;
207#[cfg(feature = "nccl")]
208pub mod nccl_collective;
209#[cfg(feature = "nccl")]
210pub mod nccl_sys;
211
212// Re-export key types at crate root for convenience.
213pub use async_collective::{PendingCollective, async_all_gather, async_reduce_scatter};
214pub use backend::{Backend, SimulatedBackend, SubBackend, TcpBackend};
215pub use checkpoint::{
216 AsyncCheckpointer, CheckpointFuture, DistCheckpointError, DistributedCheckpoint, ShardMetadata,
217 TensorShardSpec, flat_shard_metadata, load_distributed, reshard, save_distributed,
218};
219pub use collective::{
220 DEFAULT_COLLECTIVE_TIMEOUT, ReduceOp, all_gather, all_gather_with_timeout, all_to_all,
221 all_to_all_single_uneven, all_to_all_with_timeout, allreduce, allreduce_with_timeout, barrier,
222 broadcast, reduce_scatter, reduce_scatter_tensor, reduce_scatter_with_timeout,
223};
224pub use ddp::DDP;
225pub use device_mesh::DeviceMesh;
226pub use dtensor::{DTensor, Placement};
227pub use error::DistributedError;
228pub use fsdp::FSDP;
229pub use gloo_backend::{GlooBackend, is_gloo_available};
230pub use mpi_backend::{MpiBackend, is_mpi_available};
231pub use p2p::{recv, recv_into, recv_into_with_timeout, recv_with_timeout, send, sendrecv};
232pub use pipeline::{Pipeline, PipelineSchedule};
233pub use rpc::{RpcAgent, RpcError, TcpRpcBackend};
234pub use sync_batch_norm::SyncBatchNorm2d;
235pub use ucc_backend::{UccBackend, is_ucc_available};
236
237#[cfg(feature = "gpu")]
238pub use gpu_collective::{gpu_allreduce, gpu_broadcast};
239
240#[cfg(feature = "nccl")]
241pub use hybrid_backend::HybridBackend;
242#[cfg(feature = "nccl")]
243pub use nccl_backend::{NcclBackend, is_nccl_available};
244#[cfg(feature = "nccl")]
245pub use nccl_collective::{nccl_all_gather, nccl_allreduce, nccl_broadcast, nccl_reduce_scatter};
246#[cfg(feature = "nccl")]
247pub use nccl_sys::NcclUniqueId;