1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
// Lint baseline mirrors the workspace-standard pattern from
// `ferrotorch-gpu`/`-jit`/`-cubecl`/`-xpu` lib.rs. `unsafe_code` is NOT
// denied: this crate calls into NCCL via raw FFI (`nccl_sys`), uses
// `dlopen`/`dlsym`/`std::mem::transmute` to load CUDA stream symbols
// without a compile-time CUDA dependency (`nccl_backend`), and performs
// byte-reinterpret tensor I/O (`checkpoint`, `pipeline`). Per-block
// SAFETY substantiation lives at each `unsafe { ... }` site.
// `missing_docs` and `missing_debug_implementations` are held at `allow`
// while the workspace-wide rustdoc / `Debug` pass is tracked separately
// (matches the existing `ferrotorch-gpu`/`-core` precedent — diverging
// unilaterally from a leaf crate would be Step 4 architectural
// unilateralism). Several distributed types own `Mutex<NcclComm>` raw
// FFI pointers, `Arc<dyn Backend>` trait objects, or `Box<dyn Fn>` RPC
// handlers whose `Debug` impls require careful hand-rolling.
// Pedantic lints we explicitly accept across this crate. Each allow names
// a concrete reason — the alternative would be churn-for-zero-benefit or
// a worse API. Mirrors the ferrotorch-gpu baseline; add to this list only
// with a one-line justification.
// `#[must_use]` on every getter is churn for marginal value; callers
// in this codebase already use the returned values.
must_use_candidate,
// Builder-style methods returning `Self` document their pattern in
// the type signature; `#[must_use]` is noise.
return_self_not_must_use,
// Doc comments follow the standard rustdoc layout; pedantic
// doc-markdown rules are too aggressive for technical prose with
// NCCL/MPI/RPC terminology.
doc_markdown,
// Test/helper modules define small fns after `let`-bindings; the
// hoisting requirement is style-only.
items_after_statements,
// Long match-on-strategy/op blocks mirror the NCCL/PyTorch
// taxonomy 1:1; splitting reduces legibility.
too_many_lines,
// Manual `Debug` impls intentionally omit non-Debug fields like
// `Mutex<NcclComm>` (raw FFI pointers) and `Arc<dyn Backend>` to
// keep formatted output free of lock probes / opaque handles.
missing_fields_in_debug,
// `match { Some(x) => x, None => return }` is the natural shape
// when the `else` branch is non-trivial.
single_match_else,
// Methods that take `&self` for a uniform interface (e.g.,
// `world_size()` on backends with a single rank) are part of the
// public API shape and not refactor candidates.
unused_self,
// `.map(...).unwrap_or(...)` is the documented PyTorch-style
// fallback shape used in option-bearing collectives; rewriting
// to `.map_or(...)` is lossier.
map_unwrap_or,
// Match arms that each call out a specific reduction/op variant
// are intentional when the variant set is documented and the
// "wildcard branch" would hide future additions.
match_same_arms,
// `.map(|x| x.method())` closures inside `.map(..).collect::<Vec<_>>()`
// chains are the idiomatic shape here; rewriting to the `Type::method`
// path form is lossier and clippy's preference is contested.
redundant_closure_for_method_calls,
// `for elem in vec.into_iter()` on owned `Vec`s makes the by-value
// consumption explicit at the loop site; clippy's `for elem in vec`
// rewrite hides that the value is consumed.
explicit_into_iter_loop,
// FFI raw-pointer casts (`*const c_void` <-> `*const T`) and
// `&T as *const T` are the natural shape in NCCL bindings; clippy's
// preferred `.cast()` / `std::ptr::from_ref` rewrites do not
// improve legibility in this context.
ptr_as_ptr,
ref_as_ptr,
// `format!("{x}")` already uses inline captures where `Display` is
// direct; some sites use `.to_string()` or pass `&str` for
// readability with structured prefixes.
uninlined_format_args,
// `HashMap<String, Tensor<T>>` parameters in checkpoint helpers
// mirror PyTorch's `state_dict` shape; generalising over the
// hasher would leak `S: BuildHasher` to every caller.
implicit_hasher,
)]
//! Distributed training for ferrotorch.
//!
//! This crate provides the building blocks for multi-rank training:
//!
//! - **Backends** ([`backend`]) — Transport-agnostic communication.
//! [`TcpBackend`](backend::TcpBackend) for real multi-process training,
//! [`SimulatedBackend`](backend::SimulatedBackend) for in-process testing.
//!
//! - **Collectives** ([`collective`]) — [`allreduce`](collective::allreduce),
//! [`all_gather`](collective::all_gather),
//! [`reduce_scatter`](collective::reduce_scatter),
//! [`broadcast`](collective::broadcast), and [`barrier`](collective::barrier).
//!
//! - **Async collectives** ([`async_collective`]) —
//! [`async_all_gather`](async_collective::async_all_gather) and
//! [`async_reduce_scatter`](async_collective::async_reduce_scatter)
//! return a [`PendingCollective`](async_collective::PendingCollective)
//! handle that can be `wait()`ed on after local compute, enabling FSDP
//! backward prefetch.
//!
//! - **DDP** ([`ddp`]) — [`DDP`](ddp::DDP) wraps a `Module` and
//! synchronizes gradients across ranks after each backward pass.
//!
//! - **FSDP** ([`fsdp`]) — [`FSDP`](fsdp::FSDP) wraps a `Module` and
//! shards parameters across ranks, all-gathering during forward and
//! reduce-scattering gradients during backward.
//!
//! - **RPC** ([`rpc`]) — Remote Procedure Call framework with
//! [`RpcContext`](rpc::RpcContext) for invoking functions on remote ranks,
//! and [`RRef`](rpc::RRef) for holding references to remote data.
//!
//! - **Pipeline parallelism** ([`pipeline`]) —
//! [`Pipeline`](pipeline::Pipeline) splits a model into sequential stages
//! and processes microbatches through them. Supports
//! [`GPipe`](pipeline::PipelineSchedule::GPipe) and
//! [`Interleaved1F1B`](pipeline::PipelineSchedule::Interleaved1F1B) schedules.
//!
//! - **GPU collectives** ([`gpu_collective`], requires `gpu` feature) —
//! [`gpu_allreduce`](gpu_collective::gpu_allreduce) and
//! [`gpu_broadcast`](gpu_collective::gpu_broadcast) route through NCCL
//! when the `nccl` feature is enabled, or through an opt-in host
//! round-trip when `FERROTORCH_ENABLE_GPU_FALLBACK=1` is set. Without
//! either, they return `Err` (PyTorch parity). See [`gpu_collective`]
//! for details.
//!
//! - **Native-Rust backends** ([`gloo_backend`] / [`mpi_backend`],
//! require `gloo-backend` / `mpi-native` feature respectively, both
//! default off) — pure-Rust TCP transport with textbook ring allreduce /
//! tree broadcast / ring barrier collectives. No C/C++ FFI. #1132
//! landed the gloo backend; #1133 landed the MPI-subset backend
//! delegating to the same `gloo_native` primitives. With the feature
//! off, construction returns
//! [`DistributedError::BackendUnavailable`](error::DistributedError::BackendUnavailable)
//! for source-compat with the original #459 skeleton contract. The
//! legacy `mpi-backend` feature name aliases to `mpi-native`.
//!
//! - **Skeleton backends** ([`ucc_backend`], requires `ucc-backend`
//! feature, default off) — API contract only. Construction returns
//! [`DistributedError::BackendUnavailable`](error::DistributedError::BackendUnavailable)
//! when the feature is off. The feature would unlock a real binding
//! (UCC C library) — tracked in #1134 (replacing closed #459). Use
//! [`is_gloo_available`](gloo_backend::is_gloo_available),
//! [`is_mpi_available`](mpi_backend::is_mpi_available), and
//! [`is_ucc_available`](ucc_backend::is_ucc_available) to discriminate
//! at runtime.
//!
//! # Quick start
//!
//! ```ignore
//! use ferrotorch_distributed::backend::SimulatedBackend;
//! use ferrotorch_distributed::collective::{allreduce, ReduceOp};
//! use ferrotorch_distributed::ddp::DDP;
//! use ferrotorch_distributed::fsdp::FSDP;
//! use ferrotorch_distributed::rpc::{RpcContext, SimulatedRpcBackend};
//! use ferrotorch_distributed::pipeline::{Pipeline, PipelineStage, PipelineSchedule};
//! ```
pub
// Re-export key types at crate root for convenience.
pub use ;
pub use ;
pub use ;
pub use ;
pub use DDP;
pub use DeviceMesh;
pub use ;
pub use DistributedError;
pub use FSDP;
pub use ;
pub use ;
pub use ;
pub use ;
pub use ;
pub use SyncBatchNorm2d;
pub use ;
pub use ;
pub use HybridBackend;
pub use ;
pub use ;
pub use NcclUniqueId;