ferrotorch_distributed/error.rs
//! Error types for distributed operations.
//!
//! ## REQ status (per `.design/ferrotorch-distributed/error.md`)
//!
//! Full evidence rows (impl + non-test production consumer + upstream
//! cites) live in the design doc; this synopsis is a one-line summary
//! per REQ.
//!
//! | REQ | Status | Evidence |
//! |---|---|---|
//! | REQ-1 (DistributedError enum) | SHIPPED | `#[non_exhaustive]` `pub enum DistributedError` in `error.rs` with 11 variants; consumers `use crate::error::DistributedError;` in `backend.rs`, `collective.rs`, `gloo_backend.rs`. |
//! | REQ-2 (diagnostic fields per variant) | SHIPPED | every variant carries named fields rendered in `#[error("...")]` strings; verified by `backend.rs` tests (`test_invalid_world_size`, `test_send_to_invalid_rank`). |
//! | REQ-3 (From conversion) | SHIPPED | `impl From<DistributedError> for FerrotorchError` at the bottom of `error.rs`; consumers `.into()` at every fallible site in `backend.rs` and `collective.rs`. |
//! | REQ-4 (BackendUnavailable variant) | SHIPPED | `BackendUnavailable { backend: &'static str }` variant in `error.rs`; consumers in `gloo_backend.rs`, `mpi_backend.rs`, `ucc_backend.rs` (feature-off construction paths). |
15
16use ferrotorch_core::FerrotorchError;
17
18/// Errors specific to the distributed training subsystem.
19#[derive(Debug, thiserror::Error)]
20#[non_exhaustive]
21pub enum DistributedError {
22 #[error("invalid world size: {world_size} (must be >= 1)")]
23 InvalidWorldSize { world_size: usize },
24
25 #[error("invalid rank {rank} for world size {world_size}")]
26 InvalidRank { rank: usize, world_size: usize },
27
28 #[error("cannot send to self (rank {rank})")]
29 SelfSend { rank: usize },
30
31 #[error("size mismatch: expected {expected} bytes, got {got}")]
32 SizeMismatch { expected: usize, got: usize },
33
34 #[error("I/O error: {message}")]
35 Io { message: String },
36
37 #[error("lock poisoned: {message}")]
38 LockPoisoned { message: String },
39
40 #[error("channel closed: {message}")]
41 ChannelClosed { message: String },
42
43 #[error("unsupported reduce operation: {message}")]
44 UnsupportedOp { message: String },
45
46 #[error("operation timed out after {seconds}s")]
47 Timeout { seconds: u64 },
48
49 #[error("no connection to rank {rank} (star topology: non-zero ranks only connect to rank 0)")]
50 NoConnection { rank: usize },
51
52 /// Returned when the user requested a backend whose binding layer
53 /// isn't compiled into this build (e.g. `gloo-backend` / `mpi-backend`
54 /// / `ucc-backend` feature off, or a CUDA-required backend on a
55 /// non-CUDA system). The caller is expected to either enable the
56 /// feature, install the underlying C library, or pick a different
57 /// backend (`SimulatedBackend` / `TcpBackend` always work).
58 /// (Replaces closed #459; live follow-ups: #1132 / #1133 / #1134.)
59 #[error(
60 "backend `{backend}` is not available in this build (enable the corresponding cargo feature \
61 and ensure the underlying library is installed)"
62 )]
63 BackendUnavailable { backend: &'static str },
64}
65
66impl From<DistributedError> for FerrotorchError {
67 fn from(e: DistributedError) -> Self {
68 FerrotorchError::InvalidArgument {
69 message: e.to_string(),
70 }
71 }
72}