Skip to main content

ferrotorch_distributed/
error.rs

1//! Error types for distributed operations.
2//!
3//! ## REQ status (per `.design/ferrotorch-distributed/error.md`)
4//!
5//! Full evidence rows (impl + non-test production consumer + upstream
6//! cites) live in the design doc; this synopsis is a one-line summary
7//! per REQ.
8//!
9//! | REQ | Status | Evidence |
10//! |---|---|---|
11//! | REQ-1 (DistributedError enum) | SHIPPED | `#[non_exhaustive]` `pub enum DistributedError` in `error.rs` with 11 variants; consumers `use crate::error::DistributedError;` in `backend.rs`, `collective.rs`, `gloo_backend.rs`. |
12//! | REQ-2 (diagnostic fields per variant) | SHIPPED | every variant carries named fields rendered in `#[error("...")]` strings; verified by `backend.rs` tests (`test_invalid_world_size`, `test_send_to_invalid_rank`). |
13//! | REQ-3 (From conversion) | SHIPPED | `impl From<DistributedError> for FerrotorchError` at the bottom of `error.rs`; consumers `.into()` at every fallible site in `backend.rs` and `collective.rs`. |
14//! | REQ-4 (BackendUnavailable variant) | SHIPPED | `BackendUnavailable { backend: &'static str }` variant in `error.rs`; consumers in `gloo_backend.rs`, `mpi_backend.rs`, `ucc_backend.rs` (feature-off construction paths). |
15
16use ferrotorch_core::FerrotorchError;
17
18/// Errors specific to the distributed training subsystem.
19#[derive(Debug, thiserror::Error)]
20#[non_exhaustive]
21pub enum DistributedError {
22    #[error("invalid world size: {world_size} (must be >= 1)")]
23    InvalidWorldSize { world_size: usize },
24
25    #[error("invalid rank {rank} for world size {world_size}")]
26    InvalidRank { rank: usize, world_size: usize },
27
28    #[error("cannot send to self (rank {rank})")]
29    SelfSend { rank: usize },
30
31    #[error("size mismatch: expected {expected} bytes, got {got}")]
32    SizeMismatch { expected: usize, got: usize },
33
34    #[error("I/O error: {message}")]
35    Io { message: String },
36
37    #[error("lock poisoned: {message}")]
38    LockPoisoned { message: String },
39
40    #[error("channel closed: {message}")]
41    ChannelClosed { message: String },
42
43    #[error("unsupported reduce operation: {message}")]
44    UnsupportedOp { message: String },
45
46    #[error("operation timed out after {seconds}s")]
47    Timeout { seconds: u64 },
48
49    #[error("no connection to rank {rank} (star topology: non-zero ranks only connect to rank 0)")]
50    NoConnection { rank: usize },
51
52    /// Returned when the user requested a backend whose binding layer
53    /// isn't compiled into this build (e.g. `gloo-backend` / `mpi-backend`
54    /// / `ucc-backend` feature off, or a CUDA-required backend on a
55    /// non-CUDA system). The caller is expected to either enable the
56    /// feature, install the underlying C library, or pick a different
57    /// backend (`SimulatedBackend` / `TcpBackend` always work).
58    /// (Replaces closed #459; live follow-ups: #1132 / #1133 / #1134.)
59    #[error(
60        "backend `{backend}` is not available in this build (enable the corresponding cargo feature \
61         and ensure the underlying library is installed)"
62    )]
63    BackendUnavailable { backend: &'static str },
64}
65
66impl From<DistributedError> for FerrotorchError {
67    fn from(e: DistributedError) -> Self {
68        FerrotorchError::InvalidArgument {
69            message: e.to_string(),
70        }
71    }
72}