Skip to main content

ferrotorch_distributed/
error.rs

1//! Error types for distributed operations.
2
3use ferrotorch_core::FerrotorchError;
4
/// Errors specific to the distributed training subsystem.
///
/// Each variant carries the data interpolated into its `Display` message
/// (generated by `thiserror` from the `#[error(...)]` attribute). Failure
/// details that originate elsewhere (I/O, locks, channels) are stored as a
/// plain `String` message rather than as a wrapped source error.
#[derive(Debug, thiserror::Error)]
// `non_exhaustive`: code outside this crate must include a wildcard arm when
// matching, so new variants can be added without a breaking change.
#[non_exhaustive]
pub enum DistributedError {
    /// The supplied world size is not acceptable; per the message it must be
    /// at least 1.
    #[error("invalid world size: {world_size} (must be >= 1)")]
    InvalidWorldSize { world_size: usize },

    /// The supplied rank is not valid for the given world size.
    // NOTE(review): presumably raised when `rank >= world_size` — confirm at the call sites.
    #[error("invalid rank {rank} for world size {world_size}")]
    InvalidRank { rank: usize, world_size: usize },

    /// A send was addressed to the sender's own rank.
    #[error("cannot send to self (rank {rank})")]
    SelfSend { rank: usize },

    /// A payload had a different byte length than expected.
    #[error("size mismatch: expected {expected} bytes, got {got}")]
    SizeMismatch { expected: usize, got: usize },

    /// An I/O failure, described by `message`.
    #[error("I/O error: {message}")]
    Io { message: String },

    /// A lock was found poisoned; `message` describes the context.
    #[error("lock poisoned: {message}")]
    LockPoisoned { message: String },

    /// A channel was closed; `message` describes the context.
    #[error("channel closed: {message}")]
    ChannelClosed { message: String },

    /// The requested reduce operation is not supported; `message` describes it.
    #[error("unsupported reduce operation: {message}")]
    UnsupportedOp { message: String },

    /// An operation did not complete within `seconds` seconds.
    #[error("operation timed out after {seconds}s")]
    Timeout { seconds: u64 },

    /// There is no connection to the given rank. Per the message, in the star
    /// topology non-zero ranks only connect to rank 0.
    #[error("no connection to rank {rank} (star topology: non-zero ranks only connect to rank 0)")]
    NoConnection { rank: usize },
}
39
40impl From<DistributedError> for FerrotorchError {
41    fn from(e: DistributedError) -> Self {
42        FerrotorchError::InvalidArgument {
43            message: e.to_string(),
44        }
45    }
46}