Skip to main content

ibverbs_rs/network/
mod.rs

1//! Distributed RDMA network — ranked multi-peer setup with barrier synchronization and an out-of-band TCP exchanger for cluster bootstrapping.
2//!
3//! A [`Node`] combines a [`MultiChannel`] with a rank, a world size, and a
4//! [`Barrier`] to form a complete building block for distributed RDMA programs.
5//! It exposes the full [`multi_channel`](crate::multi_channel) operation API
6//! (scatter/gather sends, writes, reads, multicast) and adds barrier synchronization
7//! for coordinating execution across all nodes in the network.
8//!
9//! # Connection lifecycle
10//!
11//! Connecting a set of nodes requires exchanging endpoint information between every
12//! pair of participants. The [`tcp_exchanger`](Exchanger) utility performs this
13//! out-of-band exchange over TCP, driven by a shared [`RawNetworkConfig`] that
14//! describes the address and port of each node.
15//!
16//! 1. **Build** — call [`Node::builder`] (or [`ProtectionDomain::create_node`]) and
17//!    set at minimum `rank`, `world_size`, and `pd`. An optional
18//!    [`BarrierAlgorithm`] can be chosen; the default is
19//!    [`BinaryTree`](BarrierAlgorithm::BinaryTree).
//! 2. **Exchange endpoints** — call [`PreparedNode::endpoint`] to obtain the local
//!    [`LocalEndpoint`], then use [`Exchanger::await_exchange_all`] to distribute it
//!    to all peers and collect theirs. Pass the result through
//!    [`PreparedNode::gather_endpoints`] to produce [`RemoteEndpoints`] in the format
//!    expected by the handshake.
25//! 3. **Handshake** — call [`PreparedNode::handshake`] with the remote endpoints to
26//!    bring up all queue pairs and obtain the ready-to-use [`Node`].
27//!
28//! # Operations
29//!
30//! All [`MultiChannel`] operations are forwarded directly on [`Node`]:
31//! [`scatter_send`](Node::scatter_send), [`gather_receive`](Node::gather_receive),
32//! [`scatter_write`](Node::scatter_write), [`gather_read`](Node::gather_read), and
33//! [`multicast_send`](Node::multicast_send), along with their scoped and unpolled
34//! variants via [`Node::scope`] and [`Node::manual_scope`].
35//!
36//! # Barrier synchronization
37//!
38//! [`Node::barrier`] blocks until every node in the supplied peer list has called
39//! barrier, or until the timeout expires. The peer list may be any subset of the
40//! world, allowing partial barriers across subgroups.
41//! [`Node::barrier_unchecked`] skips peer-list validation for hot paths.
42//!
43//! The barrier algorithm is selected at build time via [`BarrierAlgorithm`]:
44//!
45//! * [`Centralized`](BarrierAlgorithm::Centralized) — the lowest-ranked participant
46//!   acts as coordinator; simple but does not scale well.
47//! * [`Dissemination`](BarrierAlgorithm::Dissemination) — pairwise exchange at
48//!   exponential distances; no designated leader, scales well.
49//! * [`BinaryTree`](BarrierAlgorithm::BinaryTree) — tree-based reduce and broadcast;
50//!   a balanced alternative to dissemination.
51//!
52//! # Network configuration
53//!
54//! [`RawNetworkConfig`] is a serializable description of the cluster (one
55//! [`NodeConfig`] per rank, each with an IP address and port) that can be loaded from
56//! JSON. [`RawNetworkConfig::build`] validates it and produces a [`NetworkConfig`]
57//! ready for use with [`Exchanger`].
58//!
59//! # Example: building a node and exchanging data
60//!
61//! ```no_run
62//! use ibverbs_rs::ibverbs;
63//! use ibverbs_rs::network::{Node, ExchangeConfig, Exchanger, RawNetworkConfig};
64//! use ibverbs_rs::multi_channel::PeerSendWorkRequest;
65//!
66//! // Load network config (see RawNetworkConfig for the JSON format)
67//! let json = std::fs::read_to_string("network.json")?;
68//! let config = serde_json::from_str::<RawNetworkConfig>(&json)?.build()?;
69//! let rank = 0;
70//!
71//! let ctx = ibverbs::open_device("mlx5_0")?;
72//! let pd = ctx.allocate_pd()?;
73//!
74//! // 1. Build
75//! let prepared = Node::builder()
76//!     .pd(&pd)
77//!     .rank(rank)
78//!     .world_size(config.world_size())
79//!     .build()?;
80//!
81//! // 2. Exchange endpoints over TCP
82//! let local_ep = prepared.endpoint();
83//! let remote_eps = Exchanger::await_exchange_all(
84//!     rank, &config, &local_ep, &ExchangeConfig::default(),
85//! )?;
86//! let remote_eps = prepared.gather_endpoints(remote_eps)?;
87//!
88//! // 3. Handshake
89//! let mut node = prepared.handshake(remote_eps)?;
90//!
91//! // Send data to peer 1
92//! let buf = [42u8; 64];
93//! let mr = node.pd().register_local_mr_slice(&buf)?;
94//! node.send(PeerSendWorkRequest::new(1, &[mr.gather_element(&buf)]))?;
95//! # Ok::<(), Box<dyn std::error::Error>>(())
96//! ```
97//!
98//! See also [`examples/network.rs`](https://github.com/Tikitikitikidesuka/ibverbs-rs/blob/main/examples/network.rs)
99//! for a complete multi-node runnable example.
100//!
101//! [`MultiChannel`]: crate::multi_channel::MultiChannel
102
103mod barrier;
104mod builder;
105mod config;
106mod ops;
107mod polling_scope;
108mod tcp_exchanger;
109
110pub use barrier::{Barrier, BarrierAlgorithm, BarrierError, PreparedBarrier};
111#[doc(hidden)]
112pub use builder::node_builder::{
113    Empty, SetAccess, SetAckTimeout, SetBarrier, SetMaxAckRetries, SetMaxRecvSge, SetMaxRecvWr,
114    SetMaxRnrRetries, SetMaxSendSge, SetMaxSendWr, SetMinCqEntries, SetMinRnrTimer, SetMtu, SetPd,
115    SetRank, SetRecvPsn, SetSendPsn, SetWorldSize,
116};
117pub use builder::{
118    LocalEndpoint, NetworkChannelEndpoint, NodeBuilder, PreparedNode, RemoteEndpoints,
119};
120pub use config::{NetworkConfig, NetworkConfigError, NodeConfig, RawNetworkConfig};
121pub use tcp_exchanger::{ExchangeConfig, ExchangeError, Exchanger};
122
123use crate::ibverbs::protection_domain::ProtectionDomain;
124use crate::multi_channel::MultiChannel;
125
126/// A ranked RDMA network node with barrier synchronization.
127///
128/// Wraps a [`MultiChannel`] with a rank, world size, and a [`Barrier`] for
129/// collective synchronization across all nodes in the network.
130#[derive(Debug)]
131pub struct Node {
132    rank: usize,
133    world_size: usize,
134    multi_channel: MultiChannel,
135    barrier: Barrier,
136}
137
138impl Node {
139    /// Returns the protection domain this node belongs to.
140    pub fn pd(&self) -> &ProtectionDomain {
141        self.multi_channel.pd()
142    }
143
144    /// Returns the total number of nodes in the network.
145    pub fn world_size(&self) -> usize {
146        self.world_size
147    }
148
149    /// Returns this node's rank (index) in the network.
150    pub fn rank(&self) -> usize {
151        self.rank
152    }
153}
154
155impl ProtectionDomain {
156    /// Creates a builder under this protection domain.
157    pub fn create_node(&self) -> NodeBuilder<'_, SetPd> {
158        Node::builder().pd(self)
159    }
160}