xdp_socket/create_socket.rs
1//! # AF_XDP Socket Creation and Configuration
2//!
3//! ## Purpose
4//!
5//! This file contains the logic for creating and configuring AF_XDP sockets. It provides
6//! a high-level API to set up sockets for transmit-only, receive-only, or
7//! bidirectional packet processing, abstracting away many of the low-level details.
8//!
9//! ## How it works
10//!
11//! It uses `libc` syscalls to create a raw AF_XDP socket. It then allocates a UMEM
12//! (Userspace Memory) region for zero-copy data transfers, configures the necessary
13//! rings (TX, RX, Fill, Completion) with appropriate sizes, maps them into memory,
14//! and binds the socket to a specific network interface and queue. The logic handles
15//! different UMEM and ring configurations based on whether the socket is for TX, RX,
16//! or both.
17//!
18//! ## Main components
19//!
//! - `create_socket()`: The core function that handles the detailed setup logic. It
//!   is a safe function; the `unsafe` libc calls are contained inside it.
//! - `create_tx_socket()`, `create_rx_socket()`, `create_bi_socket()`: Safe public
//!   functions that wrap `create_socket` for specific use cases.
23//! - `setup_umem()`: A helper function to allocate and register the UMEM with the kernel.
24//! - `ring_offsets()`: A helper to query the kernel for the memory map offsets of the rings.
25//! - `XdpConfig`, `Direction`: Public structs and enums for socket configuration.
26
27use crate::mmap::OwnedMmap;
28use crate::ring::{FRAME_COUNT, FRAME_SIZE, Ring, RingType};
29use crate::socket::{Inner, RxSocket, TxSocket};
30use std::io;
31use std::mem::size_of;
32use std::os::fd::{FromRawFd as _, OwnedFd};
33use std::sync::Arc;
34
35/// Creates one or two sockets for AF_XDP packet processing.
36///
37/// This is the core function for setting up AF_XDP sockets. It handles UMEM
38/// allocation, ring configuration, and binding to a network interface queue.
39///
40/// # How it works
41///
42/// 1. Creates a raw `AF_XDP` socket.
43/// 2. Calls `setup_umem` to create a memory-mapped UMEM region.
44/// 3. Sets the sizes for the Fill, Completion, TX, and RX rings via `setsockopt`.
45/// 4. Retrieves the memory map offsets for the rings from the kernel.
46/// 5. Memory-maps the required rings based on the specified `Direction`.
47/// 6. Binds the socket to the given interface index and queue ID, enabling zero-copy
48/// and need-wakeup flags based on the config.
49/// 7. Wraps the components in `TxSocket` and/or `RxSocket` and returns them.
50///
51/// # Arguments
52/// * `if_index` - The index of the network interface to bind to.
53/// * `if_queue` - The queue index of the interface to bind to.
54/// * `direction` - The desired direction(s) for the socket (`Tx`, `Rx`, or `Both`).
55/// * `config` - Optional configuration for zero-copy, huge pages, etc.
56///
57/// # Returns
58/// A tuple `(Option<TxSocket>, Option<RxSocket>)`. The appropriate socket(s) will be
59/// `Some` based on the `direction`.
60///
61/// # Safety
62/// This function is unsafe because it directly interfaces with low-level Linux APIs.
63/// The caller must ensure the provided parameters are valid.
64pub fn create_socket(
65 if_index: u32,
66 if_queue: u32,
67 direction: Direction,
68 config: Option<XdpConfig>,
69) -> Result<(Option<TxSocket>, Option<RxSocket>), io::Error> {
70 let (rx_ring_size, tx_ring_size) = match direction {
71 Direction::Tx => (0, FRAME_COUNT), // all frames for outgoing packets
72 Direction::Rx => (FRAME_COUNT, 0), // all frames for incoming packets
73 Direction::Both => (FRAME_COUNT / 2, FRAME_COUNT / 2), // split frames for both directions
74 };
75
76 let (fd, raw_fd) = unsafe {
77 let fd = libc::socket(libc::AF_XDP, libc::SOCK_RAW | libc::SOCK_CLOEXEC, 0);
78 if fd < 0 {
79 return Err(io::Error::last_os_error());
80 }
81 (OwnedFd::from_raw_fd(fd), fd)
82 };
83 let umem = setup_umem(raw_fd, config.as_ref())?;
84
85 RingType::Fill.set_size(raw_fd, tx_ring_size)?;
86 RingType::Completion.set_size(raw_fd, tx_ring_size)?;
87 if tx_ring_size > 0 {
88 RingType::Tx.set_size(raw_fd, tx_ring_size)?;
89 }
90 if rx_ring_size > 0 {
91 RingType::Rx.set_size(raw_fd, rx_ring_size)?;
92 }
93
94 let offsets = ring_offsets(raw_fd)?;
95
96 // Mapping Tx rings in case of Tx and Both direction
97 let (tx_ring, c_ring) = if direction == Direction::Rx {
98 (Ring::default(), Ring::default())
99 } else {
100 (
101 RingType::Tx.mmap(raw_fd, &offsets, tx_ring_size)?,
102 RingType::Completion.mmap(raw_fd, &offsets, tx_ring_size)?,
103 )
104 };
105
106 // Mapping Rx rings in case of Rx and Both direction
107 let (rx_ring, f_ring) = if direction == Direction::Tx {
108 (Ring::default(), Ring::default())
109 } else {
110 (
111 RingType::Rx.mmap(raw_fd, &offsets, rx_ring_size)?,
112 RingType::Fill.mmap(raw_fd, &offsets, rx_ring_size)?,
113 )
114 };
115
116 let zero_copy = match config.and_then(|cfg| cfg.zero_copy) {
117 Some(true) => libc::XDP_ZEROCOPY,
118 Some(false) => libc::XDP_COPY,
119 None => 0,
120 };
121
122 let need_wakeup = if config.and_then(|cfg| cfg.need_wakeup).unwrap_or(true) {
123 libc::XDP_USE_NEED_WAKEUP
124 } else {
125 0
126 };
127
128 let sxdp = libc::sockaddr_xdp {
129 sxdp_family: libc::AF_XDP as libc::sa_family_t,
130 sxdp_flags: need_wakeup | zero_copy,
131 sxdp_ifindex: if_index,
132 sxdp_queue_id: if_queue,
133 sxdp_shared_umem_fd: 0,
134 };
135
136 if unsafe {
137 libc::bind(
138 raw_fd,
139 &sxdp as *const _ as *const libc::sockaddr,
140 size_of::<libc::sockaddr_xdp>() as libc::socklen_t,
141 ) < 0
142 } {
143 return Err(io::Error::other(format!(
144 "Failed to bind: {}",
145 io::Error::last_os_error()
146 )));
147 }
148
149 // its just owned shared memory and socket descriptor
150 // that we can share between Tx and Rx sockets
151 // to release it when both are destroyed
152 #[allow(clippy::arc_with_non_send_sync)]
153 let inner = Arc::new(Inner::new(umem,fd));
154
155 let tx_socket = if direction != Direction::Rx {
156 Some(TxSocket::new(Some(inner.clone()), tx_ring, c_ring, 0))
157 } else {
158 None
159 };
160
161 let rx_socket = if direction != Direction::Tx {
162 Some(RxSocket::new(
163 Some(inner.clone()),
164 rx_ring,
165 f_ring,
166 tx_ring_size,
167 ))
168 } else {
169 None
170 };
171
172 Ok((tx_socket, rx_socket))
173}
174
175/// Creates a `TxSocket` for sending packets.
176///
177/// This is a convenience wrapper around `create_socket` for transmit-only use cases.
178///
179/// # Arguments
180/// * `if_index` - The index of the network interface to use.
181/// * `if_queue` - The queue ID of the network interface to use.
182/// * `config` - Optional `XdpConfig` to customize the socket.
183///
184/// # Returns
185/// A `Result` containing a `TxSocket` on success, or an `io::Error` on failure.
186pub fn create_tx_socket(
187 if_index: u32,
188 if_queue: u32,
189 config: Option<XdpConfig>,
190) -> Result<TxSocket, io::Error> {
191 let (tx_socket, _) = create_socket(if_index, if_queue, Direction::Tx, config)?;
192 tx_socket.ok_or_else(|| io::Error::other("Failed to create Tx socket"))
193}
194
195/// Creates an `RxSocket` for receiving packets.
196///
197/// This is a convenience wrapper around `create_socket` for receive-only use cases.
198///
199/// # Arguments
200/// * `if_index` - The index of the network interface to use.
201/// * `if_queue` - The queue ID of the network interface to use.
202/// * `config` - Optional `XdpConfig` to customize the socket.
203///
204/// # Returns
205/// A `Result` containing an `RxSocket` on success, or an `io::Error` on failure.
206pub fn create_rx_socket(
207 if_index: u32,
208 if_queue: u32,
209 config: Option<XdpConfig>,
210) -> Result<RxSocket, io::Error> {
211 let (_, rx_socket) = create_socket(if_index, if_queue, Direction::Rx, config)?;
212 rx_socket.ok_or_else(|| io::Error::other("Failed to create Rx socket"))
213}
214
215/// Creates a pair of sockets (`TxSocket`, `RxSocket`) for bidirectional communication.
216///
217/// This is a convenience wrapper around `create_socket` for bidirectional use cases.
218/// The UMEM frame pool is split between the two sockets.
219///
220/// # Arguments
221/// * `if_index` - The index of the network interface to use.
222/// * `if_queue` - The queue ID of the network interface to use.
223/// * `config` - Optional `XdpConfig` to customize the sockets.
224///
225/// # Returns
226/// A `Result` containing a tuple of `(TxSocket, RxSocket)` on success, or an `io::Error` on failure.
227pub fn create_bi_socket(
228 if_index: u32,
229 if_queue: u32,
230 config: Option<XdpConfig>,
231) -> Result<(TxSocket, RxSocket), io::Error> {
232 let (tx_socket, rx_socket) = create_socket(if_index, if_queue, Direction::Both, config)?;
233 Ok((
234 tx_socket.ok_or_else(|| io::Error::other("Failed to create Tx socket"))?,
235 rx_socket.ok_or_else(|| io::Error::other("Failed to create Rx socket"))?,
236 ))
237}
238
239/// Retrieves the memory map offsets for the AF_XDP rings from the kernel.
240///
241/// This function uses `getsockopt` with `XDP_MMAP_OFFSETS` to query the kernel for
242/// the correct offsets of the producer/consumer indices and descriptor arrays for
243/// all four rings.
244///
245/// # Arguments
246/// * `raw_fd` - The raw file descriptor of the AF_XDP socket.
247pub fn ring_offsets(raw_fd: libc::c_int) -> io::Result<libc::xdp_mmap_offsets> {
248 let mut offsets: libc::xdp_mmap_offsets = unsafe { std::mem::zeroed() };
249 let mut optlen = size_of::<libc::xdp_mmap_offsets>() as libc::socklen_t;
250 unsafe {
251 if libc::getsockopt(
252 raw_fd,
253 libc::SOL_XDP,
254 libc::XDP_MMAP_OFFSETS,
255 &mut offsets as *mut _ as *mut libc::c_void,
256 &mut optlen,
257 ) < 0
258 {
259 return Err(io::Error::last_os_error());
260 }
261 }
262 Ok(offsets)
263}
264
265/// Allocates and registers the UMEM (Userspace Memory) region with the kernel.
266///
267/// # How it works
268///
269/// 1. It calls `OwnedMmap::mmap` to create a memory-mapped region, optionally
270/// backed by huge pages.
271/// 2. It populates an `xdp_umem_reg` struct with the address and size of the UMEM.
272/// 3. It calls `setsockopt` with `XDP_UMEM_REG` to register the UMEM with the
273/// kernel, making it available for zero-copy operations.
274///
275/// # Arguments
276/// * `raw_fd` - The raw file descriptor of the AF_XDP socket.
277/// * `config` - Optional configuration, used to determine if huge pages should be used.
278pub fn setup_umem(raw_fd: libc::c_int, config: Option<&XdpConfig>) -> io::Result<OwnedMmap> {
279 let umem = OwnedMmap::mmap(
280 FRAME_COUNT * FRAME_SIZE,
281 config.and_then(|cfg| cfg.huge_page),
282 )
283 .map_err(|e| io::Error::other(format!("Failed to allocate UMEM: {}", e)))?;
284
285 let reg = unsafe {
286 libc::xdp_umem_reg {
287 addr: umem.as_void_ptr() as u64,
288 len: umem.len() as u64,
289 chunk_size: FRAME_SIZE as u32,
290 ..std::mem::zeroed()
291 }
292 };
293
294 unsafe {
295 if libc::setsockopt(
296 raw_fd,
297 libc::SOL_XDP,
298 libc::XDP_UMEM_REG,
299 ® as *const _ as *const libc::c_void,
300 size_of::<libc::xdp_umem_reg>() as libc::socklen_t,
301 ) < 0
302 {
303 return Err(io::Error::other(format!(
304 "Failed to register UMEM: {}",
305 io::Error::last_os_error()
306 )));
307 }
308 }
309
310 Ok(umem)
311}
312
/// Specifies the direction of an AF_XDP socket.
///
/// The explicit `i32` discriminants (`Tx = 0`, `Rx = 1`, `Both = -1`) are part of
/// the type's representation and are kept stable.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
#[repr(i32)]
pub enum Direction {
    /// Transmit-only socket.
    Tx = 0,
    /// Receive-only socket.
    Rx = 1,
    /// Bidirectional socket (both transmit and receive).
    Both = -1,
}
324
/// Configuration options for creating an AF_XDP socket.
///
/// Every field is optional; `XdpConfig::default()` leaves all of them as `None`.
#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)]
pub struct XdpConfig {
    /// Enables or disables zero-copy mode.
    ///
    /// - `Some(true)`: Binds with `XDP_ZEROCOPY`.
    /// - `Some(false)`: Binds with `XDP_COPY`.
    /// - `None`: Neither flag is passed and the kernel chooses the mode (per the
    ///   kernel AF_XDP documentation it tries zero-copy first and falls back to
    ///   copy mode when the driver does not support it).
    pub zero_copy: Option<bool>,
    /// Enables or disables huge pages for the UMEM.
    ///
    /// - `Some(true)`: Attempts to use huge pages.
    /// - `Some(false)`: Uses standard page sizes.
    /// - `None`: The default of the UMEM allocator is used (presumably standard
    ///   pages — verify against `OwnedMmap::mmap`).
    pub huge_page: Option<bool>,
    /// Sets the `XDP_USE_NEED_WAKEUP` flag.
    ///
    /// - `Some(true)`: The flag is set. The application must call `kick()` to wake up the kernel.
    /// - `Some(false)`: The flag is not set. The kernel polls without needing a wakeup call.
    /// - `None`: Defaults to `true`.
    pub need_wakeup: Option<bool>,
}