xdp_socket/
create_socket.rs

1//! # AF_XDP Socket Creation and Configuration
2//!
3//! ## Purpose
4//!
5//! This file contains the logic for creating and configuring AF_XDP sockets. It provides
6//! a high-level API to set up sockets for transmit-only, receive-only, or
7//! bidirectional packet processing, abstracting away many of the low-level details.
8//!
9//! ## How it works
10//!
11//! It uses `libc` syscalls to create a raw AF_XDP socket. It then allocates a UMEM
12//! (Userspace Memory) region for zero-copy data transfers, configures the necessary
13//! rings (TX, RX, Fill, Completion) with appropriate sizes, maps them into memory,
14//! and binds the socket to a specific network interface and queue. The logic handles
15//! different UMEM and ring configurations based on whether the socket is for TX, RX,
16//! or both.
17//!
18//! ## Main components
19//!
//! - `create_socket()`: The core function that handles the detailed setup logic. It is a
//!   safe `pub fn` that uses `unsafe` blocks internally for the `libc` calls.
//! - `create_tx_socket()`, `create_rx_socket()`, `create_bi_socket()`: Public convenience
//!   functions that wrap `create_socket` for specific use cases.
23//! - `setup_umem()`: A helper function to allocate and register the UMEM with the kernel.
24//! - `ring_offsets()`: A helper to query the kernel for the memory map offsets of the rings.
25//! - `XdpConfig`, `Direction`: Public structs and enums for socket configuration.
26
27use crate::mmap::OwnedMmap;
28use crate::ring::{FRAME_COUNT, FRAME_SIZE, Ring, RingType};
29use crate::socket::{Inner, RxSocket, TxSocket};
30use std::io;
31use std::mem::size_of;
32use std::os::fd::{FromRawFd as _, OwnedFd};
33use std::sync::Arc;
34
35/// Creates one or two sockets for AF_XDP packet processing.
36///
37/// This is the core function for setting up AF_XDP sockets. It handles UMEM
38/// allocation, ring configuration, and binding to a network interface queue.
39///
40/// # How it works
41///
42/// 1.  Creates a raw `AF_XDP` socket.
43/// 2.  Calls `setup_umem` to create a memory-mapped UMEM region.
44/// 3.  Sets the sizes for the Fill, Completion, TX, and RX rings via `setsockopt`.
45/// 4.  Retrieves the memory map offsets for the rings from the kernel.
46/// 5.  Memory-maps the required rings based on the specified `Direction`.
47/// 6.  Binds the socket to the given interface index and queue ID, enabling zero-copy
48///     and need-wakeup flags based on the config.
49/// 7.  Wraps the components in `TxSocket` and/or `RxSocket` and returns them.
50///
51/// # Arguments
52/// * `if_index` - The index of the network interface to bind to.
53/// * `if_queue` - The queue index of the interface to bind to.
54/// * `direction` - The desired direction(s) for the socket (`Tx`, `Rx`, or `Both`).
55/// * `config` - Optional configuration for zero-copy, huge pages, etc.
56///
57/// # Returns
58/// A tuple `(Option<TxSocket>, Option<RxSocket>)`. The appropriate socket(s) will be
59/// `Some` based on the `direction`.
60///
61/// # Safety
62/// This function is unsafe because it directly interfaces with low-level Linux APIs.
63/// The caller must ensure the provided parameters are valid.
64pub fn create_socket(
65    if_index: u32,
66    if_queue: u32,
67    direction: Direction,
68    config: Option<XdpConfig>,
69) -> Result<(Option<TxSocket>, Option<RxSocket>), io::Error> {
70    let (rx_ring_size, tx_ring_size) = match direction {
71        Direction::Tx => (0, FRAME_COUNT), // all frames for outgoing packets
72        Direction::Rx => (FRAME_COUNT, 0), // all frames for incoming packets
73        Direction::Both => (FRAME_COUNT / 2, FRAME_COUNT / 2), // split frames for both directions
74    };
75
76    let (fd, raw_fd) = unsafe {
77        let fd = libc::socket(libc::AF_XDP, libc::SOCK_RAW | libc::SOCK_CLOEXEC, 0);
78        if fd < 0 {
79            return Err(io::Error::last_os_error());
80        }
81        (OwnedFd::from_raw_fd(fd), fd)
82    };
83    let umem = setup_umem(raw_fd, config.as_ref())?;
84
85    RingType::Fill.set_size(raw_fd, tx_ring_size)?;
86    RingType::Completion.set_size(raw_fd, tx_ring_size)?;
87    if tx_ring_size > 0 {
88        RingType::Tx.set_size(raw_fd, tx_ring_size)?;
89    }
90    if rx_ring_size > 0 {
91        RingType::Rx.set_size(raw_fd, rx_ring_size)?;
92    }
93
94    let offsets = ring_offsets(raw_fd)?;
95
96    // Mapping Tx rings in case of Tx and Both direction
97    let (tx_ring, c_ring) = if direction == Direction::Rx {
98        (Ring::default(), Ring::default())
99    } else {
100        (
101            RingType::Tx.mmap(raw_fd, &offsets, tx_ring_size)?,
102            RingType::Completion.mmap(raw_fd, &offsets, tx_ring_size)?,
103        )
104    };
105
106    // Mapping Rx rings in case of Rx and Both direction
107    let (rx_ring, f_ring) = if direction == Direction::Tx {
108        (Ring::default(), Ring::default())
109    } else {
110        (
111            RingType::Rx.mmap(raw_fd, &offsets, rx_ring_size)?,
112            RingType::Fill.mmap(raw_fd, &offsets, rx_ring_size)?,
113        )
114    };
115
116    let zero_copy = match config.and_then(|cfg| cfg.zero_copy) {
117        Some(true) => libc::XDP_ZEROCOPY,
118        Some(false) => libc::XDP_COPY,
119        None => 0,
120    };
121
122    let need_wakeup = if config.and_then(|cfg| cfg.need_wakeup).unwrap_or(true) {
123        libc::XDP_USE_NEED_WAKEUP
124    } else {
125        0
126    };
127
128    let sxdp = libc::sockaddr_xdp {
129        sxdp_family: libc::AF_XDP as libc::sa_family_t,
130        sxdp_flags: need_wakeup | zero_copy,
131        sxdp_ifindex: if_index,
132        sxdp_queue_id: if_queue,
133        sxdp_shared_umem_fd: 0,
134    };
135
136    if unsafe {
137        libc::bind(
138            raw_fd,
139            &sxdp as *const _ as *const libc::sockaddr,
140            size_of::<libc::sockaddr_xdp>() as libc::socklen_t,
141        ) < 0
142    } {
143        return Err(io::Error::other(format!(
144            "Failed to bind: {}",
145            io::Error::last_os_error()
146        )));
147    }
148
149    // its just owned shared memory and socket descriptor
150    // that we can share between Tx and Rx sockets
151    // to release it when both are destroyed
152    #[allow(clippy::arc_with_non_send_sync)]
153    let inner = Arc::new(Inner::new(umem,fd));
154
155    let tx_socket = if direction != Direction::Rx {
156        Some(TxSocket::new(Some(inner.clone()), tx_ring, c_ring, 0))
157    } else {
158        None
159    };
160
161    let rx_socket = if direction != Direction::Tx {
162        Some(RxSocket::new(
163            Some(inner.clone()),
164            rx_ring,
165            f_ring,
166            tx_ring_size,
167        ))
168    } else {
169        None
170    };
171
172    Ok((tx_socket, rx_socket))
173}
174
175/// Creates a `TxSocket` for sending packets.
176///
177/// This is a convenience wrapper around `create_socket` for transmit-only use cases.
178///
179/// # Arguments
180/// * `if_index` - The index of the network interface to use.
181/// * `if_queue` - The queue ID of the network interface to use.
182/// * `config` - Optional `XdpConfig` to customize the socket.
183///
184/// # Returns
185/// A `Result` containing a `TxSocket` on success, or an `io::Error` on failure.
186pub fn create_tx_socket(
187    if_index: u32,
188    if_queue: u32,
189    config: Option<XdpConfig>,
190) -> Result<TxSocket, io::Error> {
191    let (tx_socket, _) = create_socket(if_index, if_queue, Direction::Tx, config)?;
192    tx_socket.ok_or_else(|| io::Error::other("Failed to create Tx socket"))
193}
194
195/// Creates an `RxSocket` for receiving packets.
196///
197/// This is a convenience wrapper around `create_socket` for receive-only use cases.
198///
199/// # Arguments
200/// * `if_index` - The index of the network interface to use.
201/// * `if_queue` - The queue ID of the network interface to use.
202/// * `config` - Optional `XdpConfig` to customize the socket.
203///
204/// # Returns
205/// A `Result` containing an `RxSocket` on success, or an `io::Error` on failure.
206pub fn create_rx_socket(
207    if_index: u32,
208    if_queue: u32,
209    config: Option<XdpConfig>,
210) -> Result<RxSocket, io::Error> {
211    let (_, rx_socket) = create_socket(if_index, if_queue, Direction::Rx, config)?;
212    rx_socket.ok_or_else(|| io::Error::other("Failed to create Rx socket"))
213}
214
215/// Creates a pair of sockets (`TxSocket`, `RxSocket`) for bidirectional communication.
216///
217/// This is a convenience wrapper around `create_socket` for bidirectional use cases.
218/// The UMEM frame pool is split between the two sockets.
219///
220/// # Arguments
221/// * `if_index` - The index of the network interface to use.
222/// * `if_queue` - The queue ID of the network interface to use.
223/// * `config` - Optional `XdpConfig` to customize the sockets.
224///
225/// # Returns
226/// A `Result` containing a tuple of `(TxSocket, RxSocket)` on success, or an `io::Error` on failure.
227pub fn create_bi_socket(
228    if_index: u32,
229    if_queue: u32,
230    config: Option<XdpConfig>,
231) -> Result<(TxSocket, RxSocket), io::Error> {
232    let (tx_socket, rx_socket) = create_socket(if_index, if_queue, Direction::Both, config)?;
233    Ok((
234        tx_socket.ok_or_else(|| io::Error::other("Failed to create Tx socket"))?,
235        rx_socket.ok_or_else(|| io::Error::other("Failed to create Rx socket"))?,
236    ))
237}
238
239/// Retrieves the memory map offsets for the AF_XDP rings from the kernel.
240///
241/// This function uses `getsockopt` with `XDP_MMAP_OFFSETS` to query the kernel for
242/// the correct offsets of the producer/consumer indices and descriptor arrays for
243/// all four rings.
244///
245/// # Arguments
246/// * `raw_fd` - The raw file descriptor of the AF_XDP socket.
247pub fn ring_offsets(raw_fd: libc::c_int) -> io::Result<libc::xdp_mmap_offsets> {
248    let mut offsets: libc::xdp_mmap_offsets = unsafe { std::mem::zeroed() };
249    let mut optlen = size_of::<libc::xdp_mmap_offsets>() as libc::socklen_t;
250    unsafe {
251        if libc::getsockopt(
252            raw_fd,
253            libc::SOL_XDP,
254            libc::XDP_MMAP_OFFSETS,
255            &mut offsets as *mut _ as *mut libc::c_void,
256            &mut optlen,
257        ) < 0
258        {
259            return Err(io::Error::last_os_error());
260        }
261    }
262    Ok(offsets)
263}
264
265/// Allocates and registers the UMEM (Userspace Memory) region with the kernel.
266///
267/// # How it works
268///
269/// 1.  It calls `OwnedMmap::mmap` to create a memory-mapped region, optionally
270///     backed by huge pages.
271/// 2.  It populates an `xdp_umem_reg` struct with the address and size of the UMEM.
272/// 3.  It calls `setsockopt` with `XDP_UMEM_REG` to register the UMEM with the
273///     kernel, making it available for zero-copy operations.
274///
275/// # Arguments
276/// * `raw_fd` - The raw file descriptor of the AF_XDP socket.
277/// * `config` - Optional configuration, used to determine if huge pages should be used.
278pub fn setup_umem(raw_fd: libc::c_int, config: Option<&XdpConfig>) -> io::Result<OwnedMmap> {
279    let umem = OwnedMmap::mmap(
280        FRAME_COUNT * FRAME_SIZE,
281        config.and_then(|cfg| cfg.huge_page),
282    )
283    .map_err(|e| io::Error::other(format!("Failed to allocate UMEM: {}", e)))?;
284
285    let reg = unsafe {
286        libc::xdp_umem_reg {
287            addr: umem.as_void_ptr() as u64,
288            len: umem.len() as u64,
289            chunk_size: FRAME_SIZE as u32,
290            ..std::mem::zeroed()
291        }
292    };
293
294    unsafe {
295        if libc::setsockopt(
296            raw_fd,
297            libc::SOL_XDP,
298            libc::XDP_UMEM_REG,
299            &reg as *const _ as *const libc::c_void,
300            size_of::<libc::xdp_umem_reg>() as libc::socklen_t,
301        ) < 0
302        {
303            return Err(io::Error::other(format!(
304                "Failed to register UMEM: {}",
305                io::Error::last_os_error()
306            )));
307        }
308    }
309
310    Ok(umem)
311}
312
/// Specifies the direction of an AF_XDP socket.
///
/// The explicit discriminants (`0`, `1`, `-1`) are part of the type's `i32`
/// representation and must not be renumbered.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
#[repr(i32)]
pub enum Direction {
    /// Transmit-only socket.
    Tx = 0,
    /// Receive-only socket.
    Rx = 1,
    /// Bidirectional socket (both transmit and receive).
    Both = -1,
}
324
/// Configuration options for creating an AF_XDP socket.
///
/// Every field is optional; `XdpConfig::default()` leaves all of them `None`.
#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)]
pub struct XdpConfig {
    /// Enables or disables zero-copy mode.
    ///
    /// - `Some(true)`: Binds with `XDP_ZEROCOPY`.
    /// - `Some(false)`: Binds with `XDP_COPY`.
    /// - `None`: No mode flag is passed to `bind`. Per the kernel AF_XDP
    ///   documentation the kernel then tries zero-copy first and falls back to
    ///   copy mode when the driver does not support it.
    pub zero_copy: Option<bool>,
    /// Enables or disables huge pages for the UMEM.
    ///
    /// - `Some(true)`: Attempts to use huge pages.
    /// - `Some(false)`: Uses standard page sizes.
    /// - `None`: The implementation default is used (typically standard pages).
    pub huge_page: Option<bool>,
    /// Sets the `XDP_USE_NEED_WAKEUP` flag.
    ///
    /// - `Some(true)`: The flag is set. The application must call `kick()` to wake up the kernel.
    /// - `Some(false)`: The flag is not set. The kernel polls without needing a wakeup call.
    /// - `None`: Defaults to `true`.
    pub need_wakeup: Option<bool>,
}