xdp_socket/
create.rs

1//! # AF_XDP Socket Creation and Configuration
2//!
3//! ## Purpose
4//!
5//! This file contains the logic for creating and configuring AF_XDP sockets. It provides
6//! a high-level API to set up sockets for transmit-only, receive-only, or
7//! bidirectional packet processing, abstracting away many of the low-level details.
8//!
9//! ## How it works
10//!
11//! It uses `libc` syscalls to create a raw AF_XDP socket. It then allocates a UMEM
12//! (Userspace Memory) region for zero-copy data transfers, configures the necessary
13//! rings (TX, RX, Fill, Completion) with appropriate sizes, maps them into memory,
14//! and binds the socket to a specific network interface and queue. The logic handles
15//! different UMEM and ring configurations based on whether the socket is for TX, RX,
16//! or both.
17//!
18//! ## Main components
19//!
//! - `create_socket()`: The core function that handles the detailed setup logic (safe to call; it uses `unsafe` internally for the syscalls).
21//! - `create_tx_socket()`, `create_rx_socket()`, `create_bi_socket()`: Safe public
22//!   functions that wrap `create_socket` for specific use cases.
23//! - `setup_umem()`: A helper function to allocate and register the UMEM with the kernel.
24//! - `ring_offsets()`: A helper to query the kernel for the memory map offsets of the rings.
25//! - `XdpConfig`, `Direction`: Public structs and enums for socket configuration.
26
27use crate::mmap::OwnedMmap;
28use crate::ring::{FRAME_COUNT, FRAME_SIZE, Ring, RingType, XdpDesc};
29use crate::socket::{Inner, RxSocket, TxSocket};
30use std::io;
31use std::mem::size_of;
32use std::os::fd::{FromRawFd as _, OwnedFd};
33use std::sync::Arc;
34
35/// Creates one or two sockets for AF_XDP packet processing.
36///
37/// This is the core function for setting up AF_XDP sockets. It handles UMEM
38/// allocation, ring configuration, and binding to a network interface queue.
39///
40/// # How it works
41///
42/// 1.  Creates a raw `AF_XDP` socket.
43/// 2.  Calls `setup_umem` to create a memory-mapped UMEM region.
44/// 3.  Sets the sizes for the Fill, Completion, TX, and RX rings via `setsockopt`.
45/// 4.  Retrieves the memory map offsets for the rings from the kernel.
46/// 5.  Memory-maps the required rings based on the specified `Direction`.
47/// 6.  Binds the socket to the given interface index and queue ID, enabling zero-copy
48///     and need-wakeup flags based on the config.
49/// 7.  Wraps the components in `TxSocket` and/or `RxSocket` and returns them.
50///
51/// # Arguments
52/// * `if_index` - The index of the network interface to bind to.
53/// * `if_queue` - The queue index of the interface to bind to.
54/// * `direction` - The desired direction(s) for the socket (`Tx`, `Rx`, or `Both`).
55/// * `config` - Optional configuration for zero-copy, huge pages, etc.
56///
57/// # Returns
58/// A tuple `(Option<TxSocket>, Option<RxSocket>)`. The appropriate socket(s) will be
59/// `Some` based on the `direction`.
60///
61/// # Safety
62/// This function is unsafe because it directly interfaces with low-level Linux APIs.
63/// The caller must ensure the provided parameters are valid.
64pub fn create_socket(
65    if_index: u32,
66    if_queue: u32,
67    direction: Direction,
68    config: Option<XdpConfig>,
69) -> Result<(Option<TxSocket>, Option<RxSocket>), io::Error> {
70    let (rx_ring_size, tx_ring_size) = match direction {
71        Direction::Tx => (0, FRAME_COUNT), // all frames for outgoing packets
72        Direction::Rx => (FRAME_COUNT, 0), // all frames for incoming packets
73        Direction::Both => (FRAME_COUNT / 2, FRAME_COUNT / 2), // split frames for both directions
74    };
75
76    let (fd, raw_fd) = unsafe {
77        let fd = libc::socket(libc::AF_XDP, libc::SOCK_RAW | libc::SOCK_CLOEXEC, 0);
78        if fd < 0 {
79            return Err(io::Error::last_os_error());
80        }
81        (OwnedFd::from_raw_fd(fd), fd)
82    };
83    let umem = setup_umem(raw_fd, config.as_ref())?;
84
85    RingType::Fill.set_size(raw_fd, tx_ring_size)?;
86    RingType::Completion.set_size(raw_fd, tx_ring_size)?;
87    if tx_ring_size > 0 {
88        RingType::Tx.set_size(raw_fd, tx_ring_size)?;
89    }
90    if rx_ring_size > 0 {
91        RingType::Rx.set_size(raw_fd, rx_ring_size)?;
92    }
93
94    let offsets = ring_offsets(raw_fd)?;
95
96    // Mapping Tx rings in case of Tx and Both direction
97    let (c_ring, tx_ring) = if direction == Direction::Rx {
98        (Ring::default(), Ring::default())
99    } else {
100        (
101            RingType::Completion.mmap(raw_fd, &offsets, tx_ring_size)?,
102            {
103                let mut tx_ring: Ring<XdpDesc> =
104                    RingType::Tx.mmap(raw_fd, &offsets, tx_ring_size)?;
105                tx_ring.fill(0);
106                tx_ring
107            },
108        )
109    };
110
111    // Mapping Rx rings in case of Rx and Both direction
112    let (rx_ring, f_ring) = if direction == Direction::Tx {
113        (Ring::default(), Ring::default())
114    } else {
115        (RingType::Rx.mmap(raw_fd, &offsets, rx_ring_size)?, {
116            let mut f_ring: Ring<u64> = RingType::Fill.mmap(raw_fd, &offsets, rx_ring_size)?;
117            f_ring.fill(tx_ring_size as u32);
118            f_ring.update_producer(f_ring.len as u32);
119            f_ring
120        })
121    };
122
123    let zero_copy = match config.and_then(|cfg| cfg.zero_copy) {
124        Some(true) => libc::XDP_ZEROCOPY,
125        Some(false) => libc::XDP_COPY,
126        None => 0,
127    };
128
129    let need_wakeup = if config.and_then(|cfg| cfg.need_wakeup).unwrap_or(true) {
130        libc::XDP_USE_NEED_WAKEUP
131    } else {
132        0
133    };
134
135    let sxdp = libc::sockaddr_xdp {
136        sxdp_family: libc::AF_XDP as libc::sa_family_t,
137        sxdp_flags: need_wakeup | zero_copy,
138        sxdp_ifindex: if_index,
139        sxdp_queue_id: if_queue,
140        sxdp_shared_umem_fd: 0,
141    };
142
143    if unsafe {
144        libc::bind(
145            raw_fd,
146            &sxdp as *const _ as *const libc::sockaddr,
147            size_of::<libc::sockaddr_xdp>() as libc::socklen_t,
148        ) < 0
149    } {
150        return Err(io::Error::other(format!(
151            "Failed to bind: {}",
152            io::Error::last_os_error()
153        )));
154    }
155
156    // its just owned shared memory and socket descriptor
157    // that we can share between Tx and Rx sockets
158    // to release it when both are destroyed
159    #[allow(clippy::arc_with_non_send_sync)]
160    let inner = Arc::new(Inner::new(umem, fd));
161
162    let tx_socket = if direction != Direction::Rx {
163        Some(TxSocket::new(Some(inner.clone()), tx_ring, c_ring))
164    } else {
165        None
166    };
167
168    let rx_socket = if direction != Direction::Tx {
169        Some(RxSocket::new(Some(inner.clone()), rx_ring, f_ring))
170    } else {
171        None
172    };
173
174    Ok((tx_socket, rx_socket))
175}
176
177/// Creates a `TxSocket` for sending packets.
178///
179/// This is a convenience wrapper around `create_socket` for transmit-only use cases.
180///
181/// # Arguments
182/// * `if_index` - The index of the network interface to use.
183/// * `if_queue` - The queue ID of the network interface to use.
184/// * `config` - Optional `XdpConfig` to customize the socket.
185///
186/// # Returns
187/// A `Result` containing a `TxSocket` on success, or an `io::Error` on failure.
188pub fn create_tx_socket(
189    if_index: u32,
190    if_queue: u32,
191    config: Option<XdpConfig>,
192) -> Result<TxSocket, io::Error> {
193    let (tx_socket, _) = create_socket(if_index, if_queue, Direction::Tx, config)?;
194    tx_socket.ok_or_else(|| io::Error::other("Failed to create Tx socket"))
195}
196
197/// Creates an `RxSocket` for receiving packets.
198///
199/// This is a convenience wrapper around `create_socket` for receive-only use cases.
200///
201/// # Arguments
202/// * `if_index` - The index of the network interface to use.
203/// * `if_queue` - The queue ID of the network interface to use.
204/// * `config` - Optional `XdpConfig` to customize the socket.
205///
206/// # Returns
207/// A `Result` containing an `RxSocket` on success, or an `io::Error` on failure.
208pub fn create_rx_socket(
209    if_index: u32,
210    if_queue: u32,
211    config: Option<XdpConfig>,
212) -> Result<RxSocket, io::Error> {
213    let (_, rx_socket) = create_socket(if_index, if_queue, Direction::Rx, config)?;
214    rx_socket.ok_or_else(|| io::Error::other("Failed to create Rx socket"))
215}
216
217/// Creates a pair of sockets (`TxSocket`, `RxSocket`) for bidirectional communication.
218///
219/// This is a convenience wrapper around `create_socket` for bidirectional use cases.
220/// The UMEM frame pool is split between the two sockets.
221///
222/// # Arguments
223/// * `if_index` - The index of the network interface to use.
224/// * `if_queue` - The queue ID of the network interface to use.
225/// * `config` - Optional `XdpConfig` to customize the sockets.
226///
227/// # Returns
228/// A `Result` containing a tuple of `(TxSocket, RxSocket)` on success, or an `io::Error` on failure.
229pub fn create_bi_socket(
230    if_index: u32,
231    if_queue: u32,
232    config: Option<XdpConfig>,
233) -> Result<(TxSocket, RxSocket), io::Error> {
234    let (tx_socket, rx_socket) = create_socket(if_index, if_queue, Direction::Both, config)?;
235    Ok((
236        tx_socket.ok_or_else(|| io::Error::other("Failed to create Tx socket"))?,
237        rx_socket.ok_or_else(|| io::Error::other("Failed to create Rx socket"))?,
238    ))
239}
240
241/// Retrieves the memory map offsets for the AF_XDP rings from the kernel.
242///
243/// This function uses `getsockopt` with `XDP_MMAP_OFFSETS` to query the kernel for
244/// the correct offsets of the producer/consumer indices and descriptor arrays for
245/// all four rings.
246///
247/// # Arguments
248/// * `raw_fd` - The raw file descriptor of the AF_XDP socket.
249pub fn ring_offsets(raw_fd: libc::c_int) -> io::Result<libc::xdp_mmap_offsets> {
250    let mut offsets: libc::xdp_mmap_offsets = unsafe { std::mem::zeroed() };
251    let mut optlen = size_of::<libc::xdp_mmap_offsets>() as libc::socklen_t;
252    unsafe {
253        if libc::getsockopt(
254            raw_fd,
255            libc::SOL_XDP,
256            libc::XDP_MMAP_OFFSETS,
257            &mut offsets as *mut _ as *mut libc::c_void,
258            &mut optlen,
259        ) < 0
260        {
261            return Err(io::Error::last_os_error());
262        }
263    }
264    Ok(offsets)
265}
266
267/// Allocates and registers the UMEM (Userspace Memory) region with the kernel.
268///
269/// # How it works
270///
271/// 1.  It calls `OwnedMmap::mmap` to create a memory-mapped region, optionally
272///     backed by huge pages.
273/// 2.  It populates an `xdp_umem_reg` struct with the address and size of the UMEM.
274/// 3.  It calls `setsockopt` with `XDP_UMEM_REG` to register the UMEM with the
275///     kernel, making it available for zero-copy operations.
276///
277/// # Arguments
278/// * `raw_fd` - The raw file descriptor of the AF_XDP socket.
279/// * `config` - Optional configuration, used to determine if huge pages should be used.
280pub fn setup_umem(raw_fd: libc::c_int, config: Option<&XdpConfig>) -> io::Result<OwnedMmap> {
281    let umem = OwnedMmap::mmap(
282        FRAME_COUNT * FRAME_SIZE,
283        config.and_then(|cfg| cfg.huge_page),
284    )
285    .map_err(|e| io::Error::other(format!("Failed to allocate UMEM: {}", e)))?;
286
287    let reg = unsafe {
288        libc::xdp_umem_reg {
289            addr: umem.as_void_ptr() as u64,
290            len: umem.len() as u64,
291            chunk_size: FRAME_SIZE as u32,
292            ..std::mem::zeroed()
293        }
294    };
295
296    unsafe {
297        if libc::setsockopt(
298            raw_fd,
299            libc::SOL_XDP,
300            libc::XDP_UMEM_REG,
301            &reg as *const _ as *const libc::c_void,
302            size_of::<libc::xdp_umem_reg>() as libc::socklen_t,
303        ) < 0
304        {
305            return Err(io::Error::other(format!(
306                "Failed to register UMEM: {}",
307                io::Error::last_os_error()
308            )));
309        }
310    }
311
312    Ok(umem)
313}
314
/// Specifies the direction of an AF_XDP socket.
///
/// The enum is `#[repr(i32)]` with explicit discriminants, so `as i32` yields
/// `0` for `Tx`, `1` for `Rx` and `-1` for `Both`. It implements `Eq` and `Hash`
/// in addition to `PartialEq`, so it can be used as a key in hashed collections.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
#[repr(i32)]
pub enum Direction {
    /// Transmit-only socket.
    Tx = 0,
    /// Receive-only socket.
    Rx = 1,
    /// Bidirectional socket (both transmit and receive).
    Both = -1,
}
326
/// Configuration options for creating an AF_XDP socket.
///
/// Every field is optional; `XdpConfig::default()` leaves all of them as `None`.
/// The type is `Copy` and comparable with `==`.
#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)]
pub struct XdpConfig {
    /// Enables or disables zero-copy mode.
    ///
    /// - `Some(true)`: Enables `XDP_ZEROCOPY` (forces zero-copy mode).
    /// - `Some(false)`: Enables `XDP_COPY` (forces copy mode).
    /// - `None`: Neither flag is set and the kernel chooses: it first tries
    ///   zero-copy and falls back to copy mode when zero-copy is not supported.
    pub zero_copy: Option<bool>,
    /// Enables or disables huge pages for the UMEM.
    ///
    /// - `Some(true)`: Attempts to use huge pages.
    /// - `Some(false)`: Uses standard page sizes.
    /// - `None`: The implementation default is used (typically standard pages).
    pub huge_page: Option<bool>,
    /// Sets the `XDP_USE_NEED_WAKEUP` flag.
    ///
    /// - `Some(true)`: The flag is set. The application must call `kick()` to wake up the kernel.
    /// - `Some(false)`: The flag is not set. The kernel polls without needing a wakeup call.
    /// - `None`: Defaults to `true`.
    pub need_wakeup: Option<bool>,
}