tun_rs/platform/linux/offload.rs
1/*!
2# Linux Offload Support Module
3
4This module provides Generic Receive Offload (GRO) and Generic Segmentation Offload (GSO)
5support for Linux TUN devices, significantly improving throughput for TCP and UDP traffic.
6
7## Overview
8
9Modern network cards and drivers use offload techniques to reduce CPU overhead:
10
11- **GSO (Generic Segmentation Offload)**: Allows sending large packets that are segmented by
12 the kernel/driver, reducing per-packet processing overhead.
13
14- **GRO (Generic Receive Offload)**: Coalesces multiple received packets into larger segments,
15 reducing the number of packets passed to the application.
16
17This module implements GRO/GSO for TUN devices using the `virtio_net` header format, compatible
18with the Linux kernel's TUN/TAP driver offload capabilities.
19
20## Performance Benefits
21
22Enabling offload can provide:
23- 2-10x improvement in throughput for TCP traffic
24- Reduced CPU usage per gigabit of traffic
25- Better handling of high-bandwidth applications
26
27The actual improvement depends on:
28- Packet sizes
29- TCP window sizes
30- Network round-trip time
31- CPU capabilities
32
33## Usage
34
35Enable offload when building a device:
36
37```no_run
38# #[cfg(target_os = "linux")]
39# {
40use tun_rs::{DeviceBuilder, GROTable, IDEAL_BATCH_SIZE, VIRTIO_NET_HDR_LEN};
41
42let dev = DeviceBuilder::new()
43 .offload(true) // Enable offload
44 .ipv4("10.0.0.1", 24, None)
45 .build_sync()?;
46
47// Allocate buffers for batch operations
48let mut original_buffer = vec![0; VIRTIO_NET_HDR_LEN + 65535];
49let mut bufs = vec![vec![0u8; 1500]; IDEAL_BATCH_SIZE];
50let mut sizes = vec![0; IDEAL_BATCH_SIZE];
51
52// Create GRO table for coalescing
53let mut gro_table = GROTable::default();
54
55loop {
56 // Receive multiple packets at once
57 let num = dev.recv_multiple(&mut original_buffer, &mut bufs, &mut sizes, 0)?;
58
59 for i in 0..num {
60 // Process each packet
61 println!("Packet {}: {} bytes", i, sizes[i]);
62 }
63}
64# }
65# Ok::<(), std::io::Error>(())
66```
67
68## Key Types
69
70- [`VirtioNetHdr`]: Header structure for virtio network offload
71- [`GROTable`]: Manages TCP and UDP flow coalescing for GRO
72- [`TcpGROTable`]: TCP-specific GRO state
73- [`UdpGROTable`]: UDP-specific GRO state
74
75## Key Functions
76
77- [`handle_gro`]: Process received packets and perform GRO coalescing
78- [`gso_split`]: Split a GSO packet into multiple segments
79- [`apply_tcp_coalesce_accounting`]: Update TCP headers after coalescing
80
81## Constants
82
- [`VIRTIO_NET_HDR_LEN`]: Size of the virtio network header (10 bytes)
84- [`IDEAL_BATCH_SIZE`]: Recommended batch size for packet operations (128)
85- [`VIRTIO_NET_HDR_GSO_NONE`], [`VIRTIO_NET_HDR_GSO_TCPV4`], etc.: GSO type constants
86
87## References
88
89- [Linux virtio_net.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_net.h)
90- [WireGuard-go offload implementation](https://github.com/WireGuard/wireguard-go/blob/master/tun/offload_linux.go)
91
92## Platform Requirements
93
94- Linux kernel with TUN/TAP driver
95- Kernel support for IFF_VNET_HDR (available since Linux 2.6.32)
96- Root privileges to create TUN devices with offload enabled
97*/
98
99/// https://github.com/WireGuard/wireguard-go/blob/master/tun/offload_linux.go
100use crate::platform::linux::checksum::{checksum, pseudo_header_checksum_no_fold};
101use byteorder::{BigEndian, ByteOrder};
102use bytes::BytesMut;
103use libc::{IPPROTO_TCP, IPPROTO_UDP};
104use std::collections::HashMap;
105use std::io;
106
/// GSO type: Not a GSO frame (normal packet).
///
/// This indicates a regular packet without Generic Segmentation Offload applied.
/// See: <https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_net.h>
pub const VIRTIO_NET_HDR_GSO_NONE: u8 = 0;

/// Flag: Use csum_start and csum_offset fields for checksum calculation.
///
/// When this flag is set, the packet requires checksum calculation.
/// The `csum_start` field indicates where checksumming should begin,
/// and `csum_offset` indicates where to write the checksum.
pub const VIRTIO_NET_HDR_F_NEEDS_CSUM: u8 = 1;

/// GSO type: IPv4 TCP segmentation (TSO - TCP Segmentation Offload).
///
/// Large TCP packets can be sent and will be segmented by the kernel/driver.
pub const VIRTIO_NET_HDR_GSO_TCPV4: u8 = 1;

/// GSO type: IPv6 TCP segmentation (TSO).
///
/// Similar to TCPV4 but for IPv6 packets.
pub const VIRTIO_NET_HDR_GSO_TCPV6: u8 = 4;

/// GSO type: UDP segmentation for IPv4 and IPv6 (USO - UDP Segmentation Offload).
///
/// Available in newer Linux kernels for UDP packet segmentation.
pub const VIRTIO_NET_HDR_GSO_UDP_L4: u8 = 5;

/// Recommended batch size for packet operations with offload.
///
/// This constant defines the optimal number of packets to handle per `recv_multiple`
/// or `send_multiple` call. It balances between:
/// - Amortizing system call overhead
/// - Keeping latency reasonable
/// - Memory usage for packet buffers
///
/// Based on WireGuard-go's implementation.
///
/// # Example
///
/// ```no_run
/// # #[cfg(target_os = "linux")]
/// # {
/// use tun_rs::IDEAL_BATCH_SIZE;
///
/// // Allocate buffers for batch operations
/// let mut bufs = vec![vec![0u8; 1500]; IDEAL_BATCH_SIZE];
/// let mut sizes = vec![0; IDEAL_BATCH_SIZE];
/// # }
/// ```
///
/// See: <https://github.com/WireGuard/wireguard-go/blob/master/conn/conn.go#L19>
pub const IDEAL_BATCH_SIZE: usize = 128;

/// Byte offset of the flags byte within a TCP header (data offset byte is 12,
/// flag bits live in byte 13).
const TCP_FLAGS_OFFSET: usize = 13;

/// TCP FIN flag bit.
const TCP_FLAG_FIN: u8 = 0x01;
/// TCP PSH flag bit (marks the final segment of a reassembled GRO group).
const TCP_FLAG_PSH: u8 = 0x08;
/// TCP ACK flag bit.
const TCP_FLAG_ACK: u8 = 0x10;
166
167/// Virtio network header for offload support.
168///
169/// This structure precedes each packet when offload is enabled on a Linux TUN device.
170/// It provides metadata about Generic Segmentation Offload (GSO) and checksum requirements,
171/// allowing the kernel to perform hardware-accelerated operations.
172///
173/// The header matches the Linux kernel's `virtio_net_hdr` structure defined in
174/// `include/uapi/linux/virtio_net.h`.
175///
176/// # Memory Layout
177///
/// The structure is `#[repr(C)]` and has a fixed size of 10 bytes ([`VIRTIO_NET_HDR_LEN`]).
179/// All multi-byte fields are in native endianness.
180///
181/// # Usage
182///
183/// When reading from a TUN device with offload enabled:
184/// ```no_run
185/// # #[cfg(target_os = "linux")]
186/// # {
187/// use tun_rs::{VirtioNetHdr, VIRTIO_NET_HDR_LEN};
188///
189/// let mut buf = vec![0u8; VIRTIO_NET_HDR_LEN + 1500];
190/// // let n = dev.recv(&mut buf)?;
191///
192/// // Decode the header
193/// // let hdr = VirtioNetHdr::decode(&buf[..VIRTIO_NET_HDR_LEN])?;
194/// // let packet = &buf[VIRTIO_NET_HDR_LEN..n];
195/// # }
196/// ```
197///
198/// # Fields
199///
200/// - `flags`: Bit flags for header processing (e.g., [`VIRTIO_NET_HDR_F_NEEDS_CSUM`])
201/// - `gso_type`: Type of GSO applied (e.g., [`VIRTIO_NET_HDR_GSO_TCPV4`])
202/// - `hdr_len`: Length of packet headers (Ethernet + IP + TCP/UDP)
203/// - `gso_size`: Maximum segment size for GSO
204/// - `csum_start`: Offset to start checksum calculation
205/// - `csum_offset`: Offset within checksum area to store the checksum
206///
207/// # References
208///
209/// - [Linux virtio_net.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_net.h)
210///
211/// See: <https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_net.h>
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct VirtioNetHdr {
    // #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */
    // #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */
    // #define VIRTIO_NET_HDR_F_RSC_INFO 4 /* rsc info in csum_ fields */
    pub flags: u8,
    // #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */
    // #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */
    // #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */
    // #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */
    // #define VIRTIO_NET_HDR_GSO_UDP_L4 5 /* GSO frame, IPv4& IPv6 UDP (USO) */
    // #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */
    pub gso_type: u8,
    // Ethernet + IP + tcp/udp hdrs
    pub hdr_len: u16,
    // Bytes to append to hdr_len per frame
    pub gso_size: u16,
    // Checksum calculation
    pub csum_start: u16,
    pub csum_offset: u16,
}

impl VirtioNetHdr {
    /// Decode a virtio network header from a byte buffer.
    ///
    /// Reads the first [`VIRTIO_NET_HDR_LEN`] bytes from the buffer and interprets
    /// them as a `VirtioNetHdr` structure. Multi-byte fields are read in native
    /// endianness, matching the kernel's in-memory `virtio_net_hdr` layout.
    ///
    /// # Errors
    ///
    /// Returns an error if the buffer is too short (less than [`VIRTIO_NET_HDR_LEN`] bytes).
    ///
    /// # Example
    ///
    /// ```no_run
    /// # #[cfg(target_os = "linux")]
    /// # {
    /// use tun_rs::{VirtioNetHdr, VIRTIO_NET_HDR_LEN};
    ///
    /// let buffer = vec![0u8; VIRTIO_NET_HDR_LEN + 1500];
    /// let header = VirtioNetHdr::decode(&buffer)?;
    /// println!("GSO type: {:?}", header.gso_type);
    /// # }
    /// # Ok::<(), std::io::Error>(())
    /// ```
    pub fn decode(buf: &[u8]) -> io::Result<VirtioNetHdr> {
        if buf.len() < VIRTIO_NET_HDR_LEN {
            return Err(io::Error::new(io::ErrorKind::InvalidInput, "too short"));
        }
        // Safe field-by-field read. Offsets follow the `#[repr(C)]` layout,
        // which has no interior padding (1 + 1 + 2 + 2 + 2 + 2 = 10 bytes),
        // so this is byte-for-byte equivalent to a raw memcpy of the struct.
        Ok(VirtioNetHdr {
            flags: buf[0],
            gso_type: buf[1],
            hdr_len: u16::from_ne_bytes([buf[2], buf[3]]),
            gso_size: u16::from_ne_bytes([buf[4], buf[5]]),
            csum_start: u16::from_ne_bytes([buf[6], buf[7]]),
            csum_offset: u16::from_ne_bytes([buf[8], buf[9]]),
        })
    }

    /// Encode a virtio network header into a byte buffer.
    ///
    /// Writes this header into the first [`VIRTIO_NET_HDR_LEN`] bytes of the buffer,
    /// using native endianness for the multi-byte fields (the inverse of [`Self::decode`]).
    ///
    /// # Errors
    ///
    /// Returns an error if the buffer is too short (less than [`VIRTIO_NET_HDR_LEN`] bytes).
    ///
    /// # Example
    ///
    /// ```no_run
    /// # #[cfg(target_os = "linux")]
    /// # {
    /// use tun_rs::{VirtioNetHdr, VIRTIO_NET_HDR_GSO_NONE, VIRTIO_NET_HDR_LEN};
    ///
    /// let header = VirtioNetHdr {
    ///     gso_type: VIRTIO_NET_HDR_GSO_NONE,
    ///     ..Default::default()
    /// };
    ///
    /// let mut buffer = vec![0u8; VIRTIO_NET_HDR_LEN + 1500];
    /// header.encode(&mut buffer)?;
    /// # }
    /// # Ok::<(), std::io::Error>(())
    /// ```
    pub fn encode(&self, buf: &mut [u8]) -> io::Result<()> {
        if buf.len() < VIRTIO_NET_HDR_LEN {
            return Err(io::Error::new(io::ErrorKind::InvalidInput, "too short"));
        }
        // Mirror image of `decode`: same offsets, native endianness.
        buf[0] = self.flags;
        buf[1] = self.gso_type;
        buf[2..4].copy_from_slice(&self.hdr_len.to_ne_bytes());
        buf[4..6].copy_from_slice(&self.gso_size.to_ne_bytes());
        buf[6..8].copy_from_slice(&self.csum_start.to_ne_bytes());
        buf[8..10].copy_from_slice(&self.csum_offset.to_ne_bytes());
        Ok(())
    }
}

/// Size of the virtio network header in bytes (10 bytes).
///
/// This constant represents the fixed size of the `VirtioNetHdr` structure.
/// When offload is enabled on a TUN device, this header precedes every packet.
///
/// # Example
///
/// ```no_run
/// # #[cfg(target_os = "linux")]
/// # {
/// use tun_rs::VIRTIO_NET_HDR_LEN;
///
/// // Allocate buffer with space for header + packet
/// let mut buffer = vec![0u8; VIRTIO_NET_HDR_LEN + 1500];
///
/// // Header is at the start
/// // let header_bytes = &buffer[..VIRTIO_NET_HDR_LEN];
/// // Packet data follows the header
/// // let packet_data = &buffer[VIRTIO_NET_HDR_LEN..];
/// # }
/// ```
pub const VIRTIO_NET_HDR_LEN: usize = std::mem::size_of::<VirtioNetHdr>();
334
/// Identifier for a TCP flow used in Generic Receive Offload (GRO).
///
/// This structure uniquely identifies a TCP connection for packet coalescing.
/// Packets belonging to the same flow can be coalesced into larger segments,
/// reducing per-packet processing overhead.
///
/// # Fields
///
/// The flow is identified by:
/// - Source and destination IP addresses (IPv4 or IPv6)
/// - Source and destination ports
/// - TCP acknowledgment number (to avoid coalescing segments with different ACKs)
/// - IP version flag
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct TcpFlowKey {
    src_addr: [u8; 16], // source IP; an IPv4 address occupies the first 4 bytes, rest zero
    dst_addr: [u8; 16], // destination IP, same layout as `src_addr`
    src_port: u16,      // TCP source port, parsed big-endian from the header
    dst_port: u16,      // TCP destination port, parsed big-endian from the header
    rx_ack: u32, // varying ack values should not be coalesced. Treat them as separate flows.
    is_v6: bool, // true when the addresses are 16-byte IPv6 addresses
}
357
/// TCP Generic Receive Offload (GRO) table.
///
/// Manages the coalescing of TCP packets belonging to the same flow into larger segments.
/// This reduces the number of packets that need to be processed by the application,
/// improving throughput and reducing CPU usage.
///
/// # How TCP GRO Works
///
/// 1. Packets are received from the TUN device
/// 2. The GRO table identifies packets belonging to the same TCP flow
/// 3. Consecutive packets in the same flow are coalesced into a single large segment
/// 4. The coalesced segment is passed to the application
///
/// # Usage
///
/// The GRO table is typically used in conjunction with [`handle_gro`]:
///
/// ```no_run
/// # #[cfg(target_os = "linux")]
/// # {
/// use tun_rs::GROTable;
///
/// let mut gro_table = GROTable::default();
///
/// // Process received packets
/// // handle_gro(..., &mut gro_table, ...)?;
/// # }
/// ```
///
/// # Performance Considerations
///
/// - Maintains a hash map of active flows
/// - Preallocates buffers for [`IDEAL_BATCH_SIZE`] flows
/// - Memory pooling reduces allocations
/// - State is maintained across multiple recv_multiple calls
pub struct TcpGROTable {
    // Active flows and the GRO items tracked for each during one evaluation.
    items_by_flow: HashMap<TcpFlowKey, Vec<TcpGROItem>>,
    // Recycled item vectors; `reset()` returns drained vectors here for reuse.
    items_pool: Vec<Vec<TcpGROItem>>,
}
397
398impl Default for TcpGROTable {
399 fn default() -> Self {
400 Self::new()
401 }
402}
403
404impl TcpGROTable {
405 fn new() -> Self {
406 let mut items_pool = Vec::with_capacity(IDEAL_BATCH_SIZE);
407 for _ in 0..IDEAL_BATCH_SIZE {
408 items_pool.push(Vec::with_capacity(IDEAL_BATCH_SIZE));
409 }
410 TcpGROTable {
411 items_by_flow: HashMap::with_capacity(IDEAL_BATCH_SIZE),
412 items_pool,
413 }
414 }
415}
416
417impl TcpFlowKey {
418 fn new(pkt: &[u8], src_addr_offset: usize, dst_addr_offset: usize, tcph_offset: usize) -> Self {
419 let mut key = TcpFlowKey {
420 src_addr: [0; 16],
421 dst_addr: [0; 16],
422 src_port: 0,
423 dst_port: 0,
424 rx_ack: 0,
425 is_v6: false,
426 };
427
428 let addr_size = dst_addr_offset - src_addr_offset;
429 key.src_addr[..addr_size].copy_from_slice(&pkt[src_addr_offset..dst_addr_offset]);
430 key.dst_addr[..addr_size]
431 .copy_from_slice(&pkt[dst_addr_offset..dst_addr_offset + addr_size]);
432 key.src_port = BigEndian::read_u16(&pkt[tcph_offset..]);
433 key.dst_port = BigEndian::read_u16(&pkt[tcph_offset + 2..]);
434 key.rx_ack = BigEndian::read_u32(&pkt[tcph_offset + 8..]);
435 key.is_v6 = addr_size == 16;
436 key
437 }
438}
439
impl TcpGROTable {
    /// lookupOrInsert looks up a flow for the provided packet and metadata,
    /// returning the packets found for the flow, or inserting a new one if none
    /// is found.
    ///
    /// Returns `Some(items)` when the flow already existed; returns `None`
    /// after inserting a fresh entry for a previously unseen flow (callers use
    /// `None` as the "newly inserted, nothing to coalesce against" signal).
    fn lookup_or_insert(
        &mut self,
        pkt: &[u8],
        src_addr_offset: usize,
        dst_addr_offset: usize,
        tcph_offset: usize,
        tcph_len: usize,
        bufs_index: usize,
    ) -> Option<&mut Vec<TcpGROItem>> {
        let key = TcpFlowKey::new(pkt, src_addr_offset, dst_addr_offset, tcph_offset);
        // contains_key + get_mut performs a double lookup; this shape keeps the
        // borrow checker happy about returning a `&mut` out of the map.
        if self.items_by_flow.contains_key(&key) {
            return self.items_by_flow.get_mut(&key);
        }
        // Insert the new item into the table
        self.insert(
            pkt,
            src_addr_offset,
            dst_addr_offset,
            tcph_offset,
            tcph_len,
            bufs_index,
        );
        None
    }
    /// insert an item in the table for the provided packet and packet metadata.
    fn insert(
        &mut self,
        pkt: &[u8],
        src_addr_offset: usize,
        dst_addr_offset: usize,
        tcph_offset: usize,
        tcph_len: usize,
        bufs_index: usize,
    ) {
        let key = TcpFlowKey::new(pkt, src_addr_offset, dst_addr_offset, tcph_offset);
        let item = TcpGROItem {
            key,
            bufs_index: bufs_index as u16,
            num_merged: 0,
            // Payload length = everything after the IP + TCP headers.
            gso_size: pkt[tcph_offset + tcph_len..].len() as u16,
            // `pkt` starts at the IP header, so the TCP header offset doubles
            // as the IP header length.
            iph_len: tcph_offset as u8,
            tcph_len: tcph_len as u8,
            sent_seq: BigEndian::read_u32(&pkt[tcph_offset + 4..tcph_offset + 8]),
            psh_set: pkt[tcph_offset + TCP_FLAGS_OFFSET] & TCP_FLAG_PSH != 0,
        };

        // Reuse a pooled vector when one is available instead of allocating.
        let items = self
            .items_by_flow
            .entry(key)
            .or_insert_with(|| self.items_pool.pop().unwrap_or_default());
        items.push(item);
    }
}
497// func (t *tcpGROTable) updateAt(item tcpGROItem, i int) {
498// items, _ := t.itemsByFlow[item.key]
499// items[i] = item
500// }
501//
502// func (t *tcpGROTable) deleteAt(key tcpFlowKey, i int) {
503// items, _ := t.itemsByFlow[key]
504// items = append(items[:i], items[i+1:]...)
505// t.itemsByFlow[key] = items
506// }
507
/// tcpGROItem represents bookkeeping data for a TCP packet during the lifetime
/// of a GRO evaluation across a vector of packets.
#[derive(Debug, Clone, Copy)]
pub struct TcpGROItem {
    key: TcpFlowKey, // the flow this item belongs to
    sent_seq: u32, // the sequence number
    bufs_index: u16, // the index into the original bufs slice
    num_merged: u16, // the number of packets merged into this item
    gso_size: u16, // payload size
    iph_len: u8, // ip header len
    tcph_len: u8, // tcp header len
    psh_set: bool, // psh flag is set
}
521
522// func (t *tcpGROTable) newItems() []tcpGROItem {
523// var items []tcpGROItem
524// items, t.itemsPool = t.itemsPool[len(t.itemsPool)-1], t.itemsPool[:len(t.itemsPool)-1]
525// return items
526// }
527impl TcpGROTable {
528 fn reset(&mut self) {
529 for (_key, mut items) in self.items_by_flow.drain() {
530 items.clear();
531 self.items_pool.push(items);
532 }
533 }
534}
535
/// udpFlowKey represents the key for a UDP flow.
///
/// Unlike [`TcpFlowKey`] there is no sequence/ack component: UDP datagrams of
/// a flow are keyed purely by address pair, port pair, and IP version.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct UdpFlowKey {
    src_addr: [u8; 16], // srcAddr; IPv4 uses the first 4 bytes, rest zero
    dst_addr: [u8; 16], // dstAddr, same layout as `src_addr`
    src_port: u16, // srcPort
    dst_port: u16, // dstPort
    is_v6: bool, // isV6
}
545
/// udpGROTable holds flow and coalescing information for the purposes of UDP GRO.
pub struct UdpGROTable {
    // Active flows and the GRO items tracked for each during one evaluation.
    items_by_flow: HashMap<UdpFlowKey, Vec<UdpGROItem>>,
    // Recycled item vectors; `reset()` returns drained vectors here for reuse.
    items_pool: Vec<Vec<UdpGROItem>>,
}
551
552impl Default for UdpGROTable {
553 fn default() -> Self {
554 UdpGROTable::new()
555 }
556}
557
558impl UdpGROTable {
559 pub fn new() -> Self {
560 let mut items_pool = Vec::with_capacity(IDEAL_BATCH_SIZE);
561 for _ in 0..IDEAL_BATCH_SIZE {
562 items_pool.push(Vec::with_capacity(IDEAL_BATCH_SIZE));
563 }
564 UdpGROTable {
565 items_by_flow: HashMap::with_capacity(IDEAL_BATCH_SIZE),
566 items_pool,
567 }
568 }
569}
570
571impl UdpFlowKey {
572 pub fn new(
573 pkt: &[u8],
574 src_addr_offset: usize,
575 dst_addr_offset: usize,
576 udph_offset: usize,
577 ) -> UdpFlowKey {
578 let mut key = UdpFlowKey {
579 src_addr: [0; 16],
580 dst_addr: [0; 16],
581 src_port: 0,
582 dst_port: 0,
583 is_v6: false,
584 };
585 let addr_size = dst_addr_offset - src_addr_offset;
586 key.src_addr[..addr_size].copy_from_slice(&pkt[src_addr_offset..dst_addr_offset]);
587 key.dst_addr[..addr_size]
588 .copy_from_slice(&pkt[dst_addr_offset..dst_addr_offset + addr_size]);
589 key.src_port = BigEndian::read_u16(&pkt[udph_offset..]);
590 key.dst_port = BigEndian::read_u16(&pkt[udph_offset + 2..]);
591 key.is_v6 = addr_size == 16;
592 key
593 }
594}
595
impl UdpGROTable {
    /// Looks up a flow for the provided packet and metadata.
    /// Returns a reference to the packets found for the flow and a boolean indicating if the flow already existed.
    /// If the flow is not found, inserts a new flow and returns `None` for the items.
    fn lookup_or_insert(
        &mut self,
        pkt: &[u8],
        src_addr_offset: usize,
        dst_addr_offset: usize,
        udph_offset: usize,
        bufs_index: usize,
    ) -> Option<&mut Vec<UdpGROItem>> {
        let key = UdpFlowKey::new(pkt, src_addr_offset, dst_addr_offset, udph_offset);
        // contains_key + get_mut performs a double lookup; this shape keeps the
        // borrow checker happy about returning a `&mut` out of the map.
        if self.items_by_flow.contains_key(&key) {
            self.items_by_flow.get_mut(&key)
        } else {
            // If the flow does not exist, insert a new entry.
            // New entries start with the checksum state unknown (not invalid).
            self.insert(
                pkt,
                src_addr_offset,
                dst_addr_offset,
                udph_offset,
                bufs_index,
                false,
            );
            None
        }
    }
    /// Inserts an item in the table for the provided packet and its metadata.
    fn insert(
        &mut self,
        pkt: &[u8],
        src_addr_offset: usize,
        dst_addr_offset: usize,
        udph_offset: usize,
        bufs_index: usize,
        c_sum_known_invalid: bool,
    ) {
        let key = UdpFlowKey::new(pkt, src_addr_offset, dst_addr_offset, udph_offset);
        let item = UdpGROItem {
            key,
            bufs_index: bufs_index as u16,
            num_merged: 0,
            // Payload length = everything after the IP + fixed 8-byte UDP header.
            gso_size: (pkt.len() - (udph_offset + UDP_H_LEN)) as u16,
            // `pkt` starts at the IP header, so the UDP header offset doubles
            // as the IP header length.
            iph_len: udph_offset as u8,
            c_sum_known_invalid,
        };
        // Reuse a pooled vector when one is available instead of allocating.
        let items = self
            .items_by_flow
            .entry(key)
            .or_insert_with(|| self.items_pool.pop().unwrap_or_default());
        items.push(item);
    }
}
650// func (u *udpGROTable) updateAt(item udpGROItem, i int) {
651// items, _ := u.itemsByFlow[item.key]
652// items[i] = item
653// }
654
/// udpGROItem represents bookkeeping data for a UDP packet during the lifetime
/// of a GRO evaluation across a vector of packets.
#[derive(Debug, Clone, Copy)]
pub struct UdpGROItem {
    key: UdpFlowKey, // udpFlowKey
    bufs_index: u16, // the index into the original bufs slice
    num_merged: u16, // the number of packets merged into this item
    gso_size: u16, // payload size
    iph_len: u8, // ip header len
    c_sum_known_invalid: bool, // UDP header checksum validity; a false value DOES NOT imply valid, just unknown.
}
666// func (u *udpGROTable) newItems() []udpGROItem {
667// var items []udpGROItem
668// items, u.itemsPool = u.itemsPool[len(u.itemsPool)-1], u.itemsPool[:len(u.itemsPool)-1]
669// return items
670// }
671
672impl UdpGROTable {
673 fn reset(&mut self) {
674 for (_key, mut items) in self.items_by_flow.drain() {
675 items.clear();
676 self.items_pool.push(items);
677 }
678 }
679}
680
/// canCoalesce represents the outcome of checking if two TCP packets are
/// candidates for coalescing.
#[derive(Copy, Clone, Eq, PartialEq)]
enum CanCoalesce {
    /// The candidate packet directly precedes the tracked item
    /// (sequence-wise) and should be glued onto its front.
    Prepend,
    /// The packets cannot be merged.
    Unavailable,
    /// The candidate packet directly follows the tracked item and should be
    /// glued onto its end.
    Append,
}
689
/// ipHeadersCanCoalesce returns true if the IP headers found in pktA and pktB
/// meet all requirements to be merged as part of a GRO operation, otherwise it
/// returns false.
///
/// Both slices must start at the IP header. For IPv6 the version/traffic-class
/// bytes and the hop limit must match; for IPv4 the ToS byte, the DF/reserved
/// flag bits, and the TTL must match.
fn ip_headers_can_coalesce(pkt_a: &[u8], pkt_b: &[u8]) -> bool {
    // Every field inspected below lives within the first 9 bytes.
    if pkt_a.len() < 9 || pkt_b.len() < 9 {
        return false;
    }
    if pkt_a[0] >> 4 == 6 {
        // IPv6: byte 0 = version + TC high nibble, byte 1 high nibble = TC low
        // nibble, byte 7 = hop limit.
        pkt_a[0] == pkt_b[0] && pkt_a[1] >> 4 == pkt_b[1] >> 4 && pkt_a[7] == pkt_b[7]
    } else {
        // IPv4: byte 1 = ToS, top three bits of byte 6 = reserved/DF/MF flags
        // (MF itself is checked further up the stack), byte 8 = TTL.
        pkt_a[1] == pkt_b[1] && pkt_a[6] >> 5 == pkt_b[6] >> 5 && pkt_a[8] == pkt_b[8]
    }
}
725
726/// udpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
727/// described by item. iphLen and gsoSize describe pkt. bufs is the vector of
728/// packets involved in the current GRO evaluation. bufsOffset is the offset at
729/// which packet data begins within bufs.
730fn udp_packets_can_coalesce<B: ExpandBuffer>(
731 pkt: &[u8],
732 iph_len: u8,
733 gso_size: u16,
734 item: &UdpGROItem,
735 bufs: &[B],
736 bufs_offset: usize,
737) -> CanCoalesce {
738 let pkt_target = &bufs[item.bufs_index as usize].as_ref()[bufs_offset..];
739 if !ip_headers_can_coalesce(pkt, pkt_target) {
740 return CanCoalesce::Unavailable;
741 }
742 if (pkt_target[(iph_len as usize + UDP_H_LEN)..].len()) % (item.gso_size as usize) != 0 {
743 // A smaller than gsoSize packet has been appended previously.
744 // Nothing can come after a smaller packet on the end.
745 return CanCoalesce::Unavailable;
746 }
747 if gso_size > item.gso_size {
748 // We cannot have a larger packet following a smaller one.
749 return CanCoalesce::Unavailable;
750 }
751 CanCoalesce::Append
752}
753
/// tcpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
/// described by item. This function makes considerations that match the kernel's
/// GRO self tests, which can be found in tools/testing/selftests/net/gro.c.
///
/// `pkt` starts at the IP header; `iph_len`/`tcph_len`/`seq`/`psh_set`/`gso_size`
/// describe it. Returns `Append`/`Prepend` when `pkt` is sequence-adjacent to
/// the tracked item, `Unavailable` otherwise.
#[allow(clippy::too_many_arguments)]
fn tcp_packets_can_coalesce<B: ExpandBuffer>(
    pkt: &[u8],
    iph_len: u8,
    tcph_len: u8,
    seq: u32,
    psh_set: bool,
    gso_size: u16,
    item: &TcpGROItem,
    bufs: &[B],
    bufs_offset: usize,
) -> CanCoalesce {
    // The packet already accumulated for this flow.
    let pkt_target = &bufs[item.bufs_index as usize].as_ref()[bufs_offset..];

    if tcph_len != item.tcph_len {
        // cannot coalesce with unequal tcp options len
        return CanCoalesce::Unavailable;
    }

    // Beyond the fixed 20-byte TCP header, the raw option bytes must match.
    if tcph_len > 20
        && pkt[iph_len as usize + 20..iph_len as usize + tcph_len as usize]
            != pkt_target[item.iph_len as usize + 20..item.iph_len as usize + tcph_len as usize]
    {
        // cannot coalesce with unequal tcp options
        return CanCoalesce::Unavailable;
    }

    if !ip_headers_can_coalesce(pkt, pkt_target) {
        return CanCoalesce::Unavailable;
    }

    // seq adjacency
    // Total payload currently held by the item: its gso_size per merged
    // segment, plus one initial segment.
    let mut lhs_len = item.gso_size as usize;
    lhs_len += (item.num_merged as usize) * (item.gso_size as usize);

    if seq == item.sent_seq.wrapping_add(lhs_len as u32) {
        // pkt aligns following item from a seq num perspective
        if item.psh_set {
            // We cannot append to a segment that has the PSH flag set, PSH
            // can only be set on the final segment in a reassembled group.
            return CanCoalesce::Unavailable;
        }

        if pkt_target[iph_len as usize + tcph_len as usize..].len() % item.gso_size as usize != 0 {
            // A smaller than gsoSize packet has been appended previously.
            // Nothing can come after a smaller packet on the end.
            return CanCoalesce::Unavailable;
        }

        if gso_size > item.gso_size {
            // We cannot have a larger packet following a smaller one.
            return CanCoalesce::Unavailable;
        }

        return CanCoalesce::Append;
    } else if seq.wrapping_add(gso_size as u32) == item.sent_seq {
        // pkt aligns in front of item from a seq num perspective
        if psh_set {
            // We cannot prepend with a segment that has the PSH flag set, PSH
            // can only be set on the final segment in a reassembled group.
            return CanCoalesce::Unavailable;
        }

        if gso_size < item.gso_size {
            // We cannot have a larger packet following a smaller one.
            return CanCoalesce::Unavailable;
        }

        if gso_size > item.gso_size && item.num_merged > 0 {
            // There's at least one previous merge, and we're larger than all
            // previous. This would put multiple smaller packets on the end.
            return CanCoalesce::Unavailable;
        }

        return CanCoalesce::Prepend;
    }

    CanCoalesce::Unavailable
}
836
/// Returns whether the transport (TCP/UDP) checksum embedded in `pkt` is valid.
///
/// `pkt` must start at the IP header; `iph_len` is the IP header length and
/// `proto` the transport protocol number used in the pseudo header. Panics if
/// `pkt` is too short to hold the IP addresses at the version-specific offsets
/// — callers are expected to have validated lengths first.
fn checksum_valid(pkt: &[u8], iph_len: u8, proto: u8, is_v6: bool) -> bool {
    let (src_addr_at, addr_size) = if is_v6 {
        (IPV6_SRC_ADDR_OFFSET, 16)
    } else {
        (IPV4_SRC_ADDR_OFFSET, 4)
    };

    // Transport-segment length for the pseudo header (saturating guards a
    // malformed iph_len larger than the packet).
    let len_for_pseudo = (pkt.len() as u16).saturating_sub(iph_len as u16);

    let c_sum = pseudo_header_checksum_no_fold(
        proto,
        &pkt[src_addr_at..src_addr_at + addr_size],
        // Destination address immediately follows the source address.
        &pkt[src_addr_at + addr_size..src_addr_at + addr_size * 2],
        len_for_pseudo,
    );

    // `!` is bitwise NOT on the folded sum — the port of wireguard-go's
    // `^checksum(...) == 0`: a segment is valid when the ones'-complement sum
    // over pseudo header + transport header + payload folds to all-ones.
    !checksum(&pkt[iph_len as usize..], c_sum) == 0
}
855
/// coalesceResult represents the result of attempting to coalesce two TCP
/// packets.
enum CoalesceResult {
    /// The destination buffer's capacity cannot hold the merged payload
    /// (coalescing never reallocates the underlying storage).
    InsufficientCap,
    /// The candidate segment has PSH set and therefore cannot be prepended —
    /// PSH may only terminate a reassembled group.
    PSHEnding,
    /// The tracked item's own checksum is (or was flagged) invalid; it must
    /// not anchor a merge.
    ItemInvalidCSum,
    /// The candidate packet's checksum is invalid.
    PktInvalidCSum,
    /// The packets were merged.
    Success,
}
865
/// coalesceUDPPackets attempts to coalesce pkt with the packet described by
/// item, and returns the outcome.
///
/// On success the payload of `pkt` (everything past its IP + UDP headers) is
/// appended to the buffer already tracked by `item`, and `item.num_merged` is
/// bumped. Checksums of both sides are verified before the first merge.
fn coalesce_udp_packets<B: ExpandBuffer>(
    pkt: &[u8],
    item: &mut UdpGROItem,
    bufs: &mut [B],
    bufs_offset: usize,
    is_v6: bool,
) -> CoalesceResult {
    let buf = bufs[item.bufs_index as usize].as_ref();
    // let pkt_head = &buf[bufs_offset..]; // the packet that will end up at the front
    let headers_len = item.iph_len as usize + UDP_H_LEN;
    // Length of the merged packet: existing bytes + pkt minus its headers.
    let coalesced_len = buf[bufs_offset..].len() + pkt.len() - headers_len;
    if bufs[item.bufs_index as usize].buf_capacity() < bufs_offset * 2 + coalesced_len {
        // We don't want to allocate a new underlying array if capacity is
        // too small.
        return CoalesceResult::InsufficientCap;
    }

    // Only the very first merge validates the anchor packet's checksum; later
    // merges trust the earlier validation.
    if item.num_merged == 0
        && (item.c_sum_known_invalid
            || !checksum_valid(&buf[bufs_offset..], item.iph_len, IPPROTO_UDP as _, is_v6))
    {
        return CoalesceResult::ItemInvalidCSum;
    }

    if !checksum_valid(pkt, item.iph_len, IPPROTO_UDP as _, is_v6) {
        return CoalesceResult::PktInvalidCSum;
    }
    // Append only the payload; the anchor keeps its own IP/UDP headers.
    bufs[item.bufs_index as usize].buf_extend_from_slice(&pkt[headers_len..]);
    item.num_merged += 1;
    CoalesceResult::Success
}
899
/// coalesceTCPPackets attempts to coalesce pkt with the packet described by
/// item, and returns the outcome. This function may swap bufs elements in the
/// event of a prepend as item's bufs index is already being tracked for writing
/// to a Device.
///
/// `mode` selects append vs. prepend (decided earlier by
/// `tcp_packets_can_coalesce`); checksums on both sides are validated before
/// the first merge, and `item` bookkeeping (seq, gso_size, num_merged,
/// psh_set) is updated on success.
#[allow(clippy::too_many_arguments)]
fn coalesce_tcp_packets<B: ExpandBuffer>(
    mode: CanCoalesce,
    pkt: &[u8],
    pkt_bufs_index: usize,
    gso_size: u16,
    seq: u32,
    psh_set: bool,
    item: &mut TcpGROItem,
    bufs: &mut [B],
    bufs_offset: usize,
    is_v6: bool,
) -> CoalesceResult {
    let pkt_head: &[u8]; // the packet that will end up at the front
    let headers_len = (item.iph_len + item.tcph_len) as usize;
    // Length of the merged packet: existing bytes + pkt minus one set of headers.
    let coalesced_len =
        bufs[item.bufs_index as usize].as_ref()[bufs_offset..].len() + pkt.len() - headers_len;
    // Copy data
    if mode == CanCoalesce::Prepend {
        pkt_head = pkt;
        if bufs[pkt_bufs_index].buf_capacity() < 2 * bufs_offset + coalesced_len {
            // We don't want to allocate a new underlying array if capacity is
            // too small.
            return CoalesceResult::InsufficientCap;
        }
        if psh_set {
            return CoalesceResult::PSHEnding;
        }
        // Only the first merge validates the anchor packet's checksum.
        if item.num_merged == 0
            && !checksum_valid(
                &bufs[item.bufs_index as usize].as_ref()[bufs_offset..],
                item.iph_len,
                IPPROTO_TCP as _,
                is_v6,
            )
        {
            return CoalesceResult::ItemInvalidCSum;
        }
        if !checksum_valid(pkt, item.iph_len, IPPROTO_TCP as _, is_v6) {
            return CoalesceResult::PktInvalidCSum;
        }
        // The prepended packet's sequence number becomes the group's start.
        item.sent_seq = seq;
        let extend_by = coalesced_len - pkt_head.len();
        let len = bufs[pkt_bufs_index].as_ref().len();
        bufs[pkt_bufs_index].buf_resize(len + extend_by, 0);
        let src = bufs[item.bufs_index as usize].as_ref()[bufs_offset + headers_len..].as_ptr();
        let dst = bufs[pkt_bufs_index].as_mut()[bufs_offset + pkt.len()..].as_mut_ptr();
        unsafe {
            // SAFETY: `extend_by` bytes are in bounds on both sides (dst was
            // just grown by `extend_by`). Non-overlap relies on `src` and
            // `dst` living in distinct bufs elements, i.e.
            // `item.bufs_index != pkt_bufs_index` — NOTE(review): assumed to
            // hold for the prepend path; confirm the caller guarantees it.
            std::ptr::copy_nonoverlapping(src, dst, extend_by);
        }
        // Flip the slice headers in bufs as part of prepend. The index of item
        // is already being tracked for writing.
        bufs.swap(item.bufs_index as usize, pkt_bufs_index);
    } else {
        // pkt_head = &bufs[item.bufs_index as usize][bufs_offset..];
        if bufs[item.bufs_index as usize].buf_capacity() < 2 * bufs_offset + coalesced_len {
            // We don't want to allocate a new underlying array if capacity is
            // too small.
            return CoalesceResult::InsufficientCap;
        }
        // Only the first merge validates the anchor packet's checksum.
        if item.num_merged == 0
            && !checksum_valid(
                &bufs[item.bufs_index as usize].as_ref()[bufs_offset..],
                item.iph_len,
                IPPROTO_TCP as _,
                is_v6,
            )
        {
            return CoalesceResult::ItemInvalidCSum;
        }
        if !checksum_valid(pkt, item.iph_len, IPPROTO_TCP as _, is_v6) {
            return CoalesceResult::PktInvalidCSum;
        }
        if psh_set {
            // We are appending a segment with PSH set.
            item.psh_set = psh_set;
            // Propagate PSH into the anchor's TCP flags byte.
            bufs[item.bufs_index as usize].as_mut()
                [bufs_offset + item.iph_len as usize + TCP_FLAGS_OFFSET] |= TCP_FLAG_PSH;
        }
        // https://github.com/WireGuard/wireguard-go/blob/12269c2761734b15625017d8565745096325392f/tun/offload_linux.go#L495
        // extendBy := len(pkt) - int(headersLen)
        // bufs[item.bufsIndex] = append(bufs[item.bufsIndex], make([]byte, extendBy)...)
        // copy(bufs[item.bufsIndex][bufsOffset+len(pktHead):], pkt[headersLen:])
        bufs[item.bufs_index as usize].buf_extend_from_slice(&pkt[headers_len..]);
    }

    // Track the largest segment size seen for the group.
    if gso_size > item.gso_size {
        item.gso_size = gso_size;
    }

    item.num_merged += 1;
    CoalesceResult::Success
}
997
/// The "More Fragments" (MF) bit within byte 6 of the IPv4 header
/// (the flags / fragment-offset field).
const IPV4_FLAG_MORE_FRAGMENTS: u8 = 0x20;

/// Byte offset of the source address field in an IPv4 header.
const IPV4_SRC_ADDR_OFFSET: usize = 12;
/// Byte offset of the source address field in an IPv6 header.
const IPV6_SRC_ADDR_OFFSET: usize = 8;
// wireguard-go's `maxUint16 = 1<<16 - 1` is expressed via `u16::MAX` here.
1003
/// Outcome of evaluating a single packet for GRO (see [`tcp_gro`] / [`udp_gro`]).
#[derive(PartialEq, Eq)]
enum GroResult {
    /// No action was taken; the packet is passed through unmodified.
    Noop,
    /// The packet became a new coalescing head in the flow table.
    TableInsert,
    /// The packet was merged into an existing packet in the flow table.
    Coalesced,
}
1010
1011/// tcpGRO evaluates the TCP packet at pktI in bufs for coalescing with
1012/// existing packets tracked in table. It returns a groResultNoop when no
1013/// action was taken, groResultTableInsert when the evaluated packet was
1014/// inserted into table, and groResultCoalesced when the evaluated packet was
1015/// coalesced with another packet in table.
fn tcp_gro<B: ExpandBuffer>(
    bufs: &mut [B],
    offset: usize,
    pkt_i: usize,
    table: &mut TcpGROTable,
    is_v6: bool,
) -> GroResult {
    // NOTE(review): `pkt` is an unchecked immutable alias of bufs[pkt_i] held
    // across later `&mut bufs` uses; soundness appears to rely on the
    // coalescing helpers never reallocating this buffer (they pre-check
    // capacity before resizing) — inherited from the wireguard-go port,
    // confirm before refactoring.
    let pkt = unsafe { &*(&bufs[pkt_i].as_ref()[offset..] as *const [u8]) };
    if pkt.len() > u16::MAX as usize {
        // A valid IPv4 or IPv6 packet will never exceed this.
        return GroResult::Noop;
    }

    // IPv4: header length from the IHL nibble; IPv6: fixed 40-byte header.
    // In both cases the IP length field must agree with the slice length.
    let mut iph_len = ((pkt[0] & 0x0F) * 4) as usize;
    if is_v6 {
        iph_len = 40;
        let ipv6_h_payload_len = u16::from_be_bytes([pkt[4], pkt[5]]) as usize;
        if ipv6_h_payload_len != pkt.len() - iph_len {
            return GroResult::Noop;
        }
    } else {
        let total_len = u16::from_be_bytes([pkt[2], pkt[3]]) as usize;
        if total_len != pkt.len() {
            return GroResult::Noop;
        }
    }

    if pkt.len() < iph_len {
        return GroResult::Noop;
    }

    // TCP data offset (header length) must be a sane 20..=60 bytes.
    let tcph_len = ((pkt[iph_len + 12] >> 4) * 4) as usize;
    if !(20..=60).contains(&tcph_len) {
        return GroResult::Noop;
    }

    if pkt.len() < iph_len + tcph_len {
        return GroResult::Noop;
    }

    // IPv4 bytes 6..8 hold flags + fragment offset; `pkt[6] << 3` discards
    // the three flag bits, leaving the high fragment-offset bits.
    if !is_v6 && (pkt[6] & IPV4_FLAG_MORE_FRAGMENTS != 0 || pkt[6] << 3 != 0 || pkt[7] != 0) {
        // no GRO support for fragmented segments for now
        return GroResult::Noop;
    }

    let tcp_flags = pkt[iph_len + TCP_FLAGS_OFFSET];
    let mut psh_set = false;

    // not a candidate if any non-ACK flags (except PSH+ACK) are set
    if tcp_flags != TCP_FLAG_ACK {
        if pkt[iph_len + TCP_FLAGS_OFFSET] != TCP_FLAG_ACK | TCP_FLAG_PSH {
            return GroResult::Noop;
        }
        psh_set = true;
    }

    let gso_size = (pkt.len() - tcph_len - iph_len) as u16;
    // not a candidate if payload len is 0
    if gso_size < 1 {
        return GroResult::Noop;
    }

    // TCP sequence number of the evaluated packet.
    let seq = u32::from_be_bytes([
        pkt[iph_len + 4],
        pkt[iph_len + 5],
        pkt[iph_len + 6],
        pkt[iph_len + 7],
    ]);

    let mut src_addr_offset = IPV4_SRC_ADDR_OFFSET;
    let mut addr_len = 4;
    if is_v6 {
        src_addr_offset = IPV6_SRC_ADDR_OFFSET;
        addr_len = 16;
    }

    // `None` means the flow was absent and lookup_or_insert already stored
    // this packet as a new flow head.
    let items = if let Some(items) = table.lookup_or_insert(
        pkt,
        src_addr_offset,
        src_addr_offset + addr_len,
        iph_len,
        tcph_len,
        pkt_i,
    ) {
        items
    } else {
        return GroResult::TableInsert;
    };

    for i in (0..items.len()).rev() {
        // In the best case of packets arriving in order iterating in reverse is
        // more efficient if there are multiple items for a given flow. This
        // also enables a natural table.delete_at() in the
        // coalesce_item_invalid_csum case without the need for index tracking.
        // This algorithm makes a best effort to coalesce in the event of
        // unordered packets, where pkt may land anywhere in items from a
        // sequence number perspective, however once an item is inserted into
        // the table it is never compared across other items later.
        let item = &mut items[i];
        let can = tcp_packets_can_coalesce(
            pkt,
            iph_len as u8,
            tcph_len as u8,
            seq,
            psh_set,
            gso_size,
            item,
            bufs,
            offset,
        );

        match can {
            CanCoalesce::Unavailable => {}
            _ => {
                let result = coalesce_tcp_packets(
                    can, pkt, pkt_i, gso_size, seq, psh_set, item, bufs, offset, is_v6,
                );

                match result {
                    CoalesceResult::Success => {
                        // `item` is a direct mutable reference into the table,
                        // so wireguard-go's table.update_at(item, i) is not needed.
                        return GroResult::Coalesced;
                    }
                    CoalesceResult::ItemInvalidCSum => {
                        // delete the item with an invalid csum
                        // (equivalent of wireguard-go's table.delete_at(item.key, i))
                        items.remove(i);
                    }
                    CoalesceResult::PktInvalidCSum => {
                        // no point in inserting an item that we can't coalesce
                        return GroResult::Noop;
                    }
                    _ => {}
                }
            }
        }
    }

    // failed to coalesce with any other packets; store the item in the flow
    table.insert(
        pkt,
        src_addr_offset,
        src_addr_offset + addr_len,
        iph_len,
        tcph_len,
        pkt_i,
    );
    GroResult::TableInsert
}
1165
1166/// Update packet headers after TCP packet coalescing.
1167///
1168/// After [`handle_gro`] coalesces multiple TCP packets into larger segments,
1169/// this function updates the packet headers to reflect the coalesced state.
1170/// It writes virtio headers with GSO information and updates IP/TCP headers.
1171///
1172/// # Arguments
1173///
1174/// * `bufs` - Mutable slice of packet buffers that were processed by GRO
1175/// * `offset` - Offset where packet data begins (typically [`VIRTIO_NET_HDR_LEN`])
1176/// * `table` - The TCP GRO table containing coalescing metadata
1177///
1178/// # What It Does
1179///
1180/// For each coalesced packet:
1181/// 1. Creates a virtio header with GSO type set to TCP (v4 or v6)
1182/// 2. Sets the segment size (`gso_size`) for future segmentation
1183/// 3. Calculates and stores the pseudo-header checksum for TCP
1184/// 4. Updates IP total length field
1185/// 5. Recalculates IPv4 header checksum if needed
1186///
1187/// The resulting packets can be efficiently segmented by the kernel when transmitted.
1188///
1189/// # Usage
1190///
1191/// This function is typically called automatically by [`handle_gro`] after packet
1192/// coalescing is complete. You usually don't need to call it directly.
1193///
1194/// # Errors
1195///
1196/// Returns an error if:
1197/// - Buffer sizes are incorrect
1198/// - Header encoding fails
1199/// - Packet structure is invalid
1200///
1201/// # See Also
1202///
1203/// - [`handle_gro`] - Main GRO processing function that calls this
1204/// - [`TcpGROTable`] - Maintains TCP flow state for coalescing
pub fn apply_tcp_coalesce_accounting<B: ExpandBuffer>(
    bufs: &mut [B],
    offset: usize,
    table: &TcpGROTable,
) -> io::Result<()> {
    for items in table.items_by_flow.values() {
        for item in items {
            if item.num_merged > 0 {
                // This packet absorbed others: describe it to the kernel as a
                // GSO packet so it can be re-segmented on transmit.
                let mut hdr = VirtioNetHdr {
                    flags: VIRTIO_NET_HDR_F_NEEDS_CSUM,
                    hdr_len: (item.iph_len + item.tcph_len) as u16,
                    gso_size: item.gso_size,
                    csum_start: item.iph_len as u16,
                    // The TCP checksum field sits 16 bytes into the TCP header.
                    csum_offset: 16,
                    gso_type: 0, // Will be set later
                };
                let buf = bufs[item.bufs_index as usize].as_mut();
                let pkt = &mut buf[offset..];
                let pkt_len = pkt.len();

                // Calculate the pseudo header checksum and place it at the TCP
                // checksum offset. Downstream checksum offloading will combine
                // this with computation of the tcp header and payload checksum.
                let addr_len = if item.key.is_v6 { 16 } else { 4 };
                let src_addr_at = if item.key.is_v6 {
                    IPV6_SRC_ADDR_OFFSET
                } else {
                    IPV4_SRC_ADDR_OFFSET
                };

                // NOTE(review): raw-pointer borrows alias `pkt` so the address
                // bytes stay readable after `pkt` is re-borrowed mutably below;
                // sound only while `buf` is not reallocated (it is not here).
                let src_addr =
                    unsafe { &*(&pkt[src_addr_at..src_addr_at + addr_len] as *const [u8]) };
                let dst_addr = unsafe {
                    &*(&pkt[src_addr_at + addr_len..src_addr_at + addr_len * 2] as *const [u8])
                };
                // Recalculate the total len (IPv4) or payload len (IPv6).
                // Recalculate the (IPv4) header checksum.
                if item.key.is_v6 {
                    hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
                    BigEndian::write_u16(&mut pkt[4..6], pkt_len as u16 - item.iph_len as u16);
                } else {
                    hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                    // Zero the header checksum field before recomputing it.
                    pkt[10] = 0;
                    pkt[11] = 0;
                    BigEndian::write_u16(&mut pkt[2..4], pkt_len as u16);
                    let iph_csum = !checksum(&pkt[..item.iph_len as usize], 0);
                    BigEndian::write_u16(&mut pkt[10..12], iph_csum);
                }

                // Write the virtio header into the space reserved before `offset`.
                hdr.encode(&mut buf[offset - VIRTIO_NET_HDR_LEN..])?;

                let pkt = &mut buf[offset..];

                let psum = pseudo_header_checksum_no_fold(
                    IPPROTO_TCP as _,
                    src_addr,
                    dst_addr,
                    pkt_len as u16 - item.iph_len as u16,
                );
                // Fold the pseudo-header sum and park it in the TCP checksum
                // field; the kernel completes the checksum from csum_start.
                let tcp_csum = checksum(&[], psum);
                BigEndian::write_u16(
                    &mut pkt[(hdr.csum_start + hdr.csum_offset) as usize..],
                    tcp_csum,
                );
            } else {
                // Never merged: emit with a default (no-offload) virtio header.
                let hdr = VirtioNetHdr::default();
                hdr.encode(
                    &mut bufs[item.bufs_index as usize].as_mut()[offset - VIRTIO_NET_HDR_LEN..],
                )?;
            }
        }
    }
    Ok(())
}
1279
1280// applyUDPCoalesceAccounting updates bufs to account for coalescing based on the
1281// metadata found in table.
pub fn apply_udp_coalesce_accounting<B: ExpandBuffer>(
    bufs: &mut [B],
    offset: usize,
    table: &UdpGROTable,
) -> io::Result<()> {
    for items in table.items_by_flow.values() {
        for item in items {
            if item.num_merged > 0 {
                // Coalesced packet: describe it as UDP GSO so the kernel
                // re-segments it on transmit.
                let hdr = VirtioNetHdr {
                    flags: VIRTIO_NET_HDR_F_NEEDS_CSUM, // this turns into CHECKSUM_PARTIAL in the skb
                    hdr_len: item.iph_len as u16 + UDP_H_LEN as u16,
                    gso_size: item.gso_size,
                    csum_start: item.iph_len as u16,
                    // The UDP checksum field sits 6 bytes into the UDP header.
                    csum_offset: 6,
                    gso_type: VIRTIO_NET_HDR_GSO_UDP_L4,
                };

                let buf = bufs[item.bufs_index as usize].as_mut();
                let pkt = &mut buf[offset..];
                let pkt_len = pkt.len();

                // Calculate the pseudo header checksum and place it at the UDP
                // checksum offset. Downstream checksum offloading will combine
                // this with computation of the udp header and payload checksum.
                let (addr_len, src_addr_at) = if item.key.is_v6 {
                    (16, IPV6_SRC_ADDR_OFFSET)
                } else {
                    (4, IPV4_SRC_ADDR_OFFSET)
                };

                // NOTE(review): raw-pointer borrows alias `pkt` so the address
                // bytes stay readable after `pkt` is re-borrowed mutably below;
                // sound only while `buf` is not reallocated (it is not here).
                let src_addr =
                    unsafe { &*(&pkt[src_addr_at..(src_addr_at + addr_len)] as *const [u8]) };
                let dst_addr = unsafe {
                    &*(&pkt[(src_addr_at + addr_len)..(src_addr_at + addr_len * 2)]
                        as *const [u8])
                };

                // Recalculate the total len (IPv4) or payload len (IPv6).
                // Recalculate the (IPv4) header checksum.
                if item.key.is_v6 {
                    BigEndian::write_u16(&mut pkt[4..6], pkt_len as u16 - item.iph_len as u16);
                    // set new IPv6 header payload len
                } else {
                    pkt[10] = 0;
                    pkt[11] = 0;
                    BigEndian::write_u16(&mut pkt[2..4], pkt_len as u16); // set new total length
                    let iph_csum = !checksum(&pkt[..item.iph_len as usize], 0);
                    BigEndian::write_u16(&mut pkt[10..12], iph_csum); // set IPv4 header checksum field
                }

                // Write the virtio header into the space reserved before `offset`.
                hdr.encode(&mut buf[offset - VIRTIO_NET_HDR_LEN..])?;
                let pkt = &mut buf[offset..];
                // Recalculate the UDP len field value
                BigEndian::write_u16(
                    &mut pkt[(item.iph_len as usize + 4)..(item.iph_len as usize + 6)],
                    pkt_len as u16 - item.iph_len as u16,
                );

                let psum = pseudo_header_checksum_no_fold(
                    IPPROTO_UDP as _,
                    src_addr,
                    dst_addr,
                    pkt_len as u16 - item.iph_len as u16,
                );

                // Fold the pseudo-header sum into the UDP checksum field; the
                // kernel completes the checksum from csum_start.
                let udp_csum = checksum(&[], psum);
                BigEndian::write_u16(
                    &mut pkt[(hdr.csum_start + hdr.csum_offset) as usize..],
                    udp_csum,
                );
            } else {
                // Never merged: emit with a default (no-offload) virtio header.
                let hdr = VirtioNetHdr::default();
                hdr.encode(
                    &mut bufs[item.bufs_index as usize].as_mut()[offset - VIRTIO_NET_HDR_LEN..],
                )?;
            }
        }
    }
    Ok(())
}
1362
/// Classification of a packet's eligibility for GRO coalescing.
#[derive(PartialEq, Eq)]
pub enum GroCandidateType {
    /// Not eligible for coalescing; pass through unmodified.
    NotGRO,
    /// TCP over IPv4 candidate.
    Tcp4GRO,
    /// TCP over IPv6 candidate.
    Tcp6GRO,
    /// UDP over IPv4 candidate (requires kernel UDP GRO support).
    Udp4GRO,
    /// UDP over IPv6 candidate (requires kernel UDP GRO support).
    Udp6GRO,
}

/// Classify `b` as a GRO candidate from its IP version nibble, transport
/// protocol and minimum plausible length. `can_udp_gro` gates the UDP
/// variants on kernel support.
pub fn packet_is_gro_candidate(b: &[u8], can_udp_gro: bool) -> GroCandidateType {
    // 28 bytes = minimal IPv4 header (20) + UDP header (8); nothing shorter
    // can hold any transport header we coalesce.
    if b.len() < 28 {
        return GroCandidateType::NotGRO;
    }
    match b[0] >> 4 {
        4 => {
            // IPv4 packets w/IP options (IHL != 5) do not coalesce.
            if b[0] & 0x0F != 5 {
                return GroCandidateType::NotGRO;
            }
            let proto = b[9];
            if proto == 6 && b.len() >= 40 {
                GroCandidateType::Tcp4GRO
            } else if proto == 17 && can_udp_gro {
                GroCandidateType::Udp4GRO
            } else {
                GroCandidateType::NotGRO
            }
        }
        6 => {
            let next_header = b[6];
            if next_header == 6 && b.len() >= 60 {
                GroCandidateType::Tcp6GRO
            } else if next_header == 17 && b.len() >= 48 && can_udp_gro {
                GroCandidateType::Udp6GRO
            } else {
                GroCandidateType::NotGRO
            }
        }
        _ => GroCandidateType::NotGRO,
    }
}
1395
1396const UDP_H_LEN: usize = 8;
1397
1398/// udpGRO evaluates the UDP packet at pktI in bufs for coalescing with
1399/// existing packets tracked in table. It returns a groResultNoop when no
1400/// action was taken, groResultTableInsert when the evaluated packet was
1401/// inserted into table, and groResultCoalesced when the evaluated packet was
1402/// coalesced with another packet in table.
fn udp_gro<B: ExpandBuffer>(
    bufs: &mut [B],
    offset: usize,
    pkt_i: usize,
    table: &mut UdpGROTable,
    is_v6: bool,
) -> GroResult {
    // NOTE(review): `pkt` is an unchecked immutable alias of bufs[pkt_i] held
    // across later `&mut bufs` uses; soundness appears to rely on the helpers
    // never reallocating this buffer — inherited from the wireguard-go port.
    let pkt = unsafe { &*(&bufs[pkt_i].as_ref()[offset..] as *const [u8]) };
    if pkt.len() > u16::MAX as usize {
        // A valid IPv4 or IPv6 packet will never exceed this.
        return GroResult::Noop;
    }

    // IPv4: header length from the IHL nibble; IPv6: fixed 40-byte header.
    // In both cases the IP length field must agree with the slice length.
    let mut iph_len = ((pkt[0] & 0x0F) * 4) as usize;
    if is_v6 {
        iph_len = 40;
        let ipv6_payload_len = u16::from_be_bytes([pkt[4], pkt[5]]) as usize;
        if ipv6_payload_len != pkt.len() - iph_len {
            return GroResult::Noop;
        }
    } else {
        let total_len = u16::from_be_bytes([pkt[2], pkt[3]]) as usize;
        if total_len != pkt.len() {
            return GroResult::Noop;
        }
    }

    if pkt.len() < iph_len || pkt.len() < iph_len + UDP_H_LEN {
        return GroResult::Noop;
    }

    // IPv4 bytes 6..8 hold flags + fragment offset; `pkt[6] << 3` discards
    // the three flag bits, leaving the high fragment-offset bits.
    if !is_v6 && (pkt[6] & IPV4_FLAG_MORE_FRAGMENTS != 0 || pkt[6] << 3 != 0 || pkt[7] != 0) {
        // No GRO support for fragmented segments for now.
        return GroResult::Noop;
    }

    // Per-segment payload size; zero-length payloads are not candidates.
    let gso_size = (pkt.len() - UDP_H_LEN - iph_len) as u16;
    if gso_size < 1 {
        return GroResult::Noop;
    }

    let (src_addr_offset, addr_len) = if is_v6 {
        (IPV6_SRC_ADDR_OFFSET, 16)
    } else {
        (IPV4_SRC_ADDR_OFFSET, 4)
    };

    let items = table.lookup_or_insert(
        pkt,
        src_addr_offset,
        src_addr_offset + addr_len,
        iph_len,
        pkt_i,
    );

    // `None` means the flow was absent and lookup_or_insert already stored
    // this packet as a new flow head.
    let items = if let Some(items) = items {
        items
    } else {
        return GroResult::TableInsert;
    };

    // Only check the last item to prevent reordering packets for a flow.
    let items_len = items.len();
    let item = &mut items[items_len - 1];
    let can = udp_packets_can_coalesce(pkt, iph_len as u8, gso_size, item, bufs, offset);
    let mut pkt_csum_known_invalid = false;

    if can == CanCoalesce::Append {
        match coalesce_udp_packets(pkt, item, bufs, offset, is_v6) {
            CoalesceResult::Success => {
                // `item` is a direct mutable reference into the table, so
                // wireguard-go's table.update_at(*item, items_len - 1) is not
                // needed here.
                return GroResult::Coalesced;
            }
            CoalesceResult::ItemInvalidCSum => {
                // If the existing item has an invalid checksum, take no action.
                // A new item will be stored, and the existing item won't be revisited.
            }
            CoalesceResult::PktInvalidCSum => {
                // Insert a new item but mark it with invalid checksum to avoid repeat checks.
                pkt_csum_known_invalid = true;
            }
            _ => {}
        }
    }
    let pkt = &bufs[pkt_i].as_ref()[offset..];
    // Failed to coalesce; store the packet in the flow.
    table.insert(
        pkt,
        src_addr_offset,
        src_addr_offset + addr_len,
        iph_len,
        pkt_i,
        pkt_csum_known_invalid,
    );
    GroResult::TableInsert
}
1500
/// Process received packets and apply Generic Receive Offload (GRO)
/// coalescing, writing the indices of the resulting packets into `to_write`.
1503///
1504/// This function examines a batch of received packets and coalesces packets belonging
1505/// to the same TCP or UDP flow into larger segments, reducing per-packet overhead.
1506///
1507/// # Arguments
1508///
1509/// * `bufs` - Mutable slice of packet buffers. Each buffer should contain a full packet
1510/// starting at `offset` (with space before offset for the virtio header).
1511/// * `offset` - Offset where packet data begins (typically [`VIRTIO_NET_HDR_LEN`]).
1512/// The virtio header will be written before this offset.
1513/// * `tcp_table` - TCP GRO table for tracking TCP flows.
1514/// * `udp_table` - UDP GRO table for tracking UDP flows.
1515/// * `can_udp_gro` - Whether UDP GRO is supported (kernel feature).
1516/// * `to_write` - Output vector that will be filled with indices of packets to write.
1517/// Initially should be empty.
1518///
1519/// # Returns
1520///
1521/// Returns `Ok(())` on success, or an error if packet processing fails.
1522///
1523/// # Behavior
1524///
1525/// 1. Examines each packet to determine if it's a GRO candidate (TCP or UDP)
1526/// 2. Attempts to coalesce the packet with previous packets in the same flow
1527/// 3. Writes indices of final packets (coalesced or standalone) to `to_write`
1528/// 4. Updates packet headers with appropriate virtio headers
1529///
1530/// # Example
1531///
1532/// ```no_run
1533/// # #[cfg(target_os = "linux")]
1534/// # {
1535/// use tun_rs::{handle_gro, GROTable, VIRTIO_NET_HDR_LEN};
1536///
1537/// let mut gro_table = GROTable::default();
1538/// let mut bufs = vec![vec![0u8; 1500]; 128];
1539/// let mut to_write = Vec::new();
1540///
1541/// // After receiving packets into bufs with recv_multiple:
1542/// // handle_gro(
1543/// // &mut bufs,
1544/// // VIRTIO_NET_HDR_LEN,
1545/// // &mut gro_table.tcp_table,
1546/// // &mut gro_table.udp_table,
1547/// // true, // UDP GRO supported
1548/// // &mut to_write
1549/// // )?;
1550///
1551/// // to_write now contains indices of packets to process
1552/// // for idx in &to_write {
1553/// // let packet = &bufs[*idx];
1554/// // // process packet...
1555/// // }
1556/// # }
1557/// # Ok::<(), std::io::Error>(())
1558/// ```
1559///
1560/// # Performance
1561///
1562/// - Coalescing reduces the number of packets passed to the application
1563/// - Typical coalescing ratios: 5-20 packets into 1 for bulk TCP transfers
1564/// - Most effective for sequential TCP traffic with large receive windows
1565///
1566/// # See Also
1567///
1568/// - [`GROTable`] for managing GRO state
1569/// - [`apply_tcp_coalesce_accounting`] for updating TCP headers after coalescing
1570pub fn handle_gro<B: ExpandBuffer>(
1571 bufs: &mut [B],
1572 offset: usize,
1573 tcp_table: &mut TcpGROTable,
1574 udp_table: &mut UdpGROTable,
1575 can_udp_gro: bool,
1576 to_write: &mut Vec<usize>,
1577) -> io::Result<()> {
1578 let bufs_len = bufs.len();
1579 for i in 0..bufs_len {
1580 if offset < VIRTIO_NET_HDR_LEN || offset > bufs[i].as_ref().len() - 1 {
1581 return Err(io::Error::new(
1582 io::ErrorKind::InvalidInput,
1583 "invalid offset",
1584 ));
1585 }
1586
1587 let result = match packet_is_gro_candidate(&bufs[i].as_ref()[offset..], can_udp_gro) {
1588 GroCandidateType::Tcp4GRO => tcp_gro(bufs, offset, i, tcp_table, false),
1589 GroCandidateType::Tcp6GRO => tcp_gro(bufs, offset, i, tcp_table, true),
1590 GroCandidateType::Udp4GRO => udp_gro(bufs, offset, i, udp_table, false),
1591 GroCandidateType::Udp6GRO => udp_gro(bufs, offset, i, udp_table, true),
1592 GroCandidateType::NotGRO => GroResult::Noop,
1593 };
1594
1595 match result {
1596 GroResult::Noop => {
1597 let hdr = VirtioNetHdr::default();
1598 hdr.encode(&mut bufs[i].as_mut()[offset - VIRTIO_NET_HDR_LEN..offset])?;
1599 // Fallthrough intended
1600 to_write.push(i);
1601 }
1602 GroResult::TableInsert => {
1603 to_write.push(i);
1604 }
1605 _ => {}
1606 }
1607 }
1608
1609 let err_tcp = apply_tcp_coalesce_accounting(bufs, offset, tcp_table);
1610 let err_udp = apply_udp_coalesce_accounting(bufs, offset, udp_table);
1611 err_tcp?;
1612 err_udp?;
1613 Ok(())
1614}
1615
1616/// Split a GSO (Generic Segmentation Offload) packet into multiple smaller packets.
1617///
1618/// When sending data with offload enabled, the application can provide large packets
1619/// that will be automatically segmented. This function performs the opposite operation:
1620/// splitting a large GSO packet into MTU-sized segments for transmission.
1621///
1622/// # Arguments
1623///
1624/// * `input` - The input buffer containing the large GSO packet (with virtio header).
1625/// * `hdr` - The virtio network header describing the GSO packet.
1626/// * `out_bufs` - Output buffers where segmented packets will be written.
1627/// * `sizes` - Output array where the size of each segmented packet will be written.
1628/// * `out_offset` - Offset in output buffers where packet data should start.
1629/// * `is_v6` - Whether this is an IPv6 packet (affects header offsets).
1630///
1631/// # Returns
1632///
1633/// Returns the number of output buffers populated (number of segments created),
1634/// or an error if segmentation fails.
1635///
1636/// # How GSO Splitting Works
1637///
1638/// For a large TCP packet with GSO enabled:
1639/// 1. The packet headers are parsed (IP + TCP)
1640/// 2. The payload is split into segments of size `hdr.gso_size`
1641/// 3. New packets are created with copied headers and updated fields:
1642/// - IP length field
1643/// - IP checksum (for IPv4)
1644/// - TCP sequence number (incremented for each segment)
1645/// - TCP checksum
1646///
1647/// # Example
1648///
1649/// ```no_run
1650/// # #[cfg(target_os = "linux")]
1651/// # {
1652/// use tun_rs::{gso_split, VirtioNetHdr, VIRTIO_NET_HDR_LEN};
1653///
1654/// let mut large_packet = vec![0u8; 65536];
1655/// let hdr = VirtioNetHdr::default();
1656/// let mut out_bufs = vec![vec![0u8; 1500]; 128];
1657/// let mut sizes = vec![0; 128];
1658///
1659/// // Split the GSO packet
1660/// // let num_segments = gso_split(
1661/// // &mut large_packet,
1662/// // hdr,
1663/// // &mut out_bufs,
1664/// // &mut sizes,
1665/// // VIRTIO_NET_HDR_LEN,
1666/// // false // IPv4
1667/// // )?;
1668///
1669/// // Now out_bufs[0..num_segments] contain the segmented packets
1670/// # }
1671/// # Ok::<(), std::io::Error>(())
1672/// ```
1673///
1674/// # Supported Protocols
1675///
1676/// - TCP over IPv4 (GSO type: [`VIRTIO_NET_HDR_GSO_TCPV4`])
1677/// - TCP over IPv6 (GSO type: [`VIRTIO_NET_HDR_GSO_TCPV6`])
1678/// - UDP (GSO type: [`VIRTIO_NET_HDR_GSO_UDP_L4`])
1679///
1680/// # Performance
1681///
1682/// GSO allows sending fewer, larger packets to the kernel, which then performs
1683/// efficient segmentation. This reduces:
1684/// - Number of system calls
1685/// - Per-packet processing overhead in the application
1686/// - Context switches
1687///
1688/// Typical performance improvement: 2-5x for bulk transfers.
pub fn gso_split<B: AsRef<[u8]> + AsMut<[u8]>>(
    input: &mut [u8],
    hdr: VirtioNetHdr,
    out_bufs: &mut [B],
    sizes: &mut [usize],
    out_offset: usize,
    is_v6: bool,
) -> io::Result<usize> {
    // For the supported GSO types csum_start marks where the transport header
    // begins, i.e. the IP header length.
    let iph_len = hdr.csum_start as usize;
    let (src_addr_offset, addr_len) = if is_v6 {
        (IPV6_SRC_ADDR_OFFSET, 16)
    } else {
        input[10] = 0;
        input[11] = 0; // clear IPv4 header checksum
        (IPV4_SRC_ADDR_OFFSET, 4)
    };

    let transport_csum_at = (hdr.csum_start + hdr.csum_offset) as usize;
    input[transport_csum_at] = 0;
    input[transport_csum_at + 1] = 0; // clear TCP/UDP checksum

    // TCP needs the starting sequence number so each segment can advance it;
    // UDP segments keep their headers apart from length/checksum.
    let (first_tcp_seq_num, protocol) =
        if hdr.gso_type == VIRTIO_NET_HDR_GSO_TCPV4 || hdr.gso_type == VIRTIO_NET_HDR_GSO_TCPV6 {
            (
                BigEndian::read_u32(&input[hdr.csum_start as usize + 4..]),
                IPPROTO_TCP,
            )
        } else {
            (0, IPPROTO_UDP)
        };

    let src_addr_bytes = &input[src_addr_offset..src_addr_offset + addr_len];
    let dst_addr_bytes = &input[src_addr_offset + addr_len..src_addr_offset + 2 * addr_len];
    let transport_header_len = (hdr.hdr_len - hdr.csum_start) as usize;

    // Every segment except possibly the last carries exactly gso_size payload
    // bytes, so its lengths and pseudo-header checksum can be precomputed once.
    let nonlast_segment_data_len = hdr.gso_size as usize;
    let nonlast_len_for_pseudo = (transport_header_len + nonlast_segment_data_len) as u16;
    let nonlast_total_len = hdr.hdr_len as usize + nonlast_segment_data_len;

    let nonlast_transport_csum_no_fold = pseudo_header_checksum_no_fold(
        protocol as u8,
        src_addr_bytes,
        dst_addr_bytes,
        nonlast_len_for_pseudo,
    );

    let mut next_segment_data_at = hdr.hdr_len as usize;
    let mut i = 0;

    while next_segment_data_at < input.len() {
        if i == out_bufs.len() {
            return Err(io::Error::other("ErrTooManySegments"));
        }

        let next_segment_end = next_segment_data_at + hdr.gso_size as usize;
        // The final segment may be shorter than gso_size; recompute its
        // lengths and pseudo-header checksum in that case.
        let (next_segment_end, segment_data_len, total_len, transport_csum_no_fold) =
            if next_segment_end > input.len() {
                let last_segment_data_len = input.len() - next_segment_data_at;
                let last_len_for_pseudo = (transport_header_len + last_segment_data_len) as u16;

                let last_total_len = hdr.hdr_len as usize + last_segment_data_len;
                let last_transport_csum_no_fold = pseudo_header_checksum_no_fold(
                    protocol as u8,
                    src_addr_bytes,
                    dst_addr_bytes,
                    last_len_for_pseudo,
                );
                (
                    input.len(),
                    last_segment_data_len,
                    last_total_len,
                    last_transport_csum_no_fold,
                )
            } else {
                (
                    next_segment_end,
                    hdr.gso_size as usize,
                    nonlast_total_len,
                    nonlast_transport_csum_no_fold,
                )
            };

        sizes[i] = total_len;
        let out = &mut out_bufs[i].as_mut()[out_offset..];

        // Copy the IP header, then patch the per-segment fields below.
        out[..iph_len].copy_from_slice(&input[..iph_len]);

        if !is_v6 {
            // For IPv4 we are responsible for incrementing the ID field,
            // updating the total len field, and recalculating the header
            // checksum.
            if i > 0 {
                let id = BigEndian::read_u16(&out[4..]).wrapping_add(i as u16);
                BigEndian::write_u16(&mut out[4..6], id);
            }
            BigEndian::write_u16(&mut out[2..4], total_len as u16);
            let ipv4_csum = !checksum(&out[..iph_len], 0);
            BigEndian::write_u16(&mut out[10..12], ipv4_csum);
        } else {
            // For IPv6 we are responsible for updating the payload length field.
            // IPv6 extensions are not checksummed, but included in the payload length.
            const IPV6_FIXED_HDR_LEN: usize = 40;
            let payload_len = total_len - IPV6_FIXED_HDR_LEN;
            BigEndian::write_u16(&mut out[4..6], payload_len as u16);
        }

        // Copy the transport (TCP/UDP) header, then patch per-segment fields.
        out[hdr.csum_start as usize..hdr.hdr_len as usize]
            .copy_from_slice(&input[hdr.csum_start as usize..hdr.hdr_len as usize]);

        if protocol == IPPROTO_TCP {
            // Advance the sequence number by the payload emitted so far.
            let tcp_seq = first_tcp_seq_num.wrapping_add(hdr.gso_size as u32 * i as u32);
            BigEndian::write_u32(
                &mut out[(hdr.csum_start + 4) as usize..(hdr.csum_start + 8) as usize],
                tcp_seq,
            );
            if next_segment_end != input.len() {
                // FIN and PSH belong only on the last segment.
                out[hdr.csum_start as usize + TCP_FLAGS_OFFSET] &= !(TCP_FLAG_FIN | TCP_FLAG_PSH);
            }
        } else {
            // UDP: rewrite the length field for this segment.
            let udp_len = (segment_data_len + (hdr.hdr_len - hdr.csum_start) as usize) as u16;
            BigEndian::write_u16(
                &mut out[(hdr.csum_start + 4) as usize..(hdr.csum_start + 6) as usize],
                udp_len,
            );
        }

        // Copy this segment's payload into place.
        out[hdr.hdr_len as usize..total_len]
            .as_mut()
            .copy_from_slice(&input[next_segment_data_at..next_segment_end]);

        // Transport checksum over header + payload, seeded with the
        // precomputed pseudo-header sum.
        let transport_csum = !checksum(
            &out[hdr.csum_start as usize..total_len],
            transport_csum_no_fold,
        );
        BigEndian::write_u16(
            &mut out[transport_csum_at..transport_csum_at + 2],
            transport_csum,
        );

        next_segment_data_at += hdr.gso_size as usize;
        i += 1;
    }

    Ok(i)
}
1834
1835/// Calculate checksum for packets without GSO.
1836///
1837/// This function computes and writes the transport layer (TCP/UDP) checksum for
1838/// packets that don't use Generic Segmentation Offload.
1839///
1840/// # Arguments
1841///
1842/// * `in_buf` - The packet buffer (mutable)
1843/// * `csum_start` - Offset where checksum calculation should begin
1844/// * `csum_offset` - Offset within the checksummed area where the checksum should be written
1845///
1846/// # Behavior
1847///
1848/// 1. Reads the initial checksum value (typically the pseudo-header checksum)
1849/// 2. Clears the checksum field
1850/// 3. Calculates the checksum over the transport header and data
1851/// 4. Writes the final checksum back to the buffer
1852///
1853/// This is used when [`VIRTIO_NET_HDR_F_NEEDS_CSUM`] flag is set but [`VIRTIO_NET_HDR_GSO_NONE`]
1854/// is the GSO type.
1855pub fn gso_none_checksum(in_buf: &mut [u8], csum_start: u16, csum_offset: u16) {
1856 let csum_at = (csum_start + csum_offset) as usize;
1857 // The initial value at the checksum offset should be summed with the
1858 // checksum we compute. This is typically the pseudo-header checksum.
1859 let initial = BigEndian::read_u16(&in_buf[csum_at..]);
1860 in_buf[csum_at] = 0;
1861 in_buf[csum_at + 1] = 0;
1862 let computed_checksum = checksum(&in_buf[csum_start as usize..], initial as u64);
1863 BigEndian::write_u16(&mut in_buf[csum_at..], !computed_checksum);
1864}
1865
1866/// Generic Receive Offload (GRO) table for managing packet coalescing.
1867///
1868/// This structure maintains the state needed to coalesce multiple received packets
1869/// into larger segments, reducing per-packet processing overhead. It combines both
1870/// TCP and UDP GRO capabilities.
1871///
1872/// # Purpose
1873///
1874/// When receiving many small packets of the same flow, GRO can combine them into
1875/// fewer, larger packets. This provides significant performance benefits:
1876///
1877/// - Reduces the number of packets passed to the application
1878/// - Fewer context switches and system calls
1879/// - Better cache utilization
1880/// - Lower CPU usage per gigabit of traffic
1881///
1882/// # Usage
1883///
1884/// Create a `GROTable` and reuse it across multiple `recv_multiple` calls:
1885///
1886/// ```no_run
1887/// # #[cfg(target_os = "linux")]
1888/// # {
1889/// use tun_rs::{DeviceBuilder, GROTable, IDEAL_BATCH_SIZE, VIRTIO_NET_HDR_LEN};
1890///
1891/// let dev = DeviceBuilder::new()
1892/// .offload(true)
1893/// .ipv4("10.0.0.1", 24, None)
1894/// .build_sync()?;
1895///
1896/// let mut gro_table = GROTable::default();
1897/// let mut original_buffer = vec![0; VIRTIO_NET_HDR_LEN + 65535];
1898/// let mut bufs = vec![vec![0u8; 1500]; IDEAL_BATCH_SIZE];
1899/// let mut sizes = vec![0; IDEAL_BATCH_SIZE];
1900///
1901/// loop {
1902/// let num = dev.recv_multiple(&mut original_buffer, &mut bufs, &mut sizes, 0)?;
1903///
1904/// // GRO table is automatically used by recv_multiple
1905/// // to coalesce packets
1906/// for i in 0..num {
1907/// println!("Packet: {} bytes", sizes[i]);
1908/// }
1909/// }
1910/// # }
1911/// # Ok::<(), std::io::Error>(())
1912/// ```
1913///
1914/// # Fields
1915///
1916/// - `tcp_gro_table`: State for TCP packet coalescing
1917/// - `udp_gro_table`: State for UDP packet coalescing (if supported by kernel)
1918/// - `to_write`: Internal buffer tracking which packets to emit
1919///
1920/// # Performance
1921///
1922/// The GRO table maintains internal state across calls, including:
1923/// - Hash map of active flows (preallocated for [`IDEAL_BATCH_SIZE`] flows)
1924/// - Memory pools to reduce allocations
1925/// - Per-flow coalescing state
1926///
1927/// Typical coalescing ratios:
1928/// - TCP bulk transfers: 5-20 packets coalesced into 1
1929/// - UDP: 2-5 packets coalesced into 1
1930/// - Interactive traffic: minimal coalescing (preserves latency)
1931///
1932/// # Thread Safety
1933///
1934/// `GROTable` is not thread-safe. Use one instance per thread or protect with a mutex.
1935#[derive(Default)]
1936pub struct GROTable {
1937 pub(crate) to_write: Vec<usize>,
1938 pub(crate) tcp_gro_table: TcpGROTable,
1939 pub(crate) udp_gro_table: UdpGROTable,
1940}
1941
1942impl GROTable {
1943 pub fn new() -> GROTable {
1944 GROTable {
1945 to_write: Vec::with_capacity(IDEAL_BATCH_SIZE),
1946 tcp_gro_table: TcpGROTable::new(),
1947 udp_gro_table: UdpGROTable::new(),
1948 }
1949 }
1950 pub(crate) fn reset(&mut self) {
1951 self.to_write.clear();
1952 self.tcp_gro_table.reset();
1953 self.udp_gro_table.reset();
1954 }
1955}
1956
/// A trait for buffers that can be expanded and resized for offload operations.
///
/// This trait extends basic buffer operations (`AsRef<[u8]>` and `AsMut<[u8]>`)
/// with methods needed for efficient packet processing with GRO/GSO offload support.
/// It allows buffers to grow dynamically as needed during packet coalescing and
/// segmentation operations.
///
/// # Required Methods
///
/// - `buf_capacity()` - Returns the current capacity of the buffer
/// - `buf_resize()` - Resizes the buffer to a new length, filling with a value
/// - `buf_extend_from_slice()` - Extends the buffer with data from a slice
///
/// The three methods mirror `Vec::capacity`, `Vec::resize`, and
/// `Vec::extend_from_slice` respectively, so implementing the trait for a
/// `Vec`-like buffer type is a direct delegation.
///
/// # Implementations
///
/// This trait is implemented for:
/// - `BytesMut` - The primary buffer type for async operations
/// - `&mut BytesMut` - Mutable reference to BytesMut
/// - `Vec<u8>` - Standard Rust vector
/// - `&mut Vec<u8>` - Mutable reference to Vec
///
/// # Example
///
/// ```no_run
/// # #[cfg(target_os = "linux")]
/// # {
/// use bytes::BytesMut;
/// use tun_rs::ExpandBuffer;
///
/// let mut buffer = BytesMut::with_capacity(1500);
/// buffer.buf_resize(20, 0); // Resize to 20 bytes, filled with zeros
/// buffer.buf_extend_from_slice(b"packet data"); // Append data
/// assert!(buffer.buf_capacity() >= buffer.len());
/// # }
/// ```
pub trait ExpandBuffer: AsRef<[u8]> + AsMut<[u8]> {
    /// Returns the current capacity of the buffer in bytes.
    ///
    /// The capacity is the total amount of memory allocated, which may be
    /// greater than the current length of the buffer.
    fn buf_capacity(&self) -> usize;

    /// Resizes the buffer to the specified length, filling new space with the given value.
    ///
    /// If `new_len` is greater than the current length, the buffer is extended
    /// and new bytes are initialized to `value`. If `new_len` is less than the
    /// current length, the buffer is truncated.
    ///
    /// # Arguments
    ///
    /// * `new_len` - The new length of the buffer
    /// * `value` - The byte value to fill any new space with
    fn buf_resize(&mut self, new_len: usize, value: u8);

    /// Extends the buffer by appending data from a slice.
    ///
    /// This method appends all bytes from `src` to the end of the buffer,
    /// growing the buffer as necessary.
    ///
    /// # Arguments
    ///
    /// * `src` - The slice of bytes to append to the buffer
    fn buf_extend_from_slice(&mut self, src: &[u8]);
}
2021
2022impl ExpandBuffer for BytesMut {
2023 fn buf_capacity(&self) -> usize {
2024 self.capacity()
2025 }
2026
2027 fn buf_resize(&mut self, new_len: usize, value: u8) {
2028 self.resize(new_len, value)
2029 }
2030
2031 fn buf_extend_from_slice(&mut self, extend: &[u8]) {
2032 self.extend_from_slice(extend)
2033 }
2034}
2035
2036impl ExpandBuffer for &mut BytesMut {
2037 fn buf_capacity(&self) -> usize {
2038 self.capacity()
2039 }
2040 fn buf_resize(&mut self, new_len: usize, value: u8) {
2041 self.resize(new_len, value)
2042 }
2043
2044 fn buf_extend_from_slice(&mut self, extend: &[u8]) {
2045 self.extend_from_slice(extend)
2046 }
2047}
2048impl ExpandBuffer for Vec<u8> {
2049 fn buf_capacity(&self) -> usize {
2050 self.capacity()
2051 }
2052
2053 fn buf_resize(&mut self, new_len: usize, value: u8) {
2054 self.resize(new_len, value)
2055 }
2056
2057 fn buf_extend_from_slice(&mut self, extend: &[u8]) {
2058 self.extend_from_slice(extend)
2059 }
2060}
2061impl ExpandBuffer for &mut Vec<u8> {
2062 fn buf_capacity(&self) -> usize {
2063 self.capacity()
2064 }
2065
2066 fn buf_resize(&mut self, new_len: usize, value: u8) {
2067 self.resize(new_len, value)
2068 }
2069
2070 fn buf_extend_from_slice(&mut self, extend: &[u8]) {
2071 self.extend_from_slice(extend)
2072 }
2073}