tun_rs/platform/linux/offload.rs
1/*!
2# Linux Offload Support Module
3
4This module provides Generic Receive Offload (GRO) and Generic Segmentation Offload (GSO)
5support for Linux TUN devices, significantly improving throughput for TCP and UDP traffic.
6
7## Overview
8
9Modern network cards and drivers use offload techniques to reduce CPU overhead:
10
11- **GSO (Generic Segmentation Offload)**: Allows sending large packets that are segmented by
12 the kernel/driver, reducing per-packet processing overhead.
13
14- **GRO (Generic Receive Offload)**: Coalesces multiple received packets into larger segments,
15 reducing the number of packets passed to the application.
16
17This module implements GRO/GSO for TUN devices using the `virtio_net` header format, compatible
18with the Linux kernel's TUN/TAP driver offload capabilities.
19
20## Performance Benefits
21
22Enabling offload can provide:
23- 2-10x improvement in throughput for TCP traffic
24- Reduced CPU usage per gigabit of traffic
25- Better handling of high-bandwidth applications
26
27The actual improvement depends on:
28- Packet sizes
29- TCP window sizes
30- Network round-trip time
31- CPU capabilities
32
33## Usage
34
35Enable offload when building a device:
36
37```no_run
38# #[cfg(target_os = "linux")]
39# {
40use tun_rs::{DeviceBuilder, GROTable, IDEAL_BATCH_SIZE, VIRTIO_NET_HDR_LEN};
41
42let dev = DeviceBuilder::new()
43 .offload(true) // Enable offload
44 .ipv4("10.0.0.1", 24, None)
45 .build_sync()?;
46
47// Allocate buffers for batch operations
48let mut original_buffer = vec![0; VIRTIO_NET_HDR_LEN + 65535];
49let mut bufs = vec![vec![0u8; 1500]; IDEAL_BATCH_SIZE];
50let mut sizes = vec![0; IDEAL_BATCH_SIZE];
51
52// Create GRO table for coalescing
53let mut gro_table = GROTable::default();
54
55loop {
56 // Receive multiple packets at once
57 let num = dev.recv_multiple(&mut original_buffer, &mut bufs, &mut sizes, 0)?;
58
59 for i in 0..num {
60 // Process each packet
61 println!("Packet {}: {} bytes", i, sizes[i]);
62 }
63}
64# }
65# Ok::<(), std::io::Error>(())
66```
67
68## Key Types
69
70- [`VirtioNetHdr`]: Header structure for virtio network offload
71- [`GROTable`]: Manages TCP and UDP flow coalescing for GRO
72- [`TcpGROTable`]: TCP-specific GRO state
73- [`UdpGROTable`]: UDP-specific GRO state
74
75## Key Functions
76
77- [`handle_gro`]: Process received packets and perform GRO coalescing
78- [`gso_split`]: Split a GSO packet into multiple segments
79- [`apply_tcp_coalesce_accounting`]: Update TCP headers after coalescing
80
81## Constants
82
- [`VIRTIO_NET_HDR_LEN`]: Size of the virtio network header (10 bytes)
84- [`IDEAL_BATCH_SIZE`]: Recommended batch size for packet operations (128)
85- [`VIRTIO_NET_HDR_GSO_NONE`], [`VIRTIO_NET_HDR_GSO_TCPV4`], etc.: GSO type constants
86
87## References
88
89- [Linux virtio_net.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_net.h)
90- [WireGuard-go offload implementation](https://github.com/WireGuard/wireguard-go/blob/master/tun/offload_linux.go)
91
92## Platform Requirements
93
94- Linux kernel with TUN/TAP driver
95- Kernel support for IFF_VNET_HDR (available since Linux 2.6.32)
96- Root privileges to create TUN devices with offload enabled
97*/
98
99/// https://github.com/WireGuard/wireguard-go/blob/master/tun/offload_linux.go
100use crate::platform::linux::checksum::{checksum, pseudo_header_checksum_no_fold};
101use byteorder::{BigEndian, ByteOrder};
102use bytes::BytesMut;
103use libc::{IPPROTO_TCP, IPPROTO_UDP};
104use std::collections::HashMap;
105use std::io;
106
/// GSO type: Not a GSO frame (normal packet).
///
/// This indicates a regular packet without Generic Segmentation Offload applied.
/// See: <https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_net.h>
pub const VIRTIO_NET_HDR_GSO_NONE: u8 = 0;

/// Flag: Use csum_start and csum_offset fields for checksum calculation.
///
/// When this flag is set, the packet requires checksum calculation.
/// The `csum_start` field indicates where checksumming should begin,
/// and `csum_offset` indicates where to write the checksum.
pub const VIRTIO_NET_HDR_F_NEEDS_CSUM: u8 = 1;

/// GSO type: IPv4 TCP segmentation (TSO - TCP Segmentation Offload).
///
/// Large TCP packets can be sent and will be segmented by the kernel/driver.
pub const VIRTIO_NET_HDR_GSO_TCPV4: u8 = 1;

/// GSO type: IPv6 TCP segmentation (TSO).
///
/// Similar to TCPV4 but for IPv6 packets.
pub const VIRTIO_NET_HDR_GSO_TCPV6: u8 = 4;

/// GSO type: UDP segmentation for IPv4 and IPv6 (USO - UDP Segmentation Offload).
///
/// Available in newer Linux kernels for UDP packet segmentation.
pub const VIRTIO_NET_HDR_GSO_UDP_L4: u8 = 5;

/// Recommended batch size for packet operations with offload.
///
/// This constant defines the optimal number of packets to handle per `recv_multiple`
/// or `send_multiple` call. It balances between:
/// - Amortizing system call overhead
/// - Keeping latency reasonable
/// - Memory usage for packet buffers
///
/// Based on WireGuard-go's implementation.
///
/// # Example
///
/// ```no_run
/// # #[cfg(target_os = "linux")]
/// # {
/// use tun_rs::IDEAL_BATCH_SIZE;
///
/// // Allocate buffers for batch operations
/// let mut bufs = vec![vec![0u8; 1500]; IDEAL_BATCH_SIZE];
/// let mut sizes = vec![0; IDEAL_BATCH_SIZE];
/// # }
/// ```
///
/// See: <https://github.com/WireGuard/wireguard-go/blob/master/conn/conn.go#L19>
pub const IDEAL_BATCH_SIZE: usize = 128;

/// Byte offset of the flags byte within a TCP header (data offset byte is 12,
/// flag bits live in byte 13).
const TCP_FLAGS_OFFSET: usize = 13;

/// TCP FIN flag bit.
const TCP_FLAG_FIN: u8 = 0x01;
/// TCP PSH flag bit (marks the final segment of a reassembled GRO group).
const TCP_FLAG_PSH: u8 = 0x08;
/// TCP ACK flag bit.
const TCP_FLAG_ACK: u8 = 0x10;
166
167/// Virtio network header for offload support.
168///
169/// This structure precedes each packet when offload is enabled on a Linux TUN device.
170/// It provides metadata about Generic Segmentation Offload (GSO) and checksum requirements,
171/// allowing the kernel to perform hardware-accelerated operations.
172///
173/// The header matches the Linux kernel's `virtio_net_hdr` structure defined in
174/// `include/uapi/linux/virtio_net.h`.
175///
176/// # Memory Layout
177///
/// The structure is `#[repr(C)]` and has a fixed size of 10 bytes ([`VIRTIO_NET_HDR_LEN`]).
179/// All multi-byte fields are in native endianness.
180///
181/// # Usage
182///
183/// When reading from a TUN device with offload enabled:
184/// ```no_run
185/// # #[cfg(target_os = "linux")]
186/// # {
187/// use tun_rs::{VirtioNetHdr, VIRTIO_NET_HDR_LEN};
188///
189/// let mut buf = vec![0u8; VIRTIO_NET_HDR_LEN + 1500];
190/// // let n = dev.recv(&mut buf)?;
191///
192/// // Decode the header
193/// // let hdr = VirtioNetHdr::decode(&buf[..VIRTIO_NET_HDR_LEN])?;
194/// // let packet = &buf[VIRTIO_NET_HDR_LEN..n];
195/// # }
196/// ```
197///
198/// # Fields
199///
200/// - `flags`: Bit flags for header processing (e.g., [`VIRTIO_NET_HDR_F_NEEDS_CSUM`])
201/// - `gso_type`: Type of GSO applied (e.g., [`VIRTIO_NET_HDR_GSO_TCPV4`])
202/// - `hdr_len`: Length of packet headers (Ethernet + IP + TCP/UDP)
203/// - `gso_size`: Maximum segment size for GSO
204/// - `csum_start`: Offset to start checksum calculation
205/// - `csum_offset`: Offset within checksum area to store the checksum
206///
207/// # References
208///
209/// - [Linux virtio_net.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_net.h)
210///
211/// See: <https://github.com/torvalds/linux/blob/master/include/uapi/linux/virtio_net.h>
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct VirtioNetHdr {
    // #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */
    // #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */
    // #define VIRTIO_NET_HDR_F_RSC_INFO 4 /* rsc info in csum_ fields */
    pub flags: u8,
    // #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */
    // #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */
    // #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */
    // #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */
    // #define VIRTIO_NET_HDR_GSO_UDP_L4 5 /* GSO frame, IPv4& IPv6 UDP (USO) */
    // #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */
    pub gso_type: u8,
    // Ethernet + IP + tcp/udp hdrs
    pub hdr_len: u16,
    // Bytes to append to hdr_len per frame
    pub gso_size: u16,
    // Checksum calculation
    pub csum_start: u16,
    pub csum_offset: u16,
}

impl VirtioNetHdr {
    /// Decode a virtio network header from a byte buffer.
    ///
    /// Reads the first [`VIRTIO_NET_HDR_LEN`] bytes from the buffer and interprets
    /// them as a `VirtioNetHdr` structure. Multi-byte fields are read in native
    /// endianness, matching the kernel's in-memory `virtio_net_hdr` layout.
    ///
    /// # Errors
    ///
    /// Returns an error if the buffer is too short (less than [`VIRTIO_NET_HDR_LEN`] bytes).
    ///
    /// # Example
    ///
    /// ```no_run
    /// # #[cfg(target_os = "linux")]
    /// # {
    /// use tun_rs::{VirtioNetHdr, VIRTIO_NET_HDR_LEN};
    ///
    /// let buffer = vec![0u8; VIRTIO_NET_HDR_LEN + 1500];
    /// let header = VirtioNetHdr::decode(&buffer)?;
    /// println!("GSO type: {:?}", header.gso_type);
    /// # }
    /// # Ok::<(), std::io::Error>(())
    /// ```
    pub fn decode(buf: &[u8]) -> io::Result<VirtioNetHdr> {
        if buf.len() < VIRTIO_NET_HDR_LEN {
            return Err(io::Error::new(io::ErrorKind::InvalidInput, "too short"));
        }
        // Safe field-by-field read. Offsets follow the `#[repr(C)]` layout,
        // which has no interior padding (1 + 1 + 2 + 2 + 2 + 2 = 10 bytes),
        // so this is byte-for-byte equivalent to a raw memcpy of the struct.
        Ok(VirtioNetHdr {
            flags: buf[0],
            gso_type: buf[1],
            hdr_len: u16::from_ne_bytes([buf[2], buf[3]]),
            gso_size: u16::from_ne_bytes([buf[4], buf[5]]),
            csum_start: u16::from_ne_bytes([buf[6], buf[7]]),
            csum_offset: u16::from_ne_bytes([buf[8], buf[9]]),
        })
    }

    /// Encode a virtio network header into a byte buffer.
    ///
    /// Writes this header into the first [`VIRTIO_NET_HDR_LEN`] bytes of the buffer,
    /// using native endianness for the multi-byte fields (the inverse of [`Self::decode`]).
    ///
    /// # Errors
    ///
    /// Returns an error if the buffer is too short (less than [`VIRTIO_NET_HDR_LEN`] bytes).
    ///
    /// # Example
    ///
    /// ```no_run
    /// # #[cfg(target_os = "linux")]
    /// # {
    /// use tun_rs::{VirtioNetHdr, VIRTIO_NET_HDR_GSO_NONE, VIRTIO_NET_HDR_LEN};
    ///
    /// let header = VirtioNetHdr {
    ///     gso_type: VIRTIO_NET_HDR_GSO_NONE,
    ///     ..Default::default()
    /// };
    ///
    /// let mut buffer = vec![0u8; VIRTIO_NET_HDR_LEN + 1500];
    /// header.encode(&mut buffer)?;
    /// # }
    /// # Ok::<(), std::io::Error>(())
    /// ```
    pub fn encode(&self, buf: &mut [u8]) -> io::Result<()> {
        if buf.len() < VIRTIO_NET_HDR_LEN {
            return Err(io::Error::new(io::ErrorKind::InvalidInput, "too short"));
        }
        // Mirror image of `decode`: same offsets, native endianness.
        buf[0] = self.flags;
        buf[1] = self.gso_type;
        buf[2..4].copy_from_slice(&self.hdr_len.to_ne_bytes());
        buf[4..6].copy_from_slice(&self.gso_size.to_ne_bytes());
        buf[6..8].copy_from_slice(&self.csum_start.to_ne_bytes());
        buf[8..10].copy_from_slice(&self.csum_offset.to_ne_bytes());
        Ok(())
    }
}

/// Size of the virtio network header in bytes (10 bytes).
///
/// This constant represents the fixed size of the `VirtioNetHdr` structure.
/// When offload is enabled on a TUN device, this header precedes every packet.
///
/// # Example
///
/// ```no_run
/// # #[cfg(target_os = "linux")]
/// # {
/// use tun_rs::VIRTIO_NET_HDR_LEN;
///
/// // Allocate buffer with space for header + packet
/// let mut buffer = vec![0u8; VIRTIO_NET_HDR_LEN + 1500];
///
/// // Header is at the start
/// // let header_bytes = &buffer[..VIRTIO_NET_HDR_LEN];
/// // Packet data follows the header
/// // let packet_data = &buffer[VIRTIO_NET_HDR_LEN..];
/// # }
/// ```
pub const VIRTIO_NET_HDR_LEN: usize = std::mem::size_of::<VirtioNetHdr>();
334
/// Identifier for a TCP flow used in Generic Receive Offload (GRO).
///
/// This structure uniquely identifies a TCP connection for packet coalescing.
/// Packets belonging to the same flow can be coalesced into larger segments,
/// reducing per-packet processing overhead.
///
/// # Fields
///
/// The flow is identified by:
/// - Source and destination IP addresses (IPv4 or IPv6)
/// - Source and destination ports
/// - TCP acknowledgment number (to avoid coalescing segments with different ACKs)
/// - IP version flag
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct TcpFlowKey {
    src_addr: [u8; 16], // source IP; an IPv4 address occupies the first 4 bytes, rest zero
    dst_addr: [u8; 16], // destination IP, same layout as `src_addr`
    src_port: u16,      // TCP source port, parsed big-endian from the header
    dst_port: u16,      // TCP destination port, parsed big-endian from the header
    rx_ack: u32, // varying ack values should not be coalesced. Treat them as separate flows.
    is_v6: bool, // true when the addresses are 16-byte IPv6 addresses
}
357
/// TCP Generic Receive Offload (GRO) table.
///
/// Manages the coalescing of TCP packets belonging to the same flow into larger segments.
/// This reduces the number of packets that need to be processed by the application,
/// improving throughput and reducing CPU usage.
///
/// # How TCP GRO Works
///
/// 1. Packets are received from the TUN device
/// 2. The GRO table identifies packets belonging to the same TCP flow
/// 3. Consecutive packets in the same flow are coalesced into a single large segment
/// 4. The coalesced segment is passed to the application
///
/// # Usage
///
/// The GRO table is typically used in conjunction with [`handle_gro`]:
///
/// ```no_run
/// # #[cfg(target_os = "linux")]
/// # {
/// use tun_rs::GROTable;
///
/// let mut gro_table = GROTable::default();
///
/// // Process received packets
/// // handle_gro(..., &mut gro_table, ...)?;
/// # }
/// ```
///
/// # Performance Considerations
///
/// - Maintains a hash map of active flows
/// - Preallocates buffers for [`IDEAL_BATCH_SIZE`] flows
/// - Memory pooling reduces allocations
/// - State is maintained across multiple recv_multiple calls
pub struct TcpGROTable {
    // Active flows and the GRO items tracked for each during one evaluation.
    items_by_flow: HashMap<TcpFlowKey, Vec<TcpGROItem>>,
    // Recycled item vectors; `reset()` returns drained vectors here for reuse.
    items_pool: Vec<Vec<TcpGROItem>>,
}
397
398impl Default for TcpGROTable {
399 fn default() -> Self {
400 Self::new()
401 }
402}
403
404impl TcpGROTable {
405 fn new() -> Self {
406 let mut items_pool = Vec::with_capacity(IDEAL_BATCH_SIZE);
407 for _ in 0..IDEAL_BATCH_SIZE {
408 items_pool.push(Vec::with_capacity(IDEAL_BATCH_SIZE));
409 }
410 TcpGROTable {
411 items_by_flow: HashMap::with_capacity(IDEAL_BATCH_SIZE),
412 items_pool,
413 }
414 }
415}
416
417impl TcpFlowKey {
418 fn new(pkt: &[u8], src_addr_offset: usize, dst_addr_offset: usize, tcph_offset: usize) -> Self {
419 let mut key = TcpFlowKey {
420 src_addr: [0; 16],
421 dst_addr: [0; 16],
422 src_port: 0,
423 dst_port: 0,
424 rx_ack: 0,
425 is_v6: false,
426 };
427
428 let addr_size = dst_addr_offset - src_addr_offset;
429 key.src_addr[..addr_size].copy_from_slice(&pkt[src_addr_offset..dst_addr_offset]);
430 key.dst_addr[..addr_size]
431 .copy_from_slice(&pkt[dst_addr_offset..dst_addr_offset + addr_size]);
432 key.src_port = BigEndian::read_u16(&pkt[tcph_offset..]);
433 key.dst_port = BigEndian::read_u16(&pkt[tcph_offset + 2..]);
434 key.rx_ack = BigEndian::read_u32(&pkt[tcph_offset + 8..]);
435 key.is_v6 = addr_size == 16;
436 key
437 }
438}
439
impl TcpGROTable {
    /// lookupOrInsert looks up a flow for the provided packet and metadata,
    /// returning the packets found for the flow, or inserting a new one if none
    /// is found.
    ///
    /// Returns `Some(items)` when the flow already existed; returns `None`
    /// after inserting a fresh entry for a previously unseen flow (callers use
    /// `None` as the "newly inserted, nothing to coalesce against" signal).
    fn lookup_or_insert(
        &mut self,
        pkt: &[u8],
        src_addr_offset: usize,
        dst_addr_offset: usize,
        tcph_offset: usize,
        tcph_len: usize,
        bufs_index: usize,
    ) -> Option<&mut Vec<TcpGROItem>> {
        let key = TcpFlowKey::new(pkt, src_addr_offset, dst_addr_offset, tcph_offset);
        // contains_key + get_mut performs a double lookup; this shape keeps the
        // borrow checker happy about returning a `&mut` out of the map.
        if self.items_by_flow.contains_key(&key) {
            return self.items_by_flow.get_mut(&key);
        }
        // Insert the new item into the table
        self.insert(
            pkt,
            src_addr_offset,
            dst_addr_offset,
            tcph_offset,
            tcph_len,
            bufs_index,
        );
        None
    }
    /// insert an item in the table for the provided packet and packet metadata.
    fn insert(
        &mut self,
        pkt: &[u8],
        src_addr_offset: usize,
        dst_addr_offset: usize,
        tcph_offset: usize,
        tcph_len: usize,
        bufs_index: usize,
    ) {
        let key = TcpFlowKey::new(pkt, src_addr_offset, dst_addr_offset, tcph_offset);
        let item = TcpGROItem {
            key,
            bufs_index: bufs_index as u16,
            num_merged: 0,
            // Payload length = everything after the IP + TCP headers.
            gso_size: pkt[tcph_offset + tcph_len..].len() as u16,
            // `pkt` starts at the IP header, so the TCP header offset doubles
            // as the IP header length.
            iph_len: tcph_offset as u8,
            tcph_len: tcph_len as u8,
            sent_seq: BigEndian::read_u32(&pkt[tcph_offset + 4..tcph_offset + 8]),
            psh_set: pkt[tcph_offset + TCP_FLAGS_OFFSET] & TCP_FLAG_PSH != 0,
        };

        // Reuse a pooled vector when one is available instead of allocating.
        let items = self
            .items_by_flow
            .entry(key)
            .or_insert_with(|| self.items_pool.pop().unwrap_or_default());
        items.push(item);
    }
}
497// func (t *tcpGROTable) updateAt(item tcpGROItem, i int) {
498// items, _ := t.itemsByFlow[item.key]
499// items[i] = item
500// }
501//
502// func (t *tcpGROTable) deleteAt(key tcpFlowKey, i int) {
503// items, _ := t.itemsByFlow[key]
504// items = append(items[:i], items[i+1:]...)
505// t.itemsByFlow[key] = items
506// }
507
/// tcpGROItem represents bookkeeping data for a TCP packet during the lifetime
/// of a GRO evaluation across a vector of packets.
#[derive(Debug, Clone, Copy)]
pub struct TcpGROItem {
    key: TcpFlowKey, // the flow this item belongs to
    sent_seq: u32, // the sequence number
    bufs_index: u16, // the index into the original bufs slice
    num_merged: u16, // the number of packets merged into this item
    gso_size: u16, // payload size
    iph_len: u8, // ip header len
    tcph_len: u8, // tcp header len
    psh_set: bool, // psh flag is set
}
521
522// func (t *tcpGROTable) newItems() []tcpGROItem {
523// var items []tcpGROItem
524// items, t.itemsPool = t.itemsPool[len(t.itemsPool)-1], t.itemsPool[:len(t.itemsPool)-1]
525// return items
526// }
527impl TcpGROTable {
528 fn reset(&mut self) {
529 for (_key, mut items) in self.items_by_flow.drain() {
530 items.clear();
531 self.items_pool.push(items);
532 }
533 }
534}
535
/// udpFlowKey represents the key for a UDP flow.
///
/// Unlike [`TcpFlowKey`] there is no sequence/ack component: UDP datagrams of
/// a flow are keyed purely by address pair, port pair, and IP version.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub struct UdpFlowKey {
    src_addr: [u8; 16], // srcAddr; IPv4 uses the first 4 bytes, rest zero
    dst_addr: [u8; 16], // dstAddr, same layout as `src_addr`
    src_port: u16, // srcPort
    dst_port: u16, // dstPort
    is_v6: bool, // isV6
}
545
/// udpGROTable holds flow and coalescing information for the purposes of UDP GRO.
pub struct UdpGROTable {
    // Active flows and the GRO items tracked for each during one evaluation.
    items_by_flow: HashMap<UdpFlowKey, Vec<UdpGROItem>>,
    // Recycled item vectors; `reset()` returns drained vectors here for reuse.
    items_pool: Vec<Vec<UdpGROItem>>,
}
551
552impl Default for UdpGROTable {
553 fn default() -> Self {
554 UdpGROTable::new()
555 }
556}
557
558impl UdpGROTable {
559 pub fn new() -> Self {
560 let mut items_pool = Vec::with_capacity(IDEAL_BATCH_SIZE);
561 for _ in 0..IDEAL_BATCH_SIZE {
562 items_pool.push(Vec::with_capacity(IDEAL_BATCH_SIZE));
563 }
564 UdpGROTable {
565 items_by_flow: HashMap::with_capacity(IDEAL_BATCH_SIZE),
566 items_pool,
567 }
568 }
569}
570
571impl UdpFlowKey {
572 pub fn new(
573 pkt: &[u8],
574 src_addr_offset: usize,
575 dst_addr_offset: usize,
576 udph_offset: usize,
577 ) -> UdpFlowKey {
578 let mut key = UdpFlowKey {
579 src_addr: [0; 16],
580 dst_addr: [0; 16],
581 src_port: 0,
582 dst_port: 0,
583 is_v6: false,
584 };
585 let addr_size = dst_addr_offset - src_addr_offset;
586 key.src_addr[..addr_size].copy_from_slice(&pkt[src_addr_offset..dst_addr_offset]);
587 key.dst_addr[..addr_size]
588 .copy_from_slice(&pkt[dst_addr_offset..dst_addr_offset + addr_size]);
589 key.src_port = BigEndian::read_u16(&pkt[udph_offset..]);
590 key.dst_port = BigEndian::read_u16(&pkt[udph_offset + 2..]);
591 key.is_v6 = addr_size == 16;
592 key
593 }
594}
595
impl UdpGROTable {
    /// Looks up a flow for the provided packet and metadata.
    /// Returns a reference to the packets found for the flow and a boolean indicating if the flow already existed.
    /// If the flow is not found, inserts a new flow and returns `None` for the items.
    fn lookup_or_insert(
        &mut self,
        pkt: &[u8],
        src_addr_offset: usize,
        dst_addr_offset: usize,
        udph_offset: usize,
        bufs_index: usize,
    ) -> Option<&mut Vec<UdpGROItem>> {
        let key = UdpFlowKey::new(pkt, src_addr_offset, dst_addr_offset, udph_offset);
        // contains_key + get_mut performs a double lookup; this shape keeps the
        // borrow checker happy about returning a `&mut` out of the map.
        if self.items_by_flow.contains_key(&key) {
            self.items_by_flow.get_mut(&key)
        } else {
            // If the flow does not exist, insert a new entry.
            // New entries start with the checksum state unknown (not invalid).
            self.insert(
                pkt,
                src_addr_offset,
                dst_addr_offset,
                udph_offset,
                bufs_index,
                false,
            );
            None
        }
    }
    /// Inserts an item in the table for the provided packet and its metadata.
    fn insert(
        &mut self,
        pkt: &[u8],
        src_addr_offset: usize,
        dst_addr_offset: usize,
        udph_offset: usize,
        bufs_index: usize,
        c_sum_known_invalid: bool,
    ) {
        let key = UdpFlowKey::new(pkt, src_addr_offset, dst_addr_offset, udph_offset);
        let item = UdpGROItem {
            key,
            bufs_index: bufs_index as u16,
            num_merged: 0,
            // Payload length = everything after the IP + fixed 8-byte UDP header.
            gso_size: (pkt.len() - (udph_offset + UDP_H_LEN)) as u16,
            // `pkt` starts at the IP header, so the UDP header offset doubles
            // as the IP header length.
            iph_len: udph_offset as u8,
            c_sum_known_invalid,
        };
        // Reuse a pooled vector when one is available instead of allocating.
        let items = self
            .items_by_flow
            .entry(key)
            .or_insert_with(|| self.items_pool.pop().unwrap_or_default());
        items.push(item);
    }
}
650// func (u *udpGROTable) updateAt(item udpGROItem, i int) {
651// items, _ := u.itemsByFlow[item.key]
652// items[i] = item
653// }
654
/// udpGROItem represents bookkeeping data for a UDP packet during the lifetime
/// of a GRO evaluation across a vector of packets.
#[derive(Debug, Clone, Copy)]
pub struct UdpGROItem {
    key: UdpFlowKey, // udpFlowKey
    bufs_index: u16, // the index into the original bufs slice
    num_merged: u16, // the number of packets merged into this item
    gso_size: u16, // payload size
    iph_len: u8, // ip header len
    c_sum_known_invalid: bool, // UDP header checksum validity; a false value DOES NOT imply valid, just unknown.
}
666// func (u *udpGROTable) newItems() []udpGROItem {
667// var items []udpGROItem
668// items, u.itemsPool = u.itemsPool[len(u.itemsPool)-1], u.itemsPool[:len(u.itemsPool)-1]
669// return items
670// }
671
672impl UdpGROTable {
673 fn reset(&mut self) {
674 for (_key, mut items) in self.items_by_flow.drain() {
675 items.clear();
676 self.items_pool.push(items);
677 }
678 }
679}
680
/// canCoalesce represents the outcome of checking if two TCP packets are
/// candidates for coalescing.
#[derive(Copy, Clone, Eq, PartialEq)]
enum CanCoalesce {
    /// The candidate packet directly precedes the tracked item
    /// (sequence-wise) and should be glued onto its front.
    Prepend,
    /// The packets cannot be merged.
    Unavailable,
    /// The candidate packet directly follows the tracked item and should be
    /// glued onto its end.
    Append,
}
689
/// ipHeadersCanCoalesce returns true if the IP headers found in pktA and pktB
/// meet all requirements to be merged as part of a GRO operation, otherwise it
/// returns false.
///
/// Both slices must start at the IP header. For IPv6 the version/traffic-class
/// bytes and the hop limit must match; for IPv4 the ToS byte, the DF/reserved
/// flag bits, and the TTL must match.
fn ip_headers_can_coalesce(pkt_a: &[u8], pkt_b: &[u8]) -> bool {
    // Every field inspected below lives within the first 9 bytes.
    if pkt_a.len() < 9 || pkt_b.len() < 9 {
        return false;
    }
    if pkt_a[0] >> 4 == 6 {
        // IPv6: byte 0 = version + TC high nibble, byte 1 high nibble = TC low
        // nibble, byte 7 = hop limit.
        pkt_a[0] == pkt_b[0] && pkt_a[1] >> 4 == pkt_b[1] >> 4 && pkt_a[7] == pkt_b[7]
    } else {
        // IPv4: byte 1 = ToS, top three bits of byte 6 = reserved/DF/MF flags
        // (MF itself is checked further up the stack), byte 8 = TTL.
        pkt_a[1] == pkt_b[1] && pkt_a[6] >> 5 == pkt_b[6] >> 5 && pkt_a[8] == pkt_b[8]
    }
}
725
726/// udpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
727/// described by item. iphLen and gsoSize describe pkt. bufs is the vector of
728/// packets involved in the current GRO evaluation. bufsOffset is the offset at
729/// which packet data begins within bufs.
730fn udp_packets_can_coalesce<B: ExpandBuffer>(
731 pkt: &[u8],
732 iph_len: u8,
733 gso_size: u16,
734 item: &UdpGROItem,
735 bufs: &[B],
736 bufs_offset: usize,
737) -> CanCoalesce {
738 let pkt_target = &bufs[item.bufs_index as usize].as_ref()[bufs_offset..];
739 if !ip_headers_can_coalesce(pkt, pkt_target) {
740 return CanCoalesce::Unavailable;
741 }
742 if (pkt_target[(iph_len as usize + UDP_H_LEN)..].len()) % (item.gso_size as usize) != 0 {
743 // A smaller than gsoSize packet has been appended previously.
744 // Nothing can come after a smaller packet on the end.
745 return CanCoalesce::Unavailable;
746 }
747 if gso_size > item.gso_size {
748 // We cannot have a larger packet following a smaller one.
749 return CanCoalesce::Unavailable;
750 }
751 CanCoalesce::Append
752}
753
/// tcpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
/// described by item. This function makes considerations that match the kernel's
/// GRO self tests, which can be found in tools/testing/selftests/net/gro.c.
///
/// `pkt` starts at the IP header; `iph_len`/`tcph_len`/`seq`/`psh_set`/`gso_size`
/// describe it. Returns `Append`/`Prepend` when `pkt` is sequence-adjacent to
/// the tracked item, `Unavailable` otherwise.
#[allow(clippy::too_many_arguments)]
fn tcp_packets_can_coalesce<B: ExpandBuffer>(
    pkt: &[u8],
    iph_len: u8,
    tcph_len: u8,
    seq: u32,
    psh_set: bool,
    gso_size: u16,
    item: &TcpGROItem,
    bufs: &[B],
    bufs_offset: usize,
) -> CanCoalesce {
    // The packet already accumulated for this flow.
    let pkt_target = &bufs[item.bufs_index as usize].as_ref()[bufs_offset..];

    if tcph_len != item.tcph_len {
        // cannot coalesce with unequal tcp options len
        return CanCoalesce::Unavailable;
    }

    // Beyond the fixed 20-byte TCP header, the raw option bytes must match.
    if tcph_len > 20
        && pkt[iph_len as usize + 20..iph_len as usize + tcph_len as usize]
            != pkt_target[item.iph_len as usize + 20..item.iph_len as usize + tcph_len as usize]
    {
        // cannot coalesce with unequal tcp options
        return CanCoalesce::Unavailable;
    }

    if !ip_headers_can_coalesce(pkt, pkt_target) {
        return CanCoalesce::Unavailable;
    }

    // seq adjacency
    // Total payload currently held by the item: its gso_size per merged
    // segment, plus one initial segment.
    let mut lhs_len = item.gso_size as usize;
    lhs_len += (item.num_merged as usize) * (item.gso_size as usize);

    if seq == item.sent_seq.wrapping_add(lhs_len as u32) {
        // pkt aligns following item from a seq num perspective
        if item.psh_set {
            // We cannot append to a segment that has the PSH flag set, PSH
            // can only be set on the final segment in a reassembled group.
            return CanCoalesce::Unavailable;
        }

        if pkt_target[iph_len as usize + tcph_len as usize..].len() % item.gso_size as usize != 0 {
            // A smaller than gsoSize packet has been appended previously.
            // Nothing can come after a smaller packet on the end.
            return CanCoalesce::Unavailable;
        }

        if gso_size > item.gso_size {
            // We cannot have a larger packet following a smaller one.
            return CanCoalesce::Unavailable;
        }

        return CanCoalesce::Append;
    } else if seq.wrapping_add(gso_size as u32) == item.sent_seq {
        // pkt aligns in front of item from a seq num perspective
        if psh_set {
            // We cannot prepend with a segment that has the PSH flag set, PSH
            // can only be set on the final segment in a reassembled group.
            return CanCoalesce::Unavailable;
        }

        if gso_size < item.gso_size {
            // We cannot have a larger packet following a smaller one.
            return CanCoalesce::Unavailable;
        }

        if gso_size > item.gso_size && item.num_merged > 0 {
            // There's at least one previous merge, and we're larger than all
            // previous. This would put multiple smaller packets on the end.
            return CanCoalesce::Unavailable;
        }

        return CanCoalesce::Prepend;
    }

    CanCoalesce::Unavailable
}
836
/// Returns whether the transport (TCP/UDP) checksum embedded in `pkt` is valid.
///
/// `pkt` must start at the IP header; `iph_len` is the IP header length and
/// `proto` the transport protocol number used in the pseudo header. Panics if
/// `pkt` is too short to hold the IP addresses at the version-specific offsets
/// — callers are expected to have validated lengths first.
fn checksum_valid(pkt: &[u8], iph_len: u8, proto: u8, is_v6: bool) -> bool {
    let (src_addr_at, addr_size) = if is_v6 {
        (IPV6_SRC_ADDR_OFFSET, 16)
    } else {
        (IPV4_SRC_ADDR_OFFSET, 4)
    };

    // Transport-segment length for the pseudo header (saturating guards a
    // malformed iph_len larger than the packet).
    let len_for_pseudo = (pkt.len() as u16).saturating_sub(iph_len as u16);

    let c_sum = pseudo_header_checksum_no_fold(
        proto,
        &pkt[src_addr_at..src_addr_at + addr_size],
        // Destination address immediately follows the source address.
        &pkt[src_addr_at + addr_size..src_addr_at + addr_size * 2],
        len_for_pseudo,
    );

    // `!` is bitwise NOT on the folded sum — the port of wireguard-go's
    // `^checksum(...) == 0`: a segment is valid when the ones'-complement sum
    // over pseudo header + transport header + payload folds to all-ones.
    !checksum(&pkt[iph_len as usize..], c_sum) == 0
}
855
/// coalesceResult represents the result of attempting to coalesce two TCP
/// packets.
enum CoalesceResult {
    /// The destination buffer's capacity cannot hold the merged payload
    /// (coalescing never reallocates the underlying storage).
    InsufficientCap,
    /// The candidate segment has PSH set and therefore cannot be prepended —
    /// PSH may only terminate a reassembled group.
    PSHEnding,
    /// The tracked item's own checksum is (or was flagged) invalid; it must
    /// not anchor a merge.
    ItemInvalidCSum,
    /// The candidate packet's checksum is invalid.
    PktInvalidCSum,
    /// The packets were merged.
    Success,
}
865
/// coalesceUDPPackets attempts to coalesce pkt with the packet described by
/// item, and returns the outcome.
///
/// On success the payload of `pkt` (everything past its IP + UDP headers) is
/// appended to the buffer already tracked by `item`, and `item.num_merged` is
/// bumped. Checksums of both sides are verified before the first merge.
fn coalesce_udp_packets<B: ExpandBuffer>(
    pkt: &[u8],
    item: &mut UdpGROItem,
    bufs: &mut [B],
    bufs_offset: usize,
    is_v6: bool,
) -> CoalesceResult {
    let buf = bufs[item.bufs_index as usize].as_ref();
    // let pkt_head = &buf[bufs_offset..]; // the packet that will end up at the front
    let headers_len = item.iph_len as usize + UDP_H_LEN;
    // Length of the merged packet: existing bytes + pkt minus its headers.
    let coalesced_len = buf[bufs_offset..].len() + pkt.len() - headers_len;
    if bufs[item.bufs_index as usize].buf_capacity() < bufs_offset * 2 + coalesced_len {
        // We don't want to allocate a new underlying array if capacity is
        // too small.
        return CoalesceResult::InsufficientCap;
    }

    // Only the very first merge validates the anchor packet's checksum; later
    // merges trust the earlier validation.
    if item.num_merged == 0
        && (item.c_sum_known_invalid
            || !checksum_valid(&buf[bufs_offset..], item.iph_len, IPPROTO_UDP as _, is_v6))
    {
        return CoalesceResult::ItemInvalidCSum;
    }

    if !checksum_valid(pkt, item.iph_len, IPPROTO_UDP as _, is_v6) {
        return CoalesceResult::PktInvalidCSum;
    }
    // Append only the payload; the anchor keeps its own IP/UDP headers.
    bufs[item.bufs_index as usize].buf_extend_from_slice(&pkt[headers_len..]);
    item.num_merged += 1;
    CoalesceResult::Success
}
899
/// coalesceTCPPackets attempts to coalesce pkt with the packet described by
/// item, and returns the outcome. This function may swap bufs elements in the
/// event of a prepend as item's bufs index is already being tracked for writing
/// to a Device.
///
/// `mode` selects append vs. prepend (decided earlier by
/// `tcp_packets_can_coalesce`); checksums on both sides are validated before
/// the first merge, and `item` bookkeeping (seq, gso_size, num_merged,
/// psh_set) is updated on success.
#[allow(clippy::too_many_arguments)]
fn coalesce_tcp_packets<B: ExpandBuffer>(
    mode: CanCoalesce,
    pkt: &[u8],
    pkt_bufs_index: usize,
    gso_size: u16,
    seq: u32,
    psh_set: bool,
    item: &mut TcpGROItem,
    bufs: &mut [B],
    bufs_offset: usize,
    is_v6: bool,
) -> CoalesceResult {
    let pkt_head: &[u8]; // the packet that will end up at the front
    let headers_len = (item.iph_len + item.tcph_len) as usize;
    // Length of the merged packet: existing bytes + pkt minus one set of headers.
    let coalesced_len =
        bufs[item.bufs_index as usize].as_ref()[bufs_offset..].len() + pkt.len() - headers_len;
    // Copy data
    if mode == CanCoalesce::Prepend {
        pkt_head = pkt;
        if bufs[pkt_bufs_index].buf_capacity() < 2 * bufs_offset + coalesced_len {
            // We don't want to allocate a new underlying array if capacity is
            // too small.
            return CoalesceResult::InsufficientCap;
        }
        if psh_set {
            return CoalesceResult::PSHEnding;
        }
        // Only the first merge validates the anchor packet's checksum.
        if item.num_merged == 0
            && !checksum_valid(
                &bufs[item.bufs_index as usize].as_ref()[bufs_offset..],
                item.iph_len,
                IPPROTO_TCP as _,
                is_v6,
            )
        {
            return CoalesceResult::ItemInvalidCSum;
        }
        if !checksum_valid(pkt, item.iph_len, IPPROTO_TCP as _, is_v6) {
            return CoalesceResult::PktInvalidCSum;
        }
        // The prepended packet's sequence number becomes the group's start.
        item.sent_seq = seq;
        let extend_by = coalesced_len - pkt_head.len();
        let len = bufs[pkt_bufs_index].as_ref().len();
        bufs[pkt_bufs_index].buf_resize(len + extend_by, 0);
        let src = bufs[item.bufs_index as usize].as_ref()[bufs_offset + headers_len..].as_ptr();
        let dst = bufs[pkt_bufs_index].as_mut()[bufs_offset + pkt.len()..].as_mut_ptr();
        unsafe {
            // SAFETY: `extend_by` bytes are in bounds on both sides (dst was
            // just grown by `extend_by`). Non-overlap relies on `src` and
            // `dst` living in distinct bufs elements, i.e.
            // `item.bufs_index != pkt_bufs_index` — NOTE(review): assumed to
            // hold for the prepend path; confirm the caller guarantees it.
            std::ptr::copy_nonoverlapping(src, dst, extend_by);
        }
        // Flip the slice headers in bufs as part of prepend. The index of item
        // is already being tracked for writing.
        bufs.swap(item.bufs_index as usize, pkt_bufs_index);
    } else {
        // pkt_head = &bufs[item.bufs_index as usize][bufs_offset..];
        if bufs[item.bufs_index as usize].buf_capacity() < 2 * bufs_offset + coalesced_len {
            // We don't want to allocate a new underlying array if capacity is
            // too small.
            return CoalesceResult::InsufficientCap;
        }
        // Only the first merge validates the anchor packet's checksum.
        if item.num_merged == 0
            && !checksum_valid(
                &bufs[item.bufs_index as usize].as_ref()[bufs_offset..],
                item.iph_len,
                IPPROTO_TCP as _,
                is_v6,
            )
        {
            return CoalesceResult::ItemInvalidCSum;
        }
        if !checksum_valid(pkt, item.iph_len, IPPROTO_TCP as _, is_v6) {
            return CoalesceResult::PktInvalidCSum;
        }
        if psh_set {
            // We are appending a segment with PSH set.
            item.psh_set = psh_set;
            // Propagate PSH into the anchor's TCP flags byte.
            bufs[item.bufs_index as usize].as_mut()
                [bufs_offset + item.iph_len as usize + TCP_FLAGS_OFFSET] |= TCP_FLAG_PSH;
        }
        // https://github.com/WireGuard/wireguard-go/blob/12269c2761734b15625017d8565745096325392f/tun/offload_linux.go#L495
        // extendBy := len(pkt) - int(headersLen)
        // bufs[item.bufsIndex] = append(bufs[item.bufsIndex], make([]byte, extendBy)...)
        // copy(bufs[item.bufsIndex][bufsOffset+len(pktHead):], pkt[headersLen:])
        bufs[item.bufs_index as usize].buf_extend_from_slice(&pkt[headers_len..]);
    }

    // Track the largest segment size seen for the group.
    if gso_size > item.gso_size {
        item.gso_size = gso_size;
    }

    item.num_merged += 1;
    CoalesceResult::Success
}
997
/// The "More Fragments" (MF) bit within byte 6 of the IPv4 header
/// (the flags / fragment-offset field).
const IPV4_FLAG_MORE_FRAGMENTS: u8 = 0x20;

/// Byte offset of the source address field in an IPv4 header.
const IPV4_SRC_ADDR_OFFSET: usize = 12;
/// Byte offset of the source address field in an IPv6 header.
const IPV6_SRC_ADDR_OFFSET: usize = 8;
// wireguard-go's `maxUint16 = 1<<16 - 1` is expressed via `u16::MAX` here.
1003
/// Outcome of evaluating a single packet for GRO (see [`tcp_gro`] / [`udp_gro`]).
#[derive(PartialEq, Eq)]
enum GroResult {
    /// No action was taken; the packet is passed through unmodified.
    Noop,
    /// The packet became a new coalescing head in the flow table.
    TableInsert,
    /// The packet was merged into an existing packet in the flow table.
    Coalesced,
}
1010
1011/// tcpGRO evaluates the TCP packet at pktI in bufs for coalescing with
1012/// existing packets tracked in table. It returns a groResultNoop when no
1013/// action was taken, groResultTableInsert when the evaluated packet was
1014/// inserted into table, and groResultCoalesced when the evaluated packet was
1015/// coalesced with another packet in table.
fn tcp_gro<B: ExpandBuffer>(
    bufs: &mut [B],
    offset: usize,
    pkt_i: usize,
    table: &mut TcpGROTable,
    is_v6: bool,
) -> GroResult {
    // NOTE(review): `pkt` is an unchecked immutable alias of bufs[pkt_i] held
    // across later `&mut bufs` uses; soundness appears to rely on the
    // coalescing helpers never reallocating this buffer (they pre-check
    // capacity before resizing) — inherited from the wireguard-go port,
    // confirm before refactoring.
    let pkt = unsafe { &*(&bufs[pkt_i].as_ref()[offset..] as *const [u8]) };
    if pkt.len() > u16::MAX as usize {
        // A valid IPv4 or IPv6 packet will never exceed this.
        return GroResult::Noop;
    }

    // IPv4: header length from the IHL nibble; IPv6: fixed 40-byte header.
    // In both cases the IP length field must agree with the slice length.
    let mut iph_len = ((pkt[0] & 0x0F) * 4) as usize;
    if is_v6 {
        iph_len = 40;
        let ipv6_h_payload_len = u16::from_be_bytes([pkt[4], pkt[5]]) as usize;
        if ipv6_h_payload_len != pkt.len() - iph_len {
            return GroResult::Noop;
        }
    } else {
        let total_len = u16::from_be_bytes([pkt[2], pkt[3]]) as usize;
        if total_len != pkt.len() {
            return GroResult::Noop;
        }
    }

    if pkt.len() < iph_len {
        return GroResult::Noop;
    }

    // TCP data offset (header length) must be a sane 20..=60 bytes.
    let tcph_len = ((pkt[iph_len + 12] >> 4) * 4) as usize;
    if !(20..=60).contains(&tcph_len) {
        return GroResult::Noop;
    }

    if pkt.len() < iph_len + tcph_len {
        return GroResult::Noop;
    }

    // IPv4 bytes 6..8 hold flags + fragment offset; `pkt[6] << 3` discards
    // the three flag bits, leaving the high fragment-offset bits.
    if !is_v6 && (pkt[6] & IPV4_FLAG_MORE_FRAGMENTS != 0 || pkt[6] << 3 != 0 || pkt[7] != 0) {
        // no GRO support for fragmented segments for now
        return GroResult::Noop;
    }

    let tcp_flags = pkt[iph_len + TCP_FLAGS_OFFSET];
    let mut psh_set = false;

    // not a candidate if any non-ACK flags (except PSH+ACK) are set
    if tcp_flags != TCP_FLAG_ACK {
        if pkt[iph_len + TCP_FLAGS_OFFSET] != TCP_FLAG_ACK | TCP_FLAG_PSH {
            return GroResult::Noop;
        }
        psh_set = true;
    }

    let gso_size = (pkt.len() - tcph_len - iph_len) as u16;
    // not a candidate if payload len is 0
    if gso_size < 1 {
        return GroResult::Noop;
    }

    // TCP sequence number of the evaluated packet.
    let seq = u32::from_be_bytes([
        pkt[iph_len + 4],
        pkt[iph_len + 5],
        pkt[iph_len + 6],
        pkt[iph_len + 7],
    ]);

    let mut src_addr_offset = IPV4_SRC_ADDR_OFFSET;
    let mut addr_len = 4;
    if is_v6 {
        src_addr_offset = IPV6_SRC_ADDR_OFFSET;
        addr_len = 16;
    }

    // `None` means the flow was absent and lookup_or_insert already stored
    // this packet as a new flow head.
    let items = if let Some(items) = table.lookup_or_insert(
        pkt,
        src_addr_offset,
        src_addr_offset + addr_len,
        iph_len,
        tcph_len,
        pkt_i,
    ) {
        items
    } else {
        return GroResult::TableInsert;
    };

    for i in (0..items.len()).rev() {
        // In the best case of packets arriving in order iterating in reverse is
        // more efficient if there are multiple items for a given flow. This
        // also enables a natural table.delete_at() in the
        // coalesce_item_invalid_csum case without the need for index tracking.
        // This algorithm makes a best effort to coalesce in the event of
        // unordered packets, where pkt may land anywhere in items from a
        // sequence number perspective, however once an item is inserted into
        // the table it is never compared across other items later.
        let item = &mut items[i];
        let can = tcp_packets_can_coalesce(
            pkt,
            iph_len as u8,
            tcph_len as u8,
            seq,
            psh_set,
            gso_size,
            item,
            bufs,
            offset,
        );

        match can {
            CanCoalesce::Unavailable => {}
            _ => {
                let result = coalesce_tcp_packets(
                    can, pkt, pkt_i, gso_size, seq, psh_set, item, bufs, offset, is_v6,
                );

                match result {
                    CoalesceResult::Success => {
                        // `item` is a direct mutable reference into the table,
                        // so wireguard-go's table.update_at(item, i) is not needed.
                        return GroResult::Coalesced;
                    }
                    CoalesceResult::ItemInvalidCSum => {
                        // delete the item with an invalid csum
                        // (equivalent of wireguard-go's table.delete_at(item.key, i))
                        items.remove(i);
                    }
                    CoalesceResult::PktInvalidCSum => {
                        // no point in inserting an item that we can't coalesce
                        return GroResult::Noop;
                    }
                    _ => {}
                }
            }
        }
    }

    // failed to coalesce with any other packets; store the item in the flow
    table.insert(
        pkt,
        src_addr_offset,
        src_addr_offset + addr_len,
        iph_len,
        tcph_len,
        pkt_i,
    );
    GroResult::TableInsert
}
1165
1166/// Update packet headers after TCP packet coalescing.
1167///
1168/// After [`handle_gro`] coalesces multiple TCP packets into larger segments,
1169/// this function updates the packet headers to reflect the coalesced state.
1170/// It writes virtio headers with GSO information and updates IP/TCP headers.
1171///
1172/// # Arguments
1173///
1174/// * `bufs` - Mutable slice of packet buffers that were processed by GRO
1175/// * `offset` - Offset where packet data begins (typically [`VIRTIO_NET_HDR_LEN`])
1176/// * `table` - The TCP GRO table containing coalescing metadata
1177///
1178/// # What It Does
1179///
1180/// For each coalesced packet:
1181/// 1. Creates a virtio header with GSO type set to TCP (v4 or v6)
1182/// 2. Sets the segment size (`gso_size`) for future segmentation
1183/// 3. Calculates and stores the pseudo-header checksum for TCP
1184/// 4. Updates IP total length field
1185/// 5. Recalculates IPv4 header checksum if needed
1186///
1187/// The resulting packets can be efficiently segmented by the kernel when transmitted.
1188///
1189/// # Usage
1190///
1191/// This function is typically called automatically by [`handle_gro`] after packet
1192/// coalescing is complete. You usually don't need to call it directly.
1193///
1194/// # Errors
1195///
1196/// Returns an error if:
1197/// - Buffer sizes are incorrect
1198/// - Header encoding fails
1199/// - Packet structure is invalid
1200///
1201/// # See Also
1202///
1203/// - [`handle_gro`] - Main GRO processing function that calls this
1204/// - [`TcpGROTable`] - Maintains TCP flow state for coalescing
pub fn apply_tcp_coalesce_accounting<B: ExpandBuffer>(
    bufs: &mut [B],
    offset: usize,
    table: &TcpGROTable,
) -> io::Result<()> {
    for items in table.items_by_flow.values() {
        for item in items {
            if item.num_merged > 0 {
                // This packet absorbed others: describe it to the kernel as a
                // GSO packet so it can be re-segmented on transmit.
                let mut hdr = VirtioNetHdr {
                    flags: VIRTIO_NET_HDR_F_NEEDS_CSUM,
                    hdr_len: (item.iph_len + item.tcph_len) as u16,
                    gso_size: item.gso_size,
                    csum_start: item.iph_len as u16,
                    // The TCP checksum field sits 16 bytes into the TCP header.
                    csum_offset: 16,
                    gso_type: 0, // Will be set later
                };
                let buf = bufs[item.bufs_index as usize].as_mut();
                let pkt = &mut buf[offset..];
                let pkt_len = pkt.len();

                // Calculate the pseudo header checksum and place it at the TCP
                // checksum offset. Downstream checksum offloading will combine
                // this with computation of the tcp header and payload checksum.
                let addr_len = if item.key.is_v6 { 16 } else { 4 };
                let src_addr_at = if item.key.is_v6 {
                    IPV6_SRC_ADDR_OFFSET
                } else {
                    IPV4_SRC_ADDR_OFFSET
                };

                // NOTE(review): raw-pointer borrows alias `pkt` so the address
                // bytes stay readable after `pkt` is re-borrowed mutably below;
                // sound only while `buf` is not reallocated (it is not here).
                let src_addr =
                    unsafe { &*(&pkt[src_addr_at..src_addr_at + addr_len] as *const [u8]) };
                let dst_addr = unsafe {
                    &*(&pkt[src_addr_at + addr_len..src_addr_at + addr_len * 2] as *const [u8])
                };
                // Recalculate the total len (IPv4) or payload len (IPv6).
                // Recalculate the (IPv4) header checksum.
                if item.key.is_v6 {
                    hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
                    BigEndian::write_u16(&mut pkt[4..6], pkt_len as u16 - item.iph_len as u16);
                } else {
                    hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                    // Zero the header checksum field before recomputing it.
                    pkt[10] = 0;
                    pkt[11] = 0;
                    BigEndian::write_u16(&mut pkt[2..4], pkt_len as u16);
                    let iph_csum = !checksum(&pkt[..item.iph_len as usize], 0);
                    BigEndian::write_u16(&mut pkt[10..12], iph_csum);
                }

                // Write the virtio header into the space reserved before `offset`.
                hdr.encode(&mut buf[offset - VIRTIO_NET_HDR_LEN..])?;

                let pkt = &mut buf[offset..];

                let psum = pseudo_header_checksum_no_fold(
                    IPPROTO_TCP as _,
                    src_addr,
                    dst_addr,
                    pkt_len as u16 - item.iph_len as u16,
                );
                // Fold the pseudo-header sum and park it in the TCP checksum
                // field; the kernel completes the checksum from csum_start.
                let tcp_csum = checksum(&[], psum);
                BigEndian::write_u16(
                    &mut pkt[(hdr.csum_start + hdr.csum_offset) as usize..],
                    tcp_csum,
                );
            } else {
                // Never merged: emit with a default (no-offload) virtio header.
                let hdr = VirtioNetHdr::default();
                hdr.encode(
                    &mut bufs[item.bufs_index as usize].as_mut()[offset - VIRTIO_NET_HDR_LEN..],
                )?;
            }
        }
    }
    Ok(())
}
1279
1280// applyUDPCoalesceAccounting updates bufs to account for coalescing based on the
1281// metadata found in table.
pub fn apply_udp_coalesce_accounting<B: ExpandBuffer>(
    bufs: &mut [B],
    offset: usize,
    table: &UdpGROTable,
) -> io::Result<()> {
    for items in table.items_by_flow.values() {
        for item in items {
            if item.num_merged > 0 {
                // Coalesced packet: describe it as UDP GSO so the kernel
                // re-segments it on transmit.
                let hdr = VirtioNetHdr {
                    flags: VIRTIO_NET_HDR_F_NEEDS_CSUM, // this turns into CHECKSUM_PARTIAL in the skb
                    hdr_len: item.iph_len as u16 + UDP_H_LEN as u16,
                    gso_size: item.gso_size,
                    csum_start: item.iph_len as u16,
                    // The UDP checksum field sits 6 bytes into the UDP header.
                    csum_offset: 6,
                    gso_type: VIRTIO_NET_HDR_GSO_UDP_L4,
                };

                let buf = bufs[item.bufs_index as usize].as_mut();
                let pkt = &mut buf[offset..];
                let pkt_len = pkt.len();

                // Calculate the pseudo header checksum and place it at the UDP
                // checksum offset. Downstream checksum offloading will combine
                // this with computation of the udp header and payload checksum.
                let (addr_len, src_addr_at) = if item.key.is_v6 {
                    (16, IPV6_SRC_ADDR_OFFSET)
                } else {
                    (4, IPV4_SRC_ADDR_OFFSET)
                };

                // NOTE(review): raw-pointer borrows alias `pkt` so the address
                // bytes stay readable after `pkt` is re-borrowed mutably below;
                // sound only while `buf` is not reallocated (it is not here).
                let src_addr =
                    unsafe { &*(&pkt[src_addr_at..(src_addr_at + addr_len)] as *const [u8]) };
                let dst_addr = unsafe {
                    &*(&pkt[(src_addr_at + addr_len)..(src_addr_at + addr_len * 2)]
                        as *const [u8])
                };

                // Recalculate the total len (IPv4) or payload len (IPv6).
                // Recalculate the (IPv4) header checksum.
                if item.key.is_v6 {
                    BigEndian::write_u16(&mut pkt[4..6], pkt_len as u16 - item.iph_len as u16);
                    // set new IPv6 header payload len
                } else {
                    pkt[10] = 0;
                    pkt[11] = 0;
                    BigEndian::write_u16(&mut pkt[2..4], pkt_len as u16); // set new total length
                    let iph_csum = !checksum(&pkt[..item.iph_len as usize], 0);
                    BigEndian::write_u16(&mut pkt[10..12], iph_csum); // set IPv4 header checksum field
                }

                // Write the virtio header into the space reserved before `offset`.
                hdr.encode(&mut buf[offset - VIRTIO_NET_HDR_LEN..])?;
                let pkt = &mut buf[offset..];
                // Recalculate the UDP len field value
                BigEndian::write_u16(
                    &mut pkt[(item.iph_len as usize + 4)..(item.iph_len as usize + 6)],
                    pkt_len as u16 - item.iph_len as u16,
                );

                let psum = pseudo_header_checksum_no_fold(
                    IPPROTO_UDP as _,
                    src_addr,
                    dst_addr,
                    pkt_len as u16 - item.iph_len as u16,
                );

                // Fold the pseudo-header sum into the UDP checksum field; the
                // kernel completes the checksum from csum_start.
                let udp_csum = checksum(&[], psum);
                BigEndian::write_u16(
                    &mut pkt[(hdr.csum_start + hdr.csum_offset) as usize..],
                    udp_csum,
                );
            } else {
                // Never merged: emit with a default (no-offload) virtio header.
                let hdr = VirtioNetHdr::default();
                hdr.encode(
                    &mut bufs[item.bufs_index as usize].as_mut()[offset - VIRTIO_NET_HDR_LEN..],
                )?;
            }
        }
    }
    Ok(())
}
1362
/// Classification of a packet's eligibility for GRO coalescing.
#[derive(PartialEq, Eq)]
pub enum GroCandidateType {
    /// Not eligible for coalescing; pass through unmodified.
    NotGRO,
    /// TCP over IPv4 candidate.
    Tcp4GRO,
    /// TCP over IPv6 candidate.
    Tcp6GRO,
    /// UDP over IPv4 candidate (requires kernel UDP GRO support).
    Udp4GRO,
    /// UDP over IPv6 candidate (requires kernel UDP GRO support).
    Udp6GRO,
}

/// Classify `b` as a GRO candidate from its IP version nibble, transport
/// protocol and minimum plausible length. `can_udp_gro` gates the UDP
/// variants on kernel support.
pub fn packet_is_gro_candidate(b: &[u8], can_udp_gro: bool) -> GroCandidateType {
    // 28 bytes = minimal IPv4 header (20) + UDP header (8); nothing shorter
    // can hold any transport header we coalesce.
    if b.len() < 28 {
        return GroCandidateType::NotGRO;
    }
    match b[0] >> 4 {
        4 => {
            // IPv4 packets w/IP options (IHL != 5) do not coalesce.
            if b[0] & 0x0F != 5 {
                return GroCandidateType::NotGRO;
            }
            let proto = b[9];
            if proto == 6 && b.len() >= 40 {
                GroCandidateType::Tcp4GRO
            } else if proto == 17 && can_udp_gro {
                GroCandidateType::Udp4GRO
            } else {
                GroCandidateType::NotGRO
            }
        }
        6 => {
            let next_header = b[6];
            if next_header == 6 && b.len() >= 60 {
                GroCandidateType::Tcp6GRO
            } else if next_header == 17 && b.len() >= 48 && can_udp_gro {
                GroCandidateType::Udp6GRO
            } else {
                GroCandidateType::NotGRO
            }
        }
        _ => GroCandidateType::NotGRO,
    }
}
1395
1396const UDP_H_LEN: usize = 8;
1397
1398/// udpGRO evaluates the UDP packet at pktI in bufs for coalescing with
1399/// existing packets tracked in table. It returns a groResultNoop when no
1400/// action was taken, groResultTableInsert when the evaluated packet was
1401/// inserted into table, and groResultCoalesced when the evaluated packet was
1402/// coalesced with another packet in table.
fn udp_gro<B: ExpandBuffer>(
    bufs: &mut [B],
    offset: usize,
    pkt_i: usize,
    table: &mut UdpGROTable,
    is_v6: bool,
) -> GroResult {
    // NOTE(review): `pkt` is an unchecked immutable alias of bufs[pkt_i] held
    // across later `&mut bufs` uses; soundness appears to rely on the helpers
    // never reallocating this buffer — inherited from the wireguard-go port.
    let pkt = unsafe { &*(&bufs[pkt_i].as_ref()[offset..] as *const [u8]) };
    if pkt.len() > u16::MAX as usize {
        // A valid IPv4 or IPv6 packet will never exceed this.
        return GroResult::Noop;
    }

    // IPv4: header length from the IHL nibble; IPv6: fixed 40-byte header.
    // In both cases the IP length field must agree with the slice length.
    let mut iph_len = ((pkt[0] & 0x0F) * 4) as usize;
    if is_v6 {
        iph_len = 40;
        let ipv6_payload_len = u16::from_be_bytes([pkt[4], pkt[5]]) as usize;
        if ipv6_payload_len != pkt.len() - iph_len {
            return GroResult::Noop;
        }
    } else {
        let total_len = u16::from_be_bytes([pkt[2], pkt[3]]) as usize;
        if total_len != pkt.len() {
            return GroResult::Noop;
        }
    }

    if pkt.len() < iph_len || pkt.len() < iph_len + UDP_H_LEN {
        return GroResult::Noop;
    }

    // IPv4 bytes 6..8 hold flags + fragment offset; `pkt[6] << 3` discards
    // the three flag bits, leaving the high fragment-offset bits.
    if !is_v6 && (pkt[6] & IPV4_FLAG_MORE_FRAGMENTS != 0 || pkt[6] << 3 != 0 || pkt[7] != 0) {
        // No GRO support for fragmented segments for now.
        return GroResult::Noop;
    }

    // Per-segment payload size; zero-length payloads are not candidates.
    let gso_size = (pkt.len() - UDP_H_LEN - iph_len) as u16;
    if gso_size < 1 {
        return GroResult::Noop;
    }

    let (src_addr_offset, addr_len) = if is_v6 {
        (IPV6_SRC_ADDR_OFFSET, 16)
    } else {
        (IPV4_SRC_ADDR_OFFSET, 4)
    };

    let items = table.lookup_or_insert(
        pkt,
        src_addr_offset,
        src_addr_offset + addr_len,
        iph_len,
        pkt_i,
    );

    // `None` means the flow was absent and lookup_or_insert already stored
    // this packet as a new flow head.
    let items = if let Some(items) = items {
        items
    } else {
        return GroResult::TableInsert;
    };

    // Only check the last item to prevent reordering packets for a flow.
    let items_len = items.len();
    let item = &mut items[items_len - 1];
    let can = udp_packets_can_coalesce(pkt, iph_len as u8, gso_size, item, bufs, offset);
    let mut pkt_csum_known_invalid = false;

    if can == CanCoalesce::Append {
        match coalesce_udp_packets(pkt, item, bufs, offset, is_v6) {
            CoalesceResult::Success => {
                // `item` is a direct mutable reference into the table, so
                // wireguard-go's table.update_at(*item, items_len - 1) is not
                // needed here.
                return GroResult::Coalesced;
            }
            CoalesceResult::ItemInvalidCSum => {
                // If the existing item has an invalid checksum, take no action.
                // A new item will be stored, and the existing item won't be revisited.
            }
            CoalesceResult::PktInvalidCSum => {
                // Insert a new item but mark it with invalid checksum to avoid repeat checks.
                pkt_csum_known_invalid = true;
            }
            _ => {}
        }
    }
    let pkt = &bufs[pkt_i].as_ref()[offset..];
    // Failed to coalesce; store the packet in the flow.
    table.insert(
        pkt,
        src_addr_offset,
        src_addr_offset + addr_len,
        iph_len,
        pkt_i,
        pkt_csum_known_invalid,
    );
    GroResult::TableInsert
}
1500
/// Process received packets and apply Generic Receive Offload (GRO)
/// coalescing, writing the indices of the resulting packets into `to_write`.
1503///
1504/// This function examines a batch of received packets and coalesces packets belonging
1505/// to the same TCP or UDP flow into larger segments, reducing per-packet overhead.
1506///
1507/// # Arguments
1508///
1509/// * `bufs` - Mutable slice of packet buffers. Each buffer should contain a full packet
1510/// starting at `offset` (with space before offset for the virtio header).
1511/// * `offset` - Offset where packet data begins (typically [`VIRTIO_NET_HDR_LEN`]).
1512/// The virtio header will be written before this offset.
1513/// * `tcp_table` - TCP GRO table for tracking TCP flows.
1514/// * `udp_table` - UDP GRO table for tracking UDP flows.
1515/// * `can_udp_gro` - Whether UDP GRO is supported (kernel feature).
1516/// * `to_write` - Output vector that will be filled with indices of packets to write.
1517/// Initially should be empty.
1518///
1519/// # Returns
1520///
1521/// Returns `Ok(())` on success, or an error if packet processing fails.
1522///
1523/// # Behavior
1524///
1525/// 1. Examines each packet to determine if it's a GRO candidate (TCP or UDP)
1526/// 2. Attempts to coalesce the packet with previous packets in the same flow
1527/// 3. Writes indices of final packets (coalesced or standalone) to `to_write`
1528/// 4. Updates packet headers with appropriate virtio headers
1529///
1530/// # Example
1531///
1532/// ```no_run
1533/// # #[cfg(target_os = "linux")]
1534/// # {
1535/// use tun_rs::{handle_gro, GROTable, VIRTIO_NET_HDR_LEN};
1536///
1537/// let mut gro_table = GROTable::default();
1538/// let mut bufs = vec![vec![0u8; 1500]; 128];
1539/// let mut to_write = Vec::new();
1540///
1541/// // After receiving packets into bufs with recv_multiple:
1542/// // handle_gro(
1543/// // &mut bufs,
1544/// // VIRTIO_NET_HDR_LEN,
1545/// // &mut gro_table.tcp_table,
1546/// // &mut gro_table.udp_table,
1547/// // true, // UDP GRO supported
1548/// // &mut to_write
1549/// // )?;
1550///
1551/// // to_write now contains indices of packets to process
1552/// // for idx in &to_write {
1553/// // let packet = &bufs[*idx];
1554/// // // process packet...
1555/// // }
1556/// # }
1557/// # Ok::<(), std::io::Error>(())
1558/// ```
1559///
1560/// # Performance
1561///
1562/// - Coalescing reduces the number of packets passed to the application
1563/// - Typical coalescing ratios: 5-20 packets into 1 for bulk TCP transfers
1564/// - Most effective for sequential TCP traffic with large receive windows
1565///
1566/// # See Also
1567///
1568/// - [`GROTable`] for managing GRO state
1569/// - [`apply_tcp_coalesce_accounting`] for updating TCP headers after coalescing
1570pub fn handle_gro<B: ExpandBuffer>(
1571 bufs: &mut [B],
1572 offset: usize,
1573 tcp_table: &mut TcpGROTable,
1574 udp_table: &mut UdpGROTable,
1575 can_udp_gro: bool,
1576 to_write: &mut Vec<usize>,
1577) -> io::Result<()> {
1578 let bufs_len = bufs.len();
1579 for i in 0..bufs_len {
1580 if offset < VIRTIO_NET_HDR_LEN || offset > bufs[i].as_ref().len() - 1 {
1581 return Err(io::Error::new(
1582 io::ErrorKind::InvalidInput,
1583 "invalid offset",
1584 ));
1585 }
1586
1587 let result = match packet_is_gro_candidate(&bufs[i].as_ref()[offset..], can_udp_gro) {
1588 GroCandidateType::Tcp4GRO => tcp_gro(bufs, offset, i, tcp_table, false),
1589 GroCandidateType::Tcp6GRO => tcp_gro(bufs, offset, i, tcp_table, true),
1590 GroCandidateType::Udp4GRO => udp_gro(bufs, offset, i, udp_table, false),
1591 GroCandidateType::Udp6GRO => udp_gro(bufs, offset, i, udp_table, true),
1592 GroCandidateType::NotGRO => GroResult::Noop,
1593 };
1594
1595 match result {
1596 GroResult::Noop => {
1597 let hdr = VirtioNetHdr::default();
1598 hdr.encode(&mut bufs[i].as_mut()[offset - VIRTIO_NET_HDR_LEN..offset])?;
1599 // Fallthrough intended
1600 to_write.push(i);
1601 }
1602 GroResult::TableInsert => {
1603 to_write.push(i);
1604 }
1605 _ => {}
1606 }
1607 }
1608
1609 let err_tcp = apply_tcp_coalesce_accounting(bufs, offset, tcp_table);
1610 let err_udp = apply_udp_coalesce_accounting(bufs, offset, udp_table);
1611 err_tcp?;
1612 err_udp?;
1613 Ok(())
1614}
1615
1616/// Split a GSO (Generic Segmentation Offload) packet into multiple smaller packets.
1617///
1618/// When sending data with offload enabled, the application can provide large packets
1619/// that will be automatically segmented. This function performs the opposite operation:
1620/// splitting a large GSO packet into MTU-sized segments for transmission.
1621///
1622/// # Arguments
1623///
1624/// * `input` - The input buffer containing the large GSO packet (with virtio header).
1625/// * `hdr` - The virtio network header describing the GSO packet.
1626/// * `out_bufs` - Output buffers where segmented packets will be written.
1627/// * `sizes` - Output array where the size of each segmented packet will be written.
1628/// * `out_offset` - Offset in output buffers where packet data should start.
1629/// * `is_v6` - Whether this is an IPv6 packet (affects header offsets).
1630///
1631/// # Returns
1632///
1633/// Returns the number of output buffers populated (number of segments created),
1634/// or an error if segmentation fails.
1635///
1636/// # How GSO Splitting Works
1637///
1638/// For a large TCP packet with GSO enabled:
1639/// 1. The packet headers are parsed (IP + TCP)
1640/// 2. The payload is split into segments of size `hdr.gso_size`
1641/// 3. New packets are created with copied headers and updated fields:
1642/// - IP length field
1643/// - IP checksum (for IPv4)
1644/// - TCP sequence number (incremented for each segment)
1645/// - TCP checksum
1646///
1647/// # Example
1648///
1649/// ```no_run
1650/// # #[cfg(target_os = "linux")]
1651/// # {
1652/// use tun_rs::{gso_split, VirtioNetHdr, VIRTIO_NET_HDR_LEN};
1653///
1654/// let mut large_packet = vec![0u8; 65536];
1655/// let hdr = VirtioNetHdr::default();
1656/// let mut out_bufs = vec![vec![0u8; 1500]; 128];
1657/// let mut sizes = vec![0; 128];
1658///
1659/// // Split the GSO packet
1660/// // let num_segments = gso_split(
1661/// // &mut large_packet,
1662/// // hdr,
1663/// // &mut out_bufs,
1664/// // &mut sizes,
1665/// // VIRTIO_NET_HDR_LEN,
1666/// // false // IPv4
1667/// // )?;
1668///
1669/// // Now out_bufs[0..num_segments] contain the segmented packets
1670/// # }
1671/// # Ok::<(), std::io::Error>(())
1672/// ```
1673///
1674/// # Supported Protocols
1675///
1676/// - TCP over IPv4 (GSO type: [`VIRTIO_NET_HDR_GSO_TCPV4`])
1677/// - TCP over IPv6 (GSO type: [`VIRTIO_NET_HDR_GSO_TCPV6`])
1678/// - UDP (GSO type: [`VIRTIO_NET_HDR_GSO_UDP_L4`])
1679///
1680/// # Performance
1681///
1682/// GSO allows sending fewer, larger packets to the kernel, which then performs
1683/// efficient segmentation. This reduces:
1684/// - Number of system calls
1685/// - Per-packet processing overhead in the application
1686/// - Context switches
1687///
1688/// Typical performance improvement: 2-5x for bulk transfers.
pub fn gso_split<B: AsRef<[u8]> + AsMut<[u8]>>(
    input: &mut [u8],
    hdr: VirtioNetHdr,
    out_bufs: &mut [B],
    sizes: &mut [usize],
    out_offset: usize,
    is_v6: bool,
) -> io::Result<usize> {
    // For the supported GSO types csum_start marks where the transport header
    // begins, i.e. the IP header length.
    let iph_len = hdr.csum_start as usize;
    let (src_addr_offset, addr_len) = if is_v6 {
        (IPV6_SRC_ADDR_OFFSET, 16)
    } else {
        input[10] = 0;
        input[11] = 0; // clear IPv4 header checksum
        (IPV4_SRC_ADDR_OFFSET, 4)
    };

    let transport_csum_at = (hdr.csum_start + hdr.csum_offset) as usize;
    input[transport_csum_at] = 0;
    input[transport_csum_at + 1] = 0; // clear TCP/UDP checksum

    // TCP needs the starting sequence number so each segment can advance it;
    // UDP segments keep their headers apart from length/checksum.
    let (first_tcp_seq_num, protocol) =
        if hdr.gso_type == VIRTIO_NET_HDR_GSO_TCPV4 || hdr.gso_type == VIRTIO_NET_HDR_GSO_TCPV6 {
            (
                BigEndian::read_u32(&input[hdr.csum_start as usize + 4..]),
                IPPROTO_TCP,
            )
        } else {
            (0, IPPROTO_UDP)
        };

    let src_addr_bytes = &input[src_addr_offset..src_addr_offset + addr_len];
    let dst_addr_bytes = &input[src_addr_offset + addr_len..src_addr_offset + 2 * addr_len];
    let transport_header_len = (hdr.hdr_len - hdr.csum_start) as usize;

    // Every segment except possibly the last carries exactly gso_size payload
    // bytes, so its lengths and pseudo-header checksum can be precomputed once.
    let nonlast_segment_data_len = hdr.gso_size as usize;
    let nonlast_len_for_pseudo = (transport_header_len + nonlast_segment_data_len) as u16;
    let nonlast_total_len = hdr.hdr_len as usize + nonlast_segment_data_len;

    let nonlast_transport_csum_no_fold = pseudo_header_checksum_no_fold(
        protocol as u8,
        src_addr_bytes,
        dst_addr_bytes,
        nonlast_len_for_pseudo,
    );

    let mut next_segment_data_at = hdr.hdr_len as usize;
    let mut i = 0;

    while next_segment_data_at < input.len() {
        if i == out_bufs.len() {
            return Err(io::Error::other("ErrTooManySegments"));
        }

        let next_segment_end = next_segment_data_at + hdr.gso_size as usize;
        // The final segment may be shorter than gso_size; recompute its
        // lengths and pseudo-header checksum in that case.
        let (next_segment_end, segment_data_len, total_len, transport_csum_no_fold) =
            if next_segment_end > input.len() {
                let last_segment_data_len = input.len() - next_segment_data_at;
                let last_len_for_pseudo = (transport_header_len + last_segment_data_len) as u16;

                let last_total_len = hdr.hdr_len as usize + last_segment_data_len;
                let last_transport_csum_no_fold = pseudo_header_checksum_no_fold(
                    protocol as u8,
                    src_addr_bytes,
                    dst_addr_bytes,
                    last_len_for_pseudo,
                );
                (
                    input.len(),
                    last_segment_data_len,
                    last_total_len,
                    last_transport_csum_no_fold,
                )
            } else {
                (
                    next_segment_end,
                    hdr.gso_size as usize,
                    nonlast_total_len,
                    nonlast_transport_csum_no_fold,
                )
            };

        sizes[i] = total_len;
        let out = &mut out_bufs[i].as_mut()[out_offset..];

        // Copy the IP header, then patch the per-segment fields below.
        out[..iph_len].copy_from_slice(&input[..iph_len]);

        if !is_v6 {
            // For IPv4 we are responsible for incrementing the ID field,
            // updating the total len field, and recalculating the header
            // checksum.
            if i > 0 {
                let id = BigEndian::read_u16(&out[4..]).wrapping_add(i as u16);
                BigEndian::write_u16(&mut out[4..6], id);
            }
            BigEndian::write_u16(&mut out[2..4], total_len as u16);
            let ipv4_csum = !checksum(&out[..iph_len], 0);
            BigEndian::write_u16(&mut out[10..12], ipv4_csum);
        } else {
            // For IPv6 we are responsible for updating the payload length field.
            // IPv6 extensions are not checksummed, but included in the payload length.
            const IPV6_FIXED_HDR_LEN: usize = 40;
            let payload_len = total_len - IPV6_FIXED_HDR_LEN;
            BigEndian::write_u16(&mut out[4..6], payload_len as u16);
        }

        // Copy the transport (TCP/UDP) header, then patch per-segment fields.
        out[hdr.csum_start as usize..hdr.hdr_len as usize]
            .copy_from_slice(&input[hdr.csum_start as usize..hdr.hdr_len as usize]);

        if protocol == IPPROTO_TCP {
            // Advance the sequence number by the payload emitted so far.
            let tcp_seq = first_tcp_seq_num.wrapping_add(hdr.gso_size as u32 * i as u32);
            BigEndian::write_u32(
                &mut out[(hdr.csum_start + 4) as usize..(hdr.csum_start + 8) as usize],
                tcp_seq,
            );
            if next_segment_end != input.len() {
                // FIN and PSH belong only on the last segment.
                out[hdr.csum_start as usize + TCP_FLAGS_OFFSET] &= !(TCP_FLAG_FIN | TCP_FLAG_PSH);
            }
        } else {
            // UDP: rewrite the length field for this segment.
            let udp_len = (segment_data_len + (hdr.hdr_len - hdr.csum_start) as usize) as u16;
            BigEndian::write_u16(
                &mut out[(hdr.csum_start + 4) as usize..(hdr.csum_start + 6) as usize],
                udp_len,
            );
        }

        // Copy this segment's payload into place.
        out[hdr.hdr_len as usize..total_len]
            .as_mut()
            .copy_from_slice(&input[next_segment_data_at..next_segment_end]);

        // Transport checksum over header + payload, seeded with the
        // precomputed pseudo-header sum.
        let transport_csum = !checksum(
            &out[hdr.csum_start as usize..total_len],
            transport_csum_no_fold,
        );
        BigEndian::write_u16(
            &mut out[transport_csum_at..transport_csum_at + 2],
            transport_csum,
        );

        next_segment_data_at += hdr.gso_size as usize;
        i += 1;
    }

    Ok(i)
}
1834
1835/// Calculate checksum for packets without GSO.
1836///
1837/// This function computes and writes the transport layer (TCP/UDP) checksum for
1838/// packets that don't use Generic Segmentation Offload.
1839///
1840/// # Arguments
1841///
1842/// * `in_buf` - The packet buffer (mutable)
1843/// * `csum_start` - Offset where checksum calculation should begin
1844/// * `csum_offset` - Offset within the checksummed area where the checksum should be written
1845///
1846/// # Behavior
1847///
1848/// 1. Reads the initial checksum value (typically the pseudo-header checksum)
1849/// 2. Clears the checksum field
1850/// 3. Calculates the checksum over the transport header and data
1851/// 4. Writes the final checksum back to the buffer
1852///
1853/// This is used when [`VIRTIO_NET_HDR_F_NEEDS_CSUM`] flag is set but [`VIRTIO_NET_HDR_GSO_NONE`]
1854/// is the GSO type.
1855pub fn gso_none_checksum(in_buf: &mut [u8], csum_start: u16, csum_offset: u16) {
1856 let csum_at = (csum_start + csum_offset) as usize;
1857 // The initial value at the checksum offset should be summed with the
1858 // checksum we compute. This is typically the pseudo-header checksum.
1859 let initial = BigEndian::read_u16(&in_buf[csum_at..]);
1860 in_buf[csum_at] = 0;
1861 in_buf[csum_at + 1] = 0;
1862 let computed_checksum = checksum(&in_buf[csum_start as usize..], initial as u64);
1863 BigEndian::write_u16(&mut in_buf[csum_at..], !computed_checksum);
1864}
1865
1866/// Generic Receive Offload (GRO) table for managing packet coalescing.
1867///
1868/// This structure maintains the state needed to coalesce multiple received packets
1869/// into larger segments, reducing per-packet processing overhead. It combines both
1870/// TCP and UDP GRO capabilities.
1871///
1872/// # Purpose
1873///
1874/// When receiving many small packets of the same flow, GRO can combine them into
1875/// fewer, larger packets. This provides significant performance benefits:
1876///
1877/// - Reduces the number of packets passed to the application
1878/// - Fewer context switches and system calls
1879/// - Better cache utilization
1880/// - Lower CPU usage per gigabit of traffic
1881///
1882/// # Usage
1883///
1884/// Create a `GROTable` and reuse it across multiple `recv_multiple` calls:
1885///
1886/// ```no_run
1887/// # #[cfg(target_os = "linux")]
1888/// # {
1889/// use tun_rs::{DeviceBuilder, GROTable, IDEAL_BATCH_SIZE, VIRTIO_NET_HDR_LEN};
1890///
1891/// let dev = DeviceBuilder::new()
1892/// .offload(true)
1893/// .ipv4("10.0.0.1", 24, None)
1894/// .build_sync()?;
1895///
1896/// let mut gro_table = GROTable::default();
1897/// let mut original_buffer = vec![0; VIRTIO_NET_HDR_LEN + 65535];
1898/// let mut bufs = vec![vec![0u8; 1500]; IDEAL_BATCH_SIZE];
1899/// let mut sizes = vec![0; IDEAL_BATCH_SIZE];
1900///
1901/// loop {
1902/// let num = dev.recv_multiple(&mut original_buffer, &mut bufs, &mut sizes, 0)?;
1903///
1904/// // GRO table is automatically used by recv_multiple
1905/// // to coalesce packets
1906/// for i in 0..num {
1907/// println!("Packet: {} bytes", sizes[i]);
1908/// }
1909/// }
1910/// # }
1911/// # Ok::<(), std::io::Error>(())
1912/// ```
1913///
1914/// # Fields
1915///
1916/// - `tcp_gro_table`: State for TCP packet coalescing
1917/// - `udp_gro_table`: State for UDP packet coalescing (if supported by kernel)
1918/// - `to_write`: Internal buffer tracking which packets to emit
1919///
1920/// # Performance
1921///
1922/// The GRO table maintains internal state across calls, including:
1923/// - Hash map of active flows (preallocated for [`IDEAL_BATCH_SIZE`] flows)
1924/// - Memory pools to reduce allocations
1925/// - Per-flow coalescing state
1926///
1927/// Typical coalescing ratios:
1928/// - TCP bulk transfers: 5-20 packets coalesced into 1
1929/// - UDP: 2-5 packets coalesced into 1
1930/// - Interactive traffic: minimal coalescing (preserves latency)
1931///
1932/// # Thread Safety
1933///
1934/// `GROTable` is not thread-safe. Use one instance per thread or protect with a mutex.
1935#[derive(Default)]
1936pub struct GROTable {
1937 pub(crate) to_write: Vec<usize>,
1938 pub(crate) tcp_gro_table: TcpGROTable,
1939 pub(crate) udp_gro_table: UdpGROTable,
1940}
1941
1942impl GROTable {
1943 pub fn new() -> GROTable {
1944 GROTable {
1945 to_write: Vec::with_capacity(IDEAL_BATCH_SIZE),
1946 tcp_gro_table: TcpGROTable::new(),
1947 udp_gro_table: UdpGROTable::new(),
1948 }
1949 }
1950 pub(crate) fn reset(&mut self) {
1951 self.to_write.clear();
1952 self.tcp_gro_table.reset();
1953 self.udp_gro_table.reset();
1954 }
1955}
1956
/// A trait for buffers that can be expanded and resized for offload operations.
///
/// This trait extends basic buffer operations (`AsRef<[u8]>` and `AsMut<[u8]>`)
/// with methods needed for efficient packet processing with GRO/GSO offload support.
/// It allows buffers to grow dynamically as needed during packet coalescing and
/// segmentation operations.
///
/// # Required Methods
///
/// - `buf_capacity()` - Returns the current capacity of the buffer
/// - `buf_resize()` - Resizes the buffer to a new length, filling with a value
/// - `buf_extend_from_slice()` - Extends the buffer with data from a slice
///
/// The three methods mirror `Vec::capacity`, `Vec::resize`, and
/// `Vec::extend_from_slice` respectively, so implementing the trait for a
/// `Vec`-like buffer type is a direct delegation.
///
/// # Implementations
///
/// This trait is implemented for:
/// - `BytesMut` - The primary buffer type for async operations
/// - `&mut BytesMut` - Mutable reference to BytesMut
/// - `Vec<u8>` - Standard Rust vector
/// - `&mut Vec<u8>` - Mutable reference to Vec
///
/// # Example
///
/// ```no_run
/// # #[cfg(target_os = "linux")]
/// # {
/// use bytes::BytesMut;
/// use tun_rs::ExpandBuffer;
///
/// let mut buffer = BytesMut::with_capacity(1500);
/// buffer.buf_resize(20, 0); // Resize to 20 bytes, filled with zeros
/// buffer.buf_extend_from_slice(b"packet data"); // Append data
/// assert!(buffer.buf_capacity() >= buffer.len());
/// # }
/// ```
pub trait ExpandBuffer: AsRef<[u8]> + AsMut<[u8]> {
    /// Returns the current capacity of the buffer in bytes.
    ///
    /// The capacity is the total amount of memory allocated, which may be
    /// greater than the current length of the buffer.
    fn buf_capacity(&self) -> usize;

    /// Resizes the buffer to the specified length, filling new space with the given value.
    ///
    /// If `new_len` is greater than the current length, the buffer is extended
    /// and new bytes are initialized to `value`. If `new_len` is less than the
    /// current length, the buffer is truncated.
    ///
    /// # Arguments
    ///
    /// * `new_len` - The new length of the buffer
    /// * `value` - The byte value to fill any new space with
    fn buf_resize(&mut self, new_len: usize, value: u8);

    /// Extends the buffer by appending data from a slice.
    ///
    /// This method appends all bytes from `src` to the end of the buffer,
    /// growing the buffer as necessary.
    ///
    /// # Arguments
    ///
    /// * `src` - The slice of bytes to append to the buffer
    fn buf_extend_from_slice(&mut self, src: &[u8]);
}
2021
2022impl ExpandBuffer for BytesMut {
2023 fn buf_capacity(&self) -> usize {
2024 self.capacity()
2025 }
2026
2027 fn buf_resize(&mut self, new_len: usize, value: u8) {
2028 self.resize(new_len, value)
2029 }
2030
2031 fn buf_extend_from_slice(&mut self, extend: &[u8]) {
2032 self.extend_from_slice(extend)
2033 }
2034}
2035
2036impl ExpandBuffer for &mut BytesMut {
2037 fn buf_capacity(&self) -> usize {
2038 self.capacity()
2039 }
2040 fn buf_resize(&mut self, new_len: usize, value: u8) {
2041 self.resize(new_len, value)
2042 }
2043
2044 fn buf_extend_from_slice(&mut self, extend: &[u8]) {
2045 self.extend_from_slice(extend)
2046 }
2047}
2048impl ExpandBuffer for Vec<u8> {
2049 fn buf_capacity(&self) -> usize {
2050 self.capacity()
2051 }
2052
2053 fn buf_resize(&mut self, new_len: usize, value: u8) {
2054 self.resize(new_len, value)
2055 }
2056
2057 fn buf_extend_from_slice(&mut self, extend: &[u8]) {
2058 self.extend_from_slice(extend)
2059 }
2060}
2061impl ExpandBuffer for &mut Vec<u8> {
2062 fn buf_capacity(&self) -> usize {
2063 self.capacity()
2064 }
2065
2066 fn buf_resize(&mut self, new_len: usize, value: u8) {
2067 self.resize(new_len, value)
2068 }
2069
2070 fn buf_extend_from_slice(&mut self, extend: &[u8]) {
2071 self.extend_from_slice(extend)
2072 }
2073}