Skip to main content

turmoil_net/kernel/
mod.rs

1//! Per-host network stack.
2//!
3//! The [`Kernel`] owns one host's socket table, packet queues, and
4//! interface configuration. It does not route between hosts — that's a
5//! concern for the transport layer above.
6//!
7//! Mutability and thread-safety aren't yet decided — the kernel is a
8//! plain struct with `&mut self` on mutating methods. Sharing and
9//! interior mutability (`RefCell`, `Arc<Mutex<_>>`, thread-local, etc.)
10//! will be layered on later once the transport story is clearer.
11
12use std::collections::VecDeque;
13use std::io::{Error, ErrorKind};
14use std::net::{IpAddr, SocketAddr};
15use std::task::{Context, Poll};
16
17use tokio::io::ReadBuf;
18
19use crate::kernel::socket::{BindKey, SocketTable, DEFAULT_RECV_BUF_CAP, DEFAULT_SEND_BUF_CAP};
20
21mod packet;
22mod socket;
23mod tcp;
24mod udp;
25mod uds;
26
27// for shims
28pub use socket::{Addr, Domain, Fd, SocketOption, SocketOptionKind, Type};
29// for netstat
30pub use socket::{ListenState, Socket, Tcb, TcpState};
31// for rules
32pub use packet::{Packet, TcpFlags, TcpSegment, Transport, UdpDatagram};
33
/// Default MTU for non-loopback traffic (bytes). Matches standard Ethernet.
/// Override per host via [`KernelConfig::mtu`].
pub const DEFAULT_MTU: u32 = 1500;
/// Default MTU for loopback (bytes). Matches Linux `lo`.
/// Override per host via [`KernelConfig::loopback_mtu`].
pub const DEFAULT_LOOPBACK_MTU: u32 = 65536;
/// Default backlog for `TcpListener::bind` when the caller doesn't
/// specify one. Mirrors Linux's `SOMAXCONN`.
pub const DEFAULT_BACKLOG: usize = 1024;

/// Default number of egress passes without ACK progress before TCP
/// retransmits. Real Linux uses a time-based RTO; we count egress
/// ticks to stay harness-agnostic (harnesses with fixed-width ticks
/// translate this to sim time; interleaving harnesses see it as a
/// quantum count). Three is small enough that tests don't wait
/// forever, large enough that a reliable loopback doesn't spuriously
/// retransmit. Override via [`KernelConfig::retx_threshold`].
pub const DEFAULT_RETX_THRESHOLD: u32 = 3;

/// Default retransmit attempts before a TCP connection is aborted
/// with `ConnectionReset`. Roughly analogous to Linux's
/// `tcp_retries2` (default 15) but tuned for count-based retx — we
/// don't need the long tail, tests want to observe failure quickly.
/// Override via [`KernelConfig::retx_max`].
pub const DEFAULT_RETX_MAX: u32 = 5;
56
/// Tunable limits for a host's network stack (MTU, buffer caps,
/// backlog, retransmit thresholds). Constructed via [`Self::default`]
/// and adjusted with the builder-style setters. Pass to
/// [`Net::with_config`](crate::Net::with_config) to apply to every
/// host added afterwards.
#[derive(Debug, Clone)]
pub struct KernelConfig {
    /// MTU for non-loopback traffic, in bytes ([`DEFAULT_MTU`]).
    pub mtu: u32,
    /// MTU for loopback traffic, in bytes ([`DEFAULT_LOOPBACK_MTU`]).
    pub loopback_mtu: u32,
    /// Per-socket send-buffer capacity (`DEFAULT_SEND_BUF_CAP`).
    pub send_buf_cap: usize,
    /// Per-socket receive-buffer capacity (`DEFAULT_RECV_BUF_CAP`).
    pub recv_buf_cap: usize,
    /// Listen backlog when the caller doesn't pass one
    /// ([`DEFAULT_BACKLOG`]).
    pub default_backlog: usize,
    /// Egress passes without ACK progress before TCP retransmits
    /// ([`DEFAULT_RETX_THRESHOLD`]).
    pub retx_threshold: u32,
    /// Retransmit attempts before the connection aborts
    /// ([`DEFAULT_RETX_MAX`]).
    pub retx_max: u32,
}
72
73impl Default for KernelConfig {
74    fn default() -> Self {
75        Self {
76            mtu: DEFAULT_MTU,
77            loopback_mtu: DEFAULT_LOOPBACK_MTU,
78            send_buf_cap: DEFAULT_SEND_BUF_CAP,
79            recv_buf_cap: DEFAULT_RECV_BUF_CAP,
80            default_backlog: DEFAULT_BACKLOG,
81            retx_threshold: DEFAULT_RETX_THRESHOLD,
82            retx_max: DEFAULT_RETX_MAX,
83        }
84    }
85}
86
87impl KernelConfig {
88    pub fn mtu(mut self, v: u32) -> Self {
89        self.mtu = v;
90        self
91    }
92    pub fn loopback_mtu(mut self, v: u32) -> Self {
93        self.loopback_mtu = v;
94        self
95    }
96    pub fn send_buf_cap(mut self, v: usize) -> Self {
97        self.send_buf_cap = v;
98        self
99    }
100    pub fn recv_buf_cap(mut self, v: usize) -> Self {
101        self.recv_buf_cap = v;
102        self
103    }
104    pub fn default_backlog(mut self, v: usize) -> Self {
105        self.default_backlog = v;
106        self
107    }
108    pub fn retx_threshold(mut self, v: u32) -> Self {
109        self.retx_threshold = v;
110        self
111    }
112    pub fn retx_max(mut self, v: u32) -> Self {
113        self.retx_max = v;
114        self
115    }
116}
117
/// Linux errno for `EAFNOSUPPORT` (97 on Linux). Used where we want
/// `kind() == Uncategorized` to match what `tokio::net` surfaces for
/// socket-family mismatches (the `Uncategorized` variant itself is
/// unstable). `from_raw_os_error` on any platform we care about maps
/// this to `Uncategorized`.
const EAFNOSUPPORT: i32 = 97;

/// Linux errno for `EMSGSIZE` (90 on Linux). Surfaced by UDP `sendto`
/// when the payload exceeds the link MTU under `IP_PMTUDISC_DO`
/// semantics (real Linux; `ErrorKind::FileTooLarge` is unstable).
/// `pub(crate)` so the `udp` module can raise it.
pub(crate) const EMSGSIZE: i32 = 90;
129
/// A per-host network stack.
///
/// Owns the socket table and the inbound/outbound packet queues. Does
/// not know how packets reach other hosts — that's the transport
/// layer's job.
#[derive(Debug)]
pub struct Kernel {
    /// Fd → socket state, plus the port/binding table.
    sockets: SocketTable,
    /// Extra local addresses added via [`Self::add_address`]; loopback
    /// is implicit and never stored here.
    addresses: Vec<IpAddr>,
    // Per-host copies of the `KernelConfig` knobs — see that type for
    // field meanings. `pub(crate)` so the protocol modules can read
    // them directly.
    pub(crate) mtu: u32,
    pub(crate) loopback_mtu: u32,
    pub(crate) send_buf_cap: usize,
    pub(crate) recv_buf_cap: usize,
    pub(crate) default_backlog: usize,
    pub(crate) retx_threshold: u32,
    pub(crate) retx_max: u32,
    /// Packets queued by `poll_send_*` awaiting `egress()`.
    outbound: VecDeque<Packet>,
    /// Monotonic TCP initial-sequence-number source. Deterministic by
    /// design — real kernels randomize.
    tcp_isn: u32,
}
152
153impl Kernel {
154    /// Construct a fresh kernel with default settings and no
155    /// configured addresses beyond implicit loopback.
156    pub fn new() -> Self {
157        Self::with_config(KernelConfig::default())
158    }
159
160    pub fn with_config(cfg: KernelConfig) -> Self {
161        Self {
162            sockets: SocketTable::new(),
163            addresses: Vec::new(),
164            mtu: cfg.mtu,
165            loopback_mtu: cfg.loopback_mtu,
166            send_buf_cap: cfg.send_buf_cap,
167            recv_buf_cap: cfg.recv_buf_cap,
168            default_backlog: cfg.default_backlog,
169            retx_threshold: cfg.retx_threshold,
170            retx_max: cfg.retx_max,
171            outbound: VecDeque::new(),
172            tcp_isn: 0x0100_0000,
173        }
174    }
175
176    /// Create a new socket in this kernel's socket table.
177    fn mk_socket(&mut self, domain: Domain, ty: Type) -> Fd {
178        self.sockets.insert(Socket::new(domain, ty))
179    }
180
181    /// `socket(2)`. Creates an unbound socket; useful for
182    /// `connect`-without-`bind` flows like `TcpStream::connect`.
183    pub fn open(&mut self, domain: Domain, ty: Type) -> Fd {
184        self.mk_socket(domain, ty)
185    }
186
187    /// `close(2)`. Removes the entry from the socket table along with
188    /// any binding. For an active TCP connection, decides between a
189    /// clean close (silent) and an abortive close (emits RST):
190    ///
191    /// - RST if the app drops with unread bytes still queued in
192    ///   `recv_buf`, or with a live write side that never sent FIN.
193    ///   Matches Linux behavior and the `tokio::net::TcpStream` Drop
194    ///   semantics the upstream crate recently fixed.
195    /// - Silent close otherwise — the connection already reached a
196    ///   clean terminal state via FIN exchange.
197    ///
198    /// Idempotent on an already-closed fd.
199    pub fn close(&mut self, fd: Fd) {
200        if tcp::on_close(self, fd) {
201            self.sockets.remove(fd);
202        }
203        // else: lingering — `reap_closed` at the end of each egress
204        // pass will clean up once the TCP state reaches `Closed`.
205    }
206
    /// Iterate every socket in the table, in insertion order.
    ///
    /// Read-only view over (fd, socket) pairs — the introspection
    /// counterpart to the netstat-oriented re-exports above.
    pub fn sockets(&self) -> impl Iterator<Item = (Fd, &Socket)> {
        self.sockets.iter()
    }
211
212    pub(crate) fn lookup(&self, fd: Fd) -> std::io::Result<&Socket> {
213        self.sockets
214            .get(fd)
215            .ok_or_else(|| Error::from(ErrorKind::NotFound))
216    }
217
218    pub(crate) fn lookup_mut(&mut self, fd: Fd) -> std::io::Result<&mut Socket> {
219        self.sockets
220            .get_mut(fd)
221            .ok_or_else(|| Error::from(ErrorKind::NotFound))
222    }
223
224    /// Configure an additional local address for this host. Loopback
225    /// (`127.0.0.1`, `::1`) is always implicit.
226    pub fn add_address(&mut self, addr: IpAddr) {
227        if !self.addresses.contains(&addr) {
228            self.addresses.push(addr);
229        }
230    }
231
232    /// `true` if `addr` is one of this host's local addresses
233    /// (including implicit loopback).
234    pub fn is_local(&self, addr: IpAddr) -> bool {
235        if addr.is_loopback() {
236            return true;
237        }
238        self.addresses.contains(&addr)
239    }
240
241    /// `socket(2)` + `bind(2)`. Creates a socket of the given type,
242    /// assigns it `addr`, and returns the handle. Domain is inferred
243    /// from `addr`.
244    pub fn bind(&mut self, addr: &Addr, ty: Type) -> std::io::Result<Fd> {
245        let (domain, ip, port) = match addr {
246            Addr::Inet(sa) if sa.is_ipv4() => (Domain::Inet, sa.ip(), sa.port()),
247            Addr::Inet(sa) => (Domain::Inet6, sa.ip(), sa.port()),
248            Addr::Unix(_) => unimplemented!("AF_UNIX bind"),
249        };
250
251        // Non-wildcard IPs must be configured on this host.
252        if !ip.is_unspecified() && !self.is_local(ip) {
253            return Err(Error::from(ErrorKind::AddrNotAvailable));
254        }
255
256        // Pick a port if the caller asked for one.
257        let port = if port == 0 {
258            self.sockets
259                .allocate_port(domain, ty)
260                .ok_or_else(|| Error::from(ErrorKind::AddrInUse))?
261        } else {
262            port
263        };
264
265        let key = BindKey {
266            domain,
267            ty,
268            local_addr: ip,
269            local_port: port,
270        };
271
272        // Walk every existing binding on the same (domain, ty, port) to
273        // evaluate conflicts. Two scenarios:
274        //
275        // 1. Exact-tuple match. Allowed only if every socket in the
276        //    existing group AND the new socket has SO_REUSEPORT set.
277        // 2. Wildcard-vs-specific mismatch (one side bound to 0.0.0.0/::,
278        //    the other to a concrete IP). Allowed only if both sides
279        //    have SO_REUSEADDR set.
280        //
281        // Distinct concrete IPs on the same port never conflict on
282        // Linux — those just coexist regardless of options.
283        //
284        // Socket defaults (reuse_addr, reuse_port) are both `false`, so
285        // a freshly created socket never satisfies either reuse
286        // condition — that means any overlap at this step is a
287        // conflict and we can bail before creating the table entry.
288        for (existing, _ids) in self
289            .sockets
290            .bindings_on_port(key.domain, key.ty, key.local_port)
291        {
292            if existing.local_addr == key.local_addr
293                || existing.local_addr.is_unspecified()
294                || key.local_addr.is_unspecified()
295            {
296                return Err(Error::from(ErrorKind::AddrInUse));
297            }
298        }
299
300        let fd = self.mk_socket(domain, ty);
301        self.sockets.insert_binding(key.clone(), fd);
302        self.lookup_mut(fd).expect("socket entry present").bound = Some(key);
303        Ok(fd)
304    }
305
306    /// `setsockopt(2)`. Most variants are still unimplemented.
307    pub fn set_option(&mut self, fd: Fd, opt: SocketOption) -> std::io::Result<()> {
308        let st = self.lookup_mut(fd)?;
309        match opt {
310            SocketOption::Broadcast(v) => st.broadcast = v,
311            SocketOption::IpTtl(v) => st.ttl = v,
312            SocketOption::TcpNoDelay(v) => st.tcp_nodelay = v,
313            _ => unimplemented!("set_option {:?}", opt),
314        }
315        Ok(())
316    }
317
318    /// `getsockopt(2)`. Most variants are still unimplemented.
319    pub fn get_option(&self, fd: Fd, kind: SocketOptionKind) -> std::io::Result<SocketOption> {
320        let st = self.lookup(fd)?;
321        Ok(match kind {
322            SocketOptionKind::Broadcast => SocketOption::Broadcast(st.broadcast),
323            SocketOptionKind::IpTtl => SocketOption::IpTtl(st.ttl),
324            SocketOptionKind::TcpNoDelay => SocketOption::TcpNoDelay(st.tcp_nodelay),
325            _ => unimplemented!("get_option {:?}", kind),
326        })
327    }
328
329    /// `getsockname(2)`.
330    pub fn local_addr(&self, fd: Fd) -> std::io::Result<Addr> {
331        let key = self
332            .lookup(fd)?
333            .bound
334            .as_ref()
335            .ok_or_else(|| Error::from(ErrorKind::InvalidInput))?;
336        Ok(Addr::Inet(SocketAddr::new(key.local_addr, key.local_port)))
337    }
338
339    /// `getpeername(2)`. Returns `NotConnected` if `connect` was never
340    /// called.
341    pub fn peer_addr(&self, fd: Fd) -> std::io::Result<Addr> {
342        self.lookup(fd)?
343            .peer
344            .clone()
345            .ok_or_else(|| Error::from(ErrorKind::NotConnected))
346    }
347
348    /// `sendto(2)` for UDP. Auto-binds an ephemeral local address if
349    /// `socket` was never explicitly bound, matching Linux semantics.
350    pub fn poll_send_to(
351        &mut self,
352        fd: Fd,
353        cx: &mut Context<'_>,
354        buf: &[u8],
355        dst: &Addr,
356    ) -> Poll<std::io::Result<usize>> {
357        let Addr::Inet(dst_sa) = dst else {
358            panic!("AF_UNIX not wired through poll_send_to");
359        };
360        let (ty, domain) = match self.lookup(fd) {
361            Ok(st) => (st.ty, st.domain),
362            Err(e) => return Poll::Ready(Err(e)),
363        };
364        assert_eq!(ty, Type::Dgram, "poll_send_to on non-Dgram fd");
365        match (domain, dst_sa) {
366            (Domain::Inet, SocketAddr::V4(_)) | (Domain::Inet6, SocketAddr::V6(_)) => {}
367            _ => return Poll::Ready(Err(Error::from_raw_os_error(EAFNOSUPPORT))),
368        }
369        udp::send_to(self, fd, cx, buf, dst_sa)
370    }
371
372    /// `recvfrom(2)`. Returns the peer address alongside the filled
373    /// buffer. Returns `Pending` and stores the waker when the
374    /// socket's recv queue is empty.
375    pub fn poll_recv_from(
376        &mut self,
377        fd: Fd,
378        cx: &mut Context<'_>,
379        buf: &mut ReadBuf<'_>,
380    ) -> Poll<std::io::Result<Addr>> {
381        let st = match self.lookup_mut(fd) {
382            Ok(st) => st,
383            Err(e) => return Poll::Ready(Err(e)),
384        };
385        assert_eq!(st.ty, Type::Dgram, "poll_recv_from on non-Dgram fd");
386        udp::recv_from(st, cx, buf)
387    }
388
389    /// `connect(2)`. UDP is eager (no handshake, always `Ready` on
390    /// first poll). TCP is real: first poll sends SYN and parks; later
391    /// polls finish once the handshake completes.
392    pub fn poll_connect(
393        &mut self,
394        fd: Fd,
395        cx: &mut Context<'_>,
396        addr: &Addr,
397    ) -> Poll<std::io::Result<()>> {
398        let Addr::Inet(peer) = addr else {
399            panic!("AF_UNIX not wired through connect");
400        };
401        let (domain, ty, is_bound) = match self.lookup(fd) {
402            Ok(st) => (st.domain, st.ty, st.bound.is_some()),
403            Err(e) => return Poll::Ready(Err(e)),
404        };
405        match (domain, peer) {
406            (Domain::Inet, SocketAddr::V4(_)) | (Domain::Inet6, SocketAddr::V6(_)) => {}
407            _ => return Poll::Ready(Err(Error::from_raw_os_error(EAFNOSUPPORT))),
408        }
409        match ty {
410            Type::Dgram => {
411                if !is_bound {
412                    if let Err(e) = udp::auto_bind(self, fd, domain, ty, peer.ip()) {
413                        return Poll::Ready(Err(e));
414                    }
415                }
416                self.lookup_mut(fd).expect("socket present").peer = Some(Addr::Inet(*peer));
417                Poll::Ready(Ok(()))
418            }
419            Type::Stream => tcp::poll_connect(self, fd, cx, domain, *peer, is_bound),
420            Type::SeqPacket => unimplemented!("SOCK_SEQPACKET connect"),
421        }
422    }
423
424    /// `listen(2)`. Flips a bound stream socket into passive mode with
425    /// the given backlog. Panics if called on a non-Stream or unbound
426    /// fd — the shim only calls this after a successful `bind`, so
427    /// either is an internal bug.
428    pub fn listen(&mut self, fd: Fd, backlog: usize) -> std::io::Result<()> {
429        let st = self.lookup_mut(fd)?;
430        assert_eq!(st.ty, Type::Stream, "listen on non-Stream fd");
431        assert!(st.bound.is_some(), "listen on unbound fd");
432        st.listen = Some(ListenState::new(backlog));
433        Ok(())
434    }
435
436    /// `accept(2)`. Pops a fully-established connection off the
437    /// listener's ready queue; parks the caller if the queue is empty.
438    /// Panics if called on a socket that isn't listening — structural
439    /// guarantee from the `TcpListener` shim.
440    pub fn poll_accept(
441        &mut self,
442        fd: Fd,
443        cx: &mut Context<'_>,
444    ) -> Poll<std::io::Result<(Fd, SocketAddr)>> {
445        let st = match self.lookup_mut(fd) {
446            Ok(st) => st,
447            Err(e) => return Poll::Ready(Err(e)),
448        };
449        let listen = st.listen.as_mut().expect("poll_accept on non-listener fd");
450        if let Some(child) = listen.ready.pop_front() {
451            let peer = self
452                .lookup(child)
453                .expect("accepted fd present")
454                .tcb
455                .as_ref()
456                .expect("accepted fd has TCB")
457                .peer;
458            return Poll::Ready(Ok((child, peer)));
459        }
460        if !listen.accept_wakers.iter().any(|w| w.will_wake(cx.waker())) {
461            listen.accept_wakers.push(cx.waker().clone());
462        }
463        Poll::Pending
464    }
465
466    /// `send(2)` for connected sockets. UDP: resolves the stored peer
467    /// and delegates to `sendto`. TCP: copies into `send_buf` and
468    /// lets `egress` segment and emit.
469    pub fn poll_send(
470        &mut self,
471        fd: Fd,
472        cx: &mut Context<'_>,
473        buf: &[u8],
474    ) -> Poll<std::io::Result<usize>> {
475        let (ty, peer) = match self.lookup(fd) {
476            Ok(st) => (st.ty, st.peer.clone()),
477            Err(e) => return Poll::Ready(Err(e)),
478        };
479        match ty {
480            Type::Dgram => {
481                let Some(peer) = peer else {
482                    return Poll::Ready(Err(Error::from(ErrorKind::NotConnected)));
483                };
484                let Addr::Inet(peer_sa) = peer else {
485                    panic!("UDP peer stored as Addr::Unix");
486                };
487                udp::send_to(self, fd, cx, buf, &peer_sa)
488            }
489            Type::Stream => tcp::poll_send(self, fd, cx, buf),
490            Type::SeqPacket => unimplemented!("SOCK_SEQPACKET poll_send"),
491        }
492    }
493
494    /// `recv(2)` for connected sockets. UDP: pops one datagram into
495    /// `buf`. TCP: drains from `recv_buf`.
496    pub fn poll_recv(
497        &mut self,
498        fd: Fd,
499        cx: &mut Context<'_>,
500        buf: &mut [u8],
501    ) -> Poll<std::io::Result<usize>> {
502        let ty = match self.lookup(fd) {
503            Ok(st) => st.ty,
504            Err(e) => return Poll::Ready(Err(e)),
505        };
506        match ty {
507            Type::Dgram => {
508                let st = self.lookup_mut(fd).expect("fd validated");
509                let mut rb = ReadBuf::new(buf);
510                match udp::recv(st, cx, &mut rb) {
511                    Poll::Ready(Ok(())) => Poll::Ready(Ok(rb.filled().len())),
512                    Poll::Ready(Err(e)) => Poll::Ready(Err(e)),
513                    Poll::Pending => Poll::Pending,
514                }
515            }
516            Type::Stream => tcp::poll_recv(self, fd, cx, buf),
517            Type::SeqPacket => unimplemented!("SOCK_SEQPACKET poll_recv"),
518        }
519    }
520
521    /// `shutdown(SHUT_WR)`. TCP-only in practice; UDP sockets have no
522    /// FIN to send. Kernel-level shutdown-read isn't exposed because
523    /// tokio's API doesn't carry the idea (see `TcpStream` docs).
524    pub fn poll_shutdown_write(
525        &mut self,
526        fd: Fd,
527        cx: &mut Context<'_>,
528    ) -> Poll<std::io::Result<()>> {
529        let ty = match self.lookup(fd) {
530            Ok(st) => st.ty,
531            Err(e) => return Poll::Ready(Err(e)),
532        };
533        match ty {
534            Type::Stream => tcp::poll_shutdown_write(self, fd, cx),
535            Type::Dgram | Type::SeqPacket => {
536                unimplemented!("poll_shutdown_write on non-Stream fd")
537            }
538        }
539    }
540
541    /// `recvfrom(2)` with `MSG_PEEK`. Like [`Self::poll_recv_from`] but
542    /// leaves the datagram in the recv queue.
543    pub fn poll_peek_from(
544        &mut self,
545        fd: Fd,
546        cx: &mut Context<'_>,
547        buf: &mut ReadBuf<'_>,
548    ) -> Poll<std::io::Result<Addr>> {
549        let st = match self.lookup_mut(fd) {
550            Ok(st) => st,
551            Err(e) => return Poll::Ready(Err(e)),
552        };
553        assert_eq!(st.ty, Type::Dgram, "poll_peek_from on non-Dgram fd");
554        udp::peek_from(st, cx, buf)
555    }
556
557    /// `recv(2)` with `MSG_PEEK` — connected-socket peek. UDP returns
558    /// the next datagram without consuming it; TCP returns buffered
559    /// bytes without draining.
560    pub fn poll_peek(
561        &mut self,
562        fd: Fd,
563        cx: &mut Context<'_>,
564        buf: &mut [u8],
565    ) -> Poll<std::io::Result<usize>> {
566        let ty = match self.lookup(fd) {
567            Ok(st) => st.ty,
568            Err(e) => return Poll::Ready(Err(e)),
569        };
570        match ty {
571            Type::Dgram => {
572                let st = self.lookup_mut(fd).expect("fd validated");
573                let mut rb = ReadBuf::new(buf);
574                match udp::peek_from(st, cx, &mut rb) {
575                    Poll::Ready(Ok(_)) => Poll::Ready(Ok(rb.filled().len())),
576                    Poll::Ready(Err(e)) => Poll::Ready(Err(e)),
577                    Poll::Pending => Poll::Pending,
578                }
579            }
580            Type::Stream => tcp::poll_peek(self, fd, cx, buf),
581            Type::SeqPacket => unimplemented!("SOCK_SEQPACKET poll_peek"),
582        }
583    }
584
585    /// Hand an inbound packet to the stack — dispatches to the socket
586    /// bound at the destination tuple, appending to its recv queue and
587    /// waking any pending receiver. Drops the packet if no matching
588    /// socket is bound.
589    pub fn deliver(&mut self, pkt: Packet) {
590        match pkt.payload.clone() {
591            Transport::Udp(d) => udp::deliver(self, &pkt, &d),
592            Transport::Tcp(s) => tcp::deliver(self, &pkt, &s),
593        }
594    }
595
596    /// Drain packets the stack has produced since the last call,
597    /// appending those leaving this host to `out`. Passing the buffer
598    /// in lets callers amortize the allocation across ticks.
599    ///
600    /// Runs TCP segmentation first (draining each established socket's
601    /// `send_buf` into MSS-sized segments on `outbound`), then drains
602    /// `outbound`. Loopback packets fold back through
603    /// [`deliver`](Self::deliver) inline and never reach `out`.
604    ///
605    /// Loops until a full pass produces no new outbound packets, so
606    /// that an ACK folded through `deliver` can open a window and let
607    /// the next segmentation pass pick up queued bytes.
608    pub fn egress(&mut self, out: &mut Vec<Packet>) {
609        // Count-based retx check runs once per egress, before
610        // segmentation — if it rewinds snd_nxt for a socket, this
611        // call's segment_all pass picks up the re-emission.
612        tcp::check_retx(self);
613        loop {
614            tcp::segment_all(self);
615            if self.outbound.is_empty() {
616                break;
617            }
618            let drained: Vec<_> = std::mem::take(&mut self.outbound).into_iter().collect();
619            for pkt in drained {
620                if self.is_local(pkt.dst) {
621                    self.deliver(pkt);
622                } else {
623                    out.push(pkt);
624                }
625            }
626        }
627        // Reap any `fd_closed` sockets that have finished closing.
628        tcp::reap_closed(self);
629    }
630}
631
632impl Default for Kernel {
633    fn default() -> Self {
634        Self::new()
635    }
636}
637
#[cfg(test)]
mod tests {
    use std::io::ErrorKind;
    use std::net::SocketAddr;

    use super::*;

    /// Shorthand: parse an `"ip:port"` literal into an [`Addr::Inet`].
    /// Panics on malformed input — fine in tests.
    fn inet(s: &str) -> Addr {
        Addr::Inet(s.parse().unwrap())
    }

    #[test]
    fn loopback_is_implicit_local() {
        let mut k = Kernel::new();
        assert!(k.is_local("127.0.0.1".parse().unwrap()));
        assert!(k.is_local("::1".parse().unwrap()));
        assert!(!k.is_local("10.0.0.1".parse().unwrap()));
        k.add_address("10.0.0.1".parse().unwrap());
        assert!(k.is_local("10.0.0.1".parse().unwrap()));
    }

    #[test]
    fn bind_records_local_addr() {
        let mut k = Kernel::new();
        let s = k.bind(&inet("127.0.0.1:5000"), Type::Dgram).unwrap();
        assert_eq!(k.local_addr(s).unwrap(), inet("127.0.0.1:5000"));
    }

    #[test]
    fn bind_port_zero_allocates_ephemeral() {
        let mut k = Kernel::new();
        let s = k.bind(&inet("127.0.0.1:0"), Type::Dgram).unwrap();
        let Addr::Inet(SocketAddr::V4(v4)) = k.local_addr(s).unwrap() else {
            panic!("expected v4")
        };
        // IANA ephemeral range.
        assert!((49152..=65535).contains(&v4.port()));
    }

    #[test]
    fn bind_conflict_is_addr_in_use() {
        let mut k = Kernel::new();
        k.bind(&inet("127.0.0.1:5000"), Type::Dgram).unwrap();
        let err = k.bind(&inet("127.0.0.1:5000"), Type::Dgram).unwrap_err();
        assert_eq!(err.kind(), ErrorKind::AddrInUse);
    }

    #[test]
    fn bind_different_protocols_can_share_port() {
        let mut k = Kernel::new();
        // TCP and UDP live in separate port spaces.
        k.bind(&inet("127.0.0.1:5000"), Type::Dgram).unwrap();
        k.bind(&inet("127.0.0.1:5000"), Type::Stream).unwrap();
    }

    #[test]
    fn bind_rejects_non_local_addr() {
        let mut k = Kernel::new();
        let err = k.bind(&inet("10.0.0.1:5000"), Type::Dgram).unwrap_err();
        assert_eq!(err.kind(), ErrorKind::AddrNotAvailable);
    }

    #[test]
    fn bind_wildcard_addr_is_allowed() {
        let mut k = Kernel::new();
        k.bind(&inet("0.0.0.0:5000"), Type::Dgram).unwrap();
    }

    #[test]
    fn distinct_specific_ips_coexist() {
        // Linux: two sockets bound to different specific IPs on the
        // same port never conflict.
        let mut k = Kernel::new();
        k.add_address("10.0.0.1".parse().unwrap());
        k.add_address("10.0.0.2".parse().unwrap());
        k.bind(&inet("10.0.0.1:5000"), Type::Dgram).unwrap();
        k.bind(&inet("10.0.0.2:5000"), Type::Dgram).unwrap();
    }

    // Helpers for driving poll_* directly. Kernel-level tests don't
    // have a tokio runtime — noop waker is enough because we never
    // expect `Pending` from these specific syscalls (send to a
    // configured local IP with a valid target succeeds immediately).
    fn noop_cx() -> Context<'static> {
        use std::task::Waker;
        Context::from_waker(Waker::noop())
    }

    #[test]
    fn udp_broadcast_send_requires_broadcast_option() {
        // Send to a broadcast destination fails with PermissionDenied
        // unless SO_BROADCAST is set — Linux behavior.
        let mut k = Kernel::new();
        k.add_address("10.0.0.1".parse().unwrap());
        let s = k.bind(&inet("10.0.0.1:0"), Type::Dgram).unwrap();

        let dst = Addr::Inet("255.255.255.255:9000".parse().unwrap());
        let Poll::Ready(Err(e)) = k.poll_send_to(s, &mut noop_cx(), b"x", &dst) else {
            panic!("expected broadcast rejection");
        };
        assert_eq!(e.kind(), ErrorKind::PermissionDenied);

        k.set_option(s, SocketOption::Broadcast(true)).unwrap();
        let Poll::Ready(Ok(_)) = k.poll_send_to(s, &mut noop_cx(), b"x", &dst) else {
            panic!("broadcast send should succeed with SO_BROADCAST");
        };
    }

    #[test]
    fn bind_zero_avoids_ports_taken_on_other_ips() {
        // Linux: `:0` picks a port not in use at any IP for the same
        // (domain, ty). The allocator cursor starts at 49152, so that
        // would be the first port handed out. Squatting it on 10.0.0.1
        // should force the next `:0` bind to pick a different port
        // rather than colliding.
        let mut k = Kernel::new();
        k.add_address("10.0.0.1".parse().unwrap());

        k.bind(&inet("10.0.0.1:49152"), Type::Dgram).unwrap();
        let s = k.bind(&inet("127.0.0.1:0"), Type::Dgram).unwrap();
        let Addr::Inet(sa) = k.local_addr(s).unwrap() else {
            panic!("v4 expected")
        };
        assert_ne!(sa.port(), 49152);
    }

    #[test]
    fn broadcast_option_roundtrips() {
        let mut k = Kernel::new();
        let s = k.bind(&inet("127.0.0.1:0"), Type::Dgram).unwrap();
        // Default is off, matching the socket defaults noted in `bind`.
        assert_eq!(
            k.get_option(s, SocketOptionKind::Broadcast).unwrap(),
            SocketOption::Broadcast(false)
        );
        k.set_option(s, SocketOption::Broadcast(true)).unwrap();
        assert_eq!(
            k.get_option(s, SocketOptionKind::Broadcast).unwrap(),
            SocketOption::Broadcast(true)
        );
    }
}
773            k.get_option(s, SocketOptionKind::Broadcast).unwrap(),
774            SocketOption::Broadcast(true)
775        );
776    }
777}