1use std::collections::VecDeque;
13use std::io::{Error, ErrorKind};
14use std::net::{IpAddr, SocketAddr};
15use std::task::{Context, Poll};
16
17use tokio::io::ReadBuf;
18
19use crate::kernel::socket::{BindKey, SocketTable, DEFAULT_RECV_BUF_CAP, DEFAULT_SEND_BUF_CAP};
20
21mod packet;
22mod socket;
23mod tcp;
24mod udp;
25mod uds;
26
27pub use socket::{Addr, Domain, Fd, SocketOption, SocketOptionKind, Type};
29pub use socket::{ListenState, Socket, Tcb, TcpState};
31pub use packet::{Packet, TcpFlags, TcpSegment, Transport, UdpDatagram};
33
/// Value `KernelConfig::default()` uses for the `mtu` field.
pub const DEFAULT_MTU: u32 = 1500;
/// Value `KernelConfig::default()` uses for the `loopback_mtu` field.
pub const DEFAULT_LOOPBACK_MTU: u32 = 65536;
/// Value `KernelConfig::default()` uses for the `default_backlog` field.
pub const DEFAULT_BACKLOG: usize = 1024;

/// Value `KernelConfig::default()` uses for the `retx_threshold` field.
// NOTE(review): how the threshold is interpreted is decided by the `tcp`
// module, which is not visible from this file.
pub const DEFAULT_RETX_THRESHOLD: u32 = 3;

/// Value `KernelConfig::default()` uses for the `retx_max` field.
// NOTE(review): presumably an upper bound on retransmission attempts — confirm
// against the `tcp` module.
pub const DEFAULT_RETX_MAX: u32 = 5;
56
/// Tunables handed to `Kernel::with_config`.
///
/// Every field is copied verbatim into the `Kernel` field of the same name.
/// `KernelConfig::default()` fills each field from the matching `DEFAULT_*`
/// constant, and the by-value setter methods allow chained overrides.
#[derive(Debug, Clone)]
pub struct KernelConfig {
    /// Copied into `Kernel::mtu`; defaults to `DEFAULT_MTU`.
    pub mtu: u32,
    /// Copied into `Kernel::loopback_mtu`; defaults to `DEFAULT_LOOPBACK_MTU`.
    pub loopback_mtu: u32,
    /// Copied into `Kernel::send_buf_cap`; defaults to `DEFAULT_SEND_BUF_CAP`.
    // NOTE(review): presumably a per-socket send-buffer capacity in bytes — confirm.
    pub send_buf_cap: usize,
    /// Copied into `Kernel::recv_buf_cap`; defaults to `DEFAULT_RECV_BUF_CAP`.
    // NOTE(review): presumably a per-socket receive-buffer capacity in bytes — confirm.
    pub recv_buf_cap: usize,
    /// Copied into `Kernel::default_backlog`; defaults to `DEFAULT_BACKLOG`.
    pub default_backlog: usize,
    /// Copied into `Kernel::retx_threshold`; defaults to `DEFAULT_RETX_THRESHOLD`.
    pub retx_threshold: u32,
    /// Copied into `Kernel::retx_max`; defaults to `DEFAULT_RETX_MAX`.
    pub retx_max: u32,
}
72
73impl Default for KernelConfig {
74 fn default() -> Self {
75 Self {
76 mtu: DEFAULT_MTU,
77 loopback_mtu: DEFAULT_LOOPBACK_MTU,
78 send_buf_cap: DEFAULT_SEND_BUF_CAP,
79 recv_buf_cap: DEFAULT_RECV_BUF_CAP,
80 default_backlog: DEFAULT_BACKLOG,
81 retx_threshold: DEFAULT_RETX_THRESHOLD,
82 retx_max: DEFAULT_RETX_MAX,
83 }
84 }
85}
86
87impl KernelConfig {
88 pub fn mtu(mut self, v: u32) -> Self {
89 self.mtu = v;
90 self
91 }
92 pub fn loopback_mtu(mut self, v: u32) -> Self {
93 self.loopback_mtu = v;
94 self
95 }
96 pub fn send_buf_cap(mut self, v: usize) -> Self {
97 self.send_buf_cap = v;
98 self
99 }
100 pub fn recv_buf_cap(mut self, v: usize) -> Self {
101 self.recv_buf_cap = v;
102 self
103 }
104 pub fn default_backlog(mut self, v: usize) -> Self {
105 self.default_backlog = v;
106 self
107 }
108 pub fn retx_threshold(mut self, v: u32) -> Self {
109 self.retx_threshold = v;
110 self
111 }
112 pub fn retx_max(mut self, v: u32) -> Self {
113 self.retx_max = v;
114 self
115 }
116}
117
// NOTE(review): these codes are passed to `Error::from_raw_os_error`, which
// interprets them with the host's errno table. The numbers below are the
// Linux values; on other hosts the same numbers may map to different errors —
// confirm this is intended.

/// Raw OS error code returned when a socket's domain and the family of the
/// address it is given disagree (Linux `EAFNOSUPPORT`).
const EAFNOSUPPORT: i32 = 97;

/// Raw OS error code for Linux `EMSGSIZE` ("message too long").
// NOTE(review): not referenced by the code visible in this file; presumably
// used by the datagram send path — confirm.
pub(crate) const EMSGSIZE: i32 = 90;
129
/// Socket-layer state: the socket table, the IP addresses this instance owns,
/// the tunables copied from `KernelConfig`, and the queue of packets waiting
/// to be flushed by `egress`.
#[derive(Debug)]
pub struct Kernel {
    /// All sockets, addressed by `Fd`.
    sockets: SocketTable,
    /// Addresses registered through `add_address`. Loopback addresses are
    /// treated as local by `is_local` without being listed here.
    addresses: Vec<IpAddr>,
    /// Copied from `KernelConfig::mtu`.
    pub(crate) mtu: u32,
    /// Copied from `KernelConfig::loopback_mtu`.
    pub(crate) loopback_mtu: u32,
    /// Copied from `KernelConfig::send_buf_cap`.
    pub(crate) send_buf_cap: usize,
    /// Copied from `KernelConfig::recv_buf_cap`.
    pub(crate) recv_buf_cap: usize,
    /// Copied from `KernelConfig::default_backlog`.
    pub(crate) default_backlog: usize,
    /// Copied from `KernelConfig::retx_threshold`.
    pub(crate) retx_threshold: u32,
    /// Copied from `KernelConfig::retx_max`.
    pub(crate) retx_max: u32,
    /// Packets queued for transmission; drained by `egress`.
    outbound: VecDeque<Packet>,
    /// Initialised to `0x0100_0000` by `with_config`.
    // NOTE(review): presumably the next TCP initial sequence number, consumed
    // by the `tcp` module — confirm.
    tcp_isn: u32,
}
152
153impl Kernel {
154 pub fn new() -> Self {
157 Self::with_config(KernelConfig::default())
158 }
159
160 pub fn with_config(cfg: KernelConfig) -> Self {
161 Self {
162 sockets: SocketTable::new(),
163 addresses: Vec::new(),
164 mtu: cfg.mtu,
165 loopback_mtu: cfg.loopback_mtu,
166 send_buf_cap: cfg.send_buf_cap,
167 recv_buf_cap: cfg.recv_buf_cap,
168 default_backlog: cfg.default_backlog,
169 retx_threshold: cfg.retx_threshold,
170 retx_max: cfg.retx_max,
171 outbound: VecDeque::new(),
172 tcp_isn: 0x0100_0000,
173 }
174 }
175
176 fn mk_socket(&mut self, domain: Domain, ty: Type) -> Fd {
178 self.sockets.insert(Socket::new(domain, ty))
179 }
180
    /// Allocates a new socket of the given domain and type and returns its
    /// descriptor.
    ///
    /// No bind key is installed here; `bind` is the entry point that creates
    /// a socket together with its binding.
    pub fn open(&mut self, domain: Domain, ty: Type) -> Fd {
        self.mk_socket(domain, ty)
    }
186
187 pub fn close(&mut self, fd: Fd) {
200 if tcp::on_close(self, fd) {
201 self.sockets.remove(fd);
202 }
203 }
206
    /// Iterates over every socket in the table as `(Fd, &Socket)` pairs, in
    /// whatever order `SocketTable::iter` yields them.
    pub fn sockets(&self) -> impl Iterator<Item = (Fd, &Socket)> {
        self.sockets.iter()
    }
211
212 pub(crate) fn lookup(&self, fd: Fd) -> std::io::Result<&Socket> {
213 self.sockets
214 .get(fd)
215 .ok_or_else(|| Error::from(ErrorKind::NotFound))
216 }
217
218 pub(crate) fn lookup_mut(&mut self, fd: Fd) -> std::io::Result<&mut Socket> {
219 self.sockets
220 .get_mut(fd)
221 .ok_or_else(|| Error::from(ErrorKind::NotFound))
222 }
223
224 pub fn add_address(&mut self, addr: IpAddr) {
227 if !self.addresses.contains(&addr) {
228 self.addresses.push(addr);
229 }
230 }
231
232 pub fn is_local(&self, addr: IpAddr) -> bool {
235 if addr.is_loopback() {
236 return true;
237 }
238 self.addresses.contains(&addr)
239 }
240
241 pub fn bind(&mut self, addr: &Addr, ty: Type) -> std::io::Result<Fd> {
245 let (domain, ip, port) = match addr {
246 Addr::Inet(sa) if sa.is_ipv4() => (Domain::Inet, sa.ip(), sa.port()),
247 Addr::Inet(sa) => (Domain::Inet6, sa.ip(), sa.port()),
248 Addr::Unix(_) => unimplemented!("AF_UNIX bind"),
249 };
250
251 if !ip.is_unspecified() && !self.is_local(ip) {
253 return Err(Error::from(ErrorKind::AddrNotAvailable));
254 }
255
256 let port = if port == 0 {
258 self.sockets
259 .allocate_port(domain, ty)
260 .ok_or_else(|| Error::from(ErrorKind::AddrInUse))?
261 } else {
262 port
263 };
264
265 let key = BindKey {
266 domain,
267 ty,
268 local_addr: ip,
269 local_port: port,
270 };
271
272 for (existing, _ids) in self
289 .sockets
290 .bindings_on_port(key.domain, key.ty, key.local_port)
291 {
292 if existing.local_addr == key.local_addr
293 || existing.local_addr.is_unspecified()
294 || key.local_addr.is_unspecified()
295 {
296 return Err(Error::from(ErrorKind::AddrInUse));
297 }
298 }
299
300 let fd = self.mk_socket(domain, ty);
301 self.sockets.insert_binding(key.clone(), fd);
302 self.lookup_mut(fd).expect("socket entry present").bound = Some(key);
303 Ok(fd)
304 }
305
306 pub fn set_option(&mut self, fd: Fd, opt: SocketOption) -> std::io::Result<()> {
308 let st = self.lookup_mut(fd)?;
309 match opt {
310 SocketOption::Broadcast(v) => st.broadcast = v,
311 SocketOption::IpTtl(v) => st.ttl = v,
312 SocketOption::TcpNoDelay(v) => st.tcp_nodelay = v,
313 _ => unimplemented!("set_option {:?}", opt),
314 }
315 Ok(())
316 }
317
318 pub fn get_option(&self, fd: Fd, kind: SocketOptionKind) -> std::io::Result<SocketOption> {
320 let st = self.lookup(fd)?;
321 Ok(match kind {
322 SocketOptionKind::Broadcast => SocketOption::Broadcast(st.broadcast),
323 SocketOptionKind::IpTtl => SocketOption::IpTtl(st.ttl),
324 SocketOptionKind::TcpNoDelay => SocketOption::TcpNoDelay(st.tcp_nodelay),
325 _ => unimplemented!("get_option {:?}", kind),
326 })
327 }
328
329 pub fn local_addr(&self, fd: Fd) -> std::io::Result<Addr> {
331 let key = self
332 .lookup(fd)?
333 .bound
334 .as_ref()
335 .ok_or_else(|| Error::from(ErrorKind::InvalidInput))?;
336 Ok(Addr::Inet(SocketAddr::new(key.local_addr, key.local_port)))
337 }
338
339 pub fn peer_addr(&self, fd: Fd) -> std::io::Result<Addr> {
342 self.lookup(fd)?
343 .peer
344 .clone()
345 .ok_or_else(|| Error::from(ErrorKind::NotConnected))
346 }
347
348 pub fn poll_send_to(
351 &mut self,
352 fd: Fd,
353 cx: &mut Context<'_>,
354 buf: &[u8],
355 dst: &Addr,
356 ) -> Poll<std::io::Result<usize>> {
357 let Addr::Inet(dst_sa) = dst else {
358 panic!("AF_UNIX not wired through poll_send_to");
359 };
360 let (ty, domain) = match self.lookup(fd) {
361 Ok(st) => (st.ty, st.domain),
362 Err(e) => return Poll::Ready(Err(e)),
363 };
364 assert_eq!(ty, Type::Dgram, "poll_send_to on non-Dgram fd");
365 match (domain, dst_sa) {
366 (Domain::Inet, SocketAddr::V4(_)) | (Domain::Inet6, SocketAddr::V6(_)) => {}
367 _ => return Poll::Ready(Err(Error::from_raw_os_error(EAFNOSUPPORT))),
368 }
369 udp::send_to(self, fd, cx, buf, dst_sa)
370 }
371
372 pub fn poll_recv_from(
376 &mut self,
377 fd: Fd,
378 cx: &mut Context<'_>,
379 buf: &mut ReadBuf<'_>,
380 ) -> Poll<std::io::Result<Addr>> {
381 let st = match self.lookup_mut(fd) {
382 Ok(st) => st,
383 Err(e) => return Poll::Ready(Err(e)),
384 };
385 assert_eq!(st.ty, Type::Dgram, "poll_recv_from on non-Dgram fd");
386 udp::recv_from(st, cx, buf)
387 }
388
389 pub fn poll_connect(
393 &mut self,
394 fd: Fd,
395 cx: &mut Context<'_>,
396 addr: &Addr,
397 ) -> Poll<std::io::Result<()>> {
398 let Addr::Inet(peer) = addr else {
399 panic!("AF_UNIX not wired through connect");
400 };
401 let (domain, ty, is_bound) = match self.lookup(fd) {
402 Ok(st) => (st.domain, st.ty, st.bound.is_some()),
403 Err(e) => return Poll::Ready(Err(e)),
404 };
405 match (domain, peer) {
406 (Domain::Inet, SocketAddr::V4(_)) | (Domain::Inet6, SocketAddr::V6(_)) => {}
407 _ => return Poll::Ready(Err(Error::from_raw_os_error(EAFNOSUPPORT))),
408 }
409 match ty {
410 Type::Dgram => {
411 if !is_bound {
412 if let Err(e) = udp::auto_bind(self, fd, domain, ty, peer.ip()) {
413 return Poll::Ready(Err(e));
414 }
415 }
416 self.lookup_mut(fd).expect("socket present").peer = Some(Addr::Inet(*peer));
417 Poll::Ready(Ok(()))
418 }
419 Type::Stream => tcp::poll_connect(self, fd, cx, domain, *peer, is_bound),
420 Type::SeqPacket => unimplemented!("SOCK_SEQPACKET connect"),
421 }
422 }
423
424 pub fn listen(&mut self, fd: Fd, backlog: usize) -> std::io::Result<()> {
429 let st = self.lookup_mut(fd)?;
430 assert_eq!(st.ty, Type::Stream, "listen on non-Stream fd");
431 assert!(st.bound.is_some(), "listen on unbound fd");
432 st.listen = Some(ListenState::new(backlog));
433 Ok(())
434 }
435
436 pub fn poll_accept(
441 &mut self,
442 fd: Fd,
443 cx: &mut Context<'_>,
444 ) -> Poll<std::io::Result<(Fd, SocketAddr)>> {
445 let st = match self.lookup_mut(fd) {
446 Ok(st) => st,
447 Err(e) => return Poll::Ready(Err(e)),
448 };
449 let listen = st.listen.as_mut().expect("poll_accept on non-listener fd");
450 if let Some(child) = listen.ready.pop_front() {
451 let peer = self
452 .lookup(child)
453 .expect("accepted fd present")
454 .tcb
455 .as_ref()
456 .expect("accepted fd has TCB")
457 .peer;
458 return Poll::Ready(Ok((child, peer)));
459 }
460 if !listen.accept_wakers.iter().any(|w| w.will_wake(cx.waker())) {
461 listen.accept_wakers.push(cx.waker().clone());
462 }
463 Poll::Pending
464 }
465
466 pub fn poll_send(
470 &mut self,
471 fd: Fd,
472 cx: &mut Context<'_>,
473 buf: &[u8],
474 ) -> Poll<std::io::Result<usize>> {
475 let (ty, peer) = match self.lookup(fd) {
476 Ok(st) => (st.ty, st.peer.clone()),
477 Err(e) => return Poll::Ready(Err(e)),
478 };
479 match ty {
480 Type::Dgram => {
481 let Some(peer) = peer else {
482 return Poll::Ready(Err(Error::from(ErrorKind::NotConnected)));
483 };
484 let Addr::Inet(peer_sa) = peer else {
485 panic!("UDP peer stored as Addr::Unix");
486 };
487 udp::send_to(self, fd, cx, buf, &peer_sa)
488 }
489 Type::Stream => tcp::poll_send(self, fd, cx, buf),
490 Type::SeqPacket => unimplemented!("SOCK_SEQPACKET poll_send"),
491 }
492 }
493
494 pub fn poll_recv(
497 &mut self,
498 fd: Fd,
499 cx: &mut Context<'_>,
500 buf: &mut [u8],
501 ) -> Poll<std::io::Result<usize>> {
502 let ty = match self.lookup(fd) {
503 Ok(st) => st.ty,
504 Err(e) => return Poll::Ready(Err(e)),
505 };
506 match ty {
507 Type::Dgram => {
508 let st = self.lookup_mut(fd).expect("fd validated");
509 let mut rb = ReadBuf::new(buf);
510 match udp::recv(st, cx, &mut rb) {
511 Poll::Ready(Ok(())) => Poll::Ready(Ok(rb.filled().len())),
512 Poll::Ready(Err(e)) => Poll::Ready(Err(e)),
513 Poll::Pending => Poll::Pending,
514 }
515 }
516 Type::Stream => tcp::poll_recv(self, fd, cx, buf),
517 Type::SeqPacket => unimplemented!("SOCK_SEQPACKET poll_recv"),
518 }
519 }
520
521 pub fn poll_shutdown_write(
525 &mut self,
526 fd: Fd,
527 cx: &mut Context<'_>,
528 ) -> Poll<std::io::Result<()>> {
529 let ty = match self.lookup(fd) {
530 Ok(st) => st.ty,
531 Err(e) => return Poll::Ready(Err(e)),
532 };
533 match ty {
534 Type::Stream => tcp::poll_shutdown_write(self, fd, cx),
535 Type::Dgram | Type::SeqPacket => {
536 unimplemented!("poll_shutdown_write on non-Stream fd")
537 }
538 }
539 }
540
541 pub fn poll_peek_from(
544 &mut self,
545 fd: Fd,
546 cx: &mut Context<'_>,
547 buf: &mut ReadBuf<'_>,
548 ) -> Poll<std::io::Result<Addr>> {
549 let st = match self.lookup_mut(fd) {
550 Ok(st) => st,
551 Err(e) => return Poll::Ready(Err(e)),
552 };
553 assert_eq!(st.ty, Type::Dgram, "poll_peek_from on non-Dgram fd");
554 udp::peek_from(st, cx, buf)
555 }
556
557 pub fn poll_peek(
561 &mut self,
562 fd: Fd,
563 cx: &mut Context<'_>,
564 buf: &mut [u8],
565 ) -> Poll<std::io::Result<usize>> {
566 let ty = match self.lookup(fd) {
567 Ok(st) => st.ty,
568 Err(e) => return Poll::Ready(Err(e)),
569 };
570 match ty {
571 Type::Dgram => {
572 let st = self.lookup_mut(fd).expect("fd validated");
573 let mut rb = ReadBuf::new(buf);
574 match udp::peek_from(st, cx, &mut rb) {
575 Poll::Ready(Ok(_)) => Poll::Ready(Ok(rb.filled().len())),
576 Poll::Ready(Err(e)) => Poll::Ready(Err(e)),
577 Poll::Pending => Poll::Pending,
578 }
579 }
580 Type::Stream => tcp::poll_peek(self, fd, cx, buf),
581 Type::SeqPacket => unimplemented!("SOCK_SEQPACKET poll_peek"),
582 }
583 }
584
585 pub fn deliver(&mut self, pkt: Packet) {
590 match pkt.payload.clone() {
591 Transport::Udp(d) => udp::deliver(self, &pkt, &d),
592 Transport::Tcp(s) => tcp::deliver(self, &pkt, &s),
593 }
594 }
595
596 pub fn egress(&mut self, out: &mut Vec<Packet>) {
609 tcp::check_retx(self);
613 loop {
614 tcp::segment_all(self);
615 if self.outbound.is_empty() {
616 break;
617 }
618 let drained: Vec<_> = std::mem::take(&mut self.outbound).into_iter().collect();
619 for pkt in drained {
620 if self.is_local(pkt.dst) {
621 self.deliver(pkt);
622 } else {
623 out.push(pkt);
624 }
625 }
626 }
627 tcp::reap_closed(self);
629 }
630}
631
/// `Kernel::default()` is the same as `Kernel::new()`.
impl Default for Kernel {
    /// Delegates to `Kernel::new`.
    fn default() -> Self {
        Self::new()
    }
}
637
#[cfg(test)]
mod tests {
    use std::io::ErrorKind;
    use std::net::SocketAddr;

    use super::*;

    /// Parses `"ip:port"` into an `Addr::Inet`.
    fn inet(s: &str) -> Addr {
        let sa: SocketAddr = s.parse().unwrap();
        Addr::Inet(sa)
    }

    /// Parses a bare IP literal.
    fn ip(s: &str) -> IpAddr {
        s.parse().unwrap()
    }

    /// A `Context` backed by the no-op waker, for polling without a runtime.
    fn noop_cx() -> Context<'static> {
        use std::task::Waker;
        Context::from_waker(Waker::noop())
    }

    #[test]
    fn loopback_is_implicit_local() {
        let mut kernel = Kernel::new();
        for loopback in ["127.0.0.1", "::1"] {
            assert!(kernel.is_local(ip(loopback)));
        }

        let extra = ip("10.0.0.1");
        assert!(!kernel.is_local(extra));
        kernel.add_address(extra);
        assert!(kernel.is_local(extra));
    }

    #[test]
    fn bind_records_local_addr() {
        let mut kernel = Kernel::new();
        let wanted = inet("127.0.0.1:5000");
        let fd = kernel.bind(&wanted, Type::Dgram).unwrap();
        assert_eq!(kernel.local_addr(fd).unwrap(), wanted);
    }

    #[test]
    fn bind_port_zero_allocates_ephemeral() {
        let mut kernel = Kernel::new();
        let fd = kernel.bind(&inet("127.0.0.1:0"), Type::Dgram).unwrap();
        let port = match kernel.local_addr(fd).unwrap() {
            Addr::Inet(SocketAddr::V4(v4)) => v4.port(),
            _ => panic!("expected v4"),
        };
        assert!(matches!(port, 49152..=65535));
    }

    #[test]
    fn bind_conflict_is_addr_in_use() {
        let mut kernel = Kernel::new();
        let addr = inet("127.0.0.1:5000");
        kernel.bind(&addr, Type::Dgram).unwrap();
        let err = kernel.bind(&addr, Type::Dgram).unwrap_err();
        assert_eq!(err.kind(), ErrorKind::AddrInUse);
    }

    #[test]
    fn bind_different_protocols_can_share_port() {
        let mut kernel = Kernel::new();
        let addr = inet("127.0.0.1:5000");
        kernel.bind(&addr, Type::Dgram).unwrap();
        kernel.bind(&addr, Type::Stream).unwrap();
    }

    #[test]
    fn bind_rejects_non_local_addr() {
        let mut kernel = Kernel::new();
        let err = kernel
            .bind(&inet("10.0.0.1:5000"), Type::Dgram)
            .unwrap_err();
        assert_eq!(err.kind(), ErrorKind::AddrNotAvailable);
    }

    #[test]
    fn bind_wildcard_addr_is_allowed() {
        let mut kernel = Kernel::new();
        kernel.bind(&inet("0.0.0.0:5000"), Type::Dgram).unwrap();
    }

    #[test]
    fn distinct_specific_ips_coexist() {
        let mut kernel = Kernel::new();
        kernel.add_address(ip("10.0.0.1"));
        kernel.add_address(ip("10.0.0.2"));
        kernel.bind(&inet("10.0.0.1:5000"), Type::Dgram).unwrap();
        kernel.bind(&inet("10.0.0.2:5000"), Type::Dgram).unwrap();
    }

    #[test]
    fn udp_broadcast_send_requires_broadcast_option() {
        let mut kernel = Kernel::new();
        kernel.add_address(ip("10.0.0.1"));
        let fd = kernel.bind(&inet("10.0.0.1:0"), Type::Dgram).unwrap();
        let dst = inet("255.255.255.255:9000");

        // Without the option the send must be refused.
        let refused = kernel.poll_send_to(fd, &mut noop_cx(), b"x", &dst);
        match refused {
            Poll::Ready(Err(e)) => assert_eq!(e.kind(), ErrorKind::PermissionDenied),
            _ => panic!("expected broadcast rejection"),
        }

        // With the option the same send goes through.
        kernel
            .set_option(fd, SocketOption::Broadcast(true))
            .unwrap();
        let accepted = kernel.poll_send_to(fd, &mut noop_cx(), b"x", &dst);
        assert!(
            matches!(accepted, Poll::Ready(Ok(_))),
            "broadcast send should succeed with SO_BROADCAST"
        );
    }

    #[test]
    fn bind_zero_avoids_ports_taken_on_other_ips() {
        let mut kernel = Kernel::new();
        kernel.add_address(ip("10.0.0.1"));

        kernel.bind(&inet("10.0.0.1:49152"), Type::Dgram).unwrap();
        let fd = kernel.bind(&inet("127.0.0.1:0"), Type::Dgram).unwrap();
        let port = match kernel.local_addr(fd).unwrap() {
            Addr::Inet(sa) => sa.port(),
            _ => panic!("v4 expected"),
        };
        assert_ne!(port, 49152);
    }

    #[test]
    fn broadcast_option_roundtrips() {
        let mut kernel = Kernel::new();
        let fd = kernel.bind(&inet("127.0.0.1:0"), Type::Dgram).unwrap();

        let before = kernel.get_option(fd, SocketOptionKind::Broadcast).unwrap();
        assert_eq!(before, SocketOption::Broadcast(false));

        kernel
            .set_option(fd, SocketOption::Broadcast(true))
            .unwrap();

        let after = kernel.get_option(fd, SocketOptionKind::Broadcast).unwrap();
        assert_eq!(after, SocketOption::Broadcast(true));
    }
}