Skip to main content

microsandbox_network/
stack.rs

1//! smoltcp interface setup, frame classification, and poll loop.
2//!
3//! This module contains the core networking event loop that runs on a
4//! dedicated OS thread. It bridges guest ethernet frames (via
5//! [`SmoltcpDevice`]) to smoltcp's TCP/IP stack and services connections
6//! through tokio proxy tasks.
7
8use std::collections::HashSet;
9use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr};
10use std::sync::Arc;
11use std::sync::atomic::Ordering;
12
13use smoltcp::iface::{Config, Interface, SocketSet};
14use smoltcp::time::Instant;
15
16use smoltcp::wire::{
17    EthernetAddress, EthernetFrame, EthernetProtocol, HardwareAddress, Icmpv4Packet, Icmpv4Repr,
18    Icmpv6Packet, Icmpv6Repr, IpAddress, IpCidr, IpProtocol, Ipv4Packet, Ipv4Repr, Ipv6Packet,
19    Ipv6Repr, TcpPacket, UdpPacket,
20};
21
22use crate::config::{DnsConfig, PublishedPort};
23use crate::conn::ConnectionTracker;
24use crate::device::SmoltcpDevice;
25use crate::dns::common::ports::DnsPortType;
26use crate::dns::{
27    interceptor::DnsInterceptor,
28    proxies::{dot::DotProxy, tcp::DnsTcpProxy},
29};
30use crate::icmp_relay::IcmpRelay;
31use crate::policy::{EgressEvaluation, HostnameSource, NetworkPolicy, Protocol};
32use crate::proxy;
33use crate::publisher::PortPublisher;
34use crate::shared::SharedState;
35use crate::tls::{proxy as tls_proxy, state::TlsState};
36use crate::udp_relay::UdpRelay;
37
38//--------------------------------------------------------------------------------------------------
39// Types
40//--------------------------------------------------------------------------------------------------
41
/// Result of classifying a guest ethernet frame before smoltcp processes it.
///
/// Pre-inspection allows the poll loop to:
/// - Create TCP sockets before smoltcp sees a SYN (preventing auto-RST).
/// - Handle non-DNS UDP outside smoltcp (smoltcp lacks wildcard port binding).
/// - Route DNS queries to the interception handler.
///
/// Every variant carries at most two [`SocketAddr`] values, so the type is
/// `Copy` and can be compared or logged directly (useful in tests and
/// tracing output).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FrameAction {
    /// TCP SYN to a new destination — create a smoltcp socket before
    /// letting smoltcp process the frame.
    TcpSyn {
        /// Guest-side endpoint (source address and port of the SYN).
        src: SocketAddr,
        /// Endpoint the guest is trying to reach.
        dst: SocketAddr,
    },

    /// Non-DNS UDP datagram — handle entirely outside smoltcp via the UDP
    /// relay.
    UdpRelay {
        /// Guest-side endpoint (source address and port of the datagram).
        src: SocketAddr,
        /// Endpoint the guest is sending to.
        dst: SocketAddr,
    },

    /// DNS query (UDP to port 53) — let smoltcp's bound UDP socket handle it.
    Dns,

    /// Everything else (ARP, NDP, ICMP, TCP data/ACK/FIN, etc.) — let
    /// smoltcp process normally.
    Passthrough,
}
64
65/// Resolved network parameters for the poll loop. Created by
66/// `SmoltcpNetwork::new()` from `NetworkConfig` + sandbox slot.
67pub struct PollLoopConfig {
68    /// Gateway MAC address (smoltcp's identity on the virtual LAN).
69    pub gateway_mac: [u8; 6],
70    /// Guest MAC address.
71    pub guest_mac: [u8; 6],
72    /// Gateway addresses (IPv4 + IPv6) owned by the smoltcp virtual
73    /// stack.
74    pub gateway: GatewayIps,
75    /// Guest IPv4 address.
76    pub guest_ipv4: Ipv4Addr,
77    /// IP-level MTU (e.g. 1500).
78    pub mtu: usize,
79}
80
/// The pair of gateway addresses a sandbox's smoltcp virtual stack owns.
///
/// An IPv4 and an IPv6 address are always both present. The proxy's
/// `resolve_host_dst` helper compares guest destinations against these
/// values and rewrites matches to loopback when dialing on the host.
#[derive(Debug, Clone, Copy)]
pub struct GatewayIps {
    /// IPv4 address of the gateway.
    pub ipv4: Ipv4Addr,
    /// IPv6 address of the gateway.
    pub ipv6: Ipv6Addr,
}
91
92//--------------------------------------------------------------------------------------------------
93// Functions
94//--------------------------------------------------------------------------------------------------
95
96/// Classify a raw ethernet frame for pre-inspection.
97///
98/// Uses smoltcp's wire module for zero-copy parsing. Returns
99/// [`FrameAction::Passthrough`] for any frame that cannot be parsed or
100/// doesn't match a special case.
101pub fn classify_frame(frame: &[u8]) -> FrameAction {
102    let Ok(eth) = EthernetFrame::new_checked(frame) else {
103        return FrameAction::Passthrough;
104    };
105
106    match eth.ethertype() {
107        EthernetProtocol::Ipv4 => classify_ipv4(eth.payload()),
108        EthernetProtocol::Ipv6 => classify_ipv6(eth.payload()),
109        _ => FrameAction::Passthrough, // ARP, etc.
110    }
111}
112
113/// Create and configure the smoltcp [`Interface`].
114///
115/// The interface is configured as the **gateway**: it owns the gateway IP
116/// addresses and responds to ARP/NDP for them. `any_ip` mode is enabled so
117/// smoltcp accepts traffic destined for arbitrary remote IPs (not just the
118/// gateway), combined with default routes.
119pub fn create_interface(device: &mut SmoltcpDevice, config: &PollLoopConfig) -> Interface {
120    let hw_addr = HardwareAddress::Ethernet(EthernetAddress(config.gateway_mac));
121    let iface_config = Config::new(hw_addr);
122    let mut iface = Interface::new(iface_config, device, smoltcp_now());
123
124    // Configure gateway IP addresses.
125    iface.update_ip_addrs(|addrs| {
126        addrs
127            .push(IpCidr::new(
128                IpAddress::Ipv4(config.gateway.ipv4),
129                // /30 subnet: gateway + guest.
130                30,
131            ))
132            .expect("failed to add gateway IPv4 address");
133        addrs
134            .push(IpCidr::new(IpAddress::Ipv6(config.gateway.ipv6), 64))
135            .expect("failed to add gateway IPv6 address");
136    });
137
138    // Default routes so smoltcp accepts traffic for all destinations.
139    iface
140        .routes_mut()
141        .add_default_ipv4_route(config.gateway.ipv4)
142        .expect("failed to add default IPv4 route");
143    iface
144        .routes_mut()
145        .add_default_ipv6_route(config.gateway.ipv6)
146        .expect("failed to add default IPv6 route");
147
148    // Accept traffic destined for any IP, not just gateway addresses.
149    iface.set_any_ip(true);
150
151    iface
152}
153
154/// Main smoltcp poll loop. Runs on a dedicated OS thread.
155///
156/// Processes guest frames with pre-inspection, drives smoltcp's TCP/IP stack,
157/// and sleeps via `poll(2)` between events.
158///
159/// # Phases per iteration
160///
161/// 1. **Drain guest frames** — pop from `tx_ring`, classify, pre-inspect.
162/// 2. **smoltcp egress + maintenance** — transmit queued packets, run timers.
163/// 3. **Service connections** — relay data between smoltcp sockets and proxy
164///    tasks (added by later tasks).
165/// 4. **Sleep** — `poll(2)` on `tx_wake` + `proxy_wake` pipes with smoltcp's
166///    requested timeout.
167///
168/// # Arguments
169///
170/// * `shared` - Stack-wide shared state: `tx_ring` / `rx_ring` for the virtio-net boundary
171///   and the wake eventfds.
172/// * `config` - Resolved per-sandbox parameters (gateway / guest MAC + IPv4 + IPv6, MTU).
173/// * `network_policy` - User-provided egress policy. Evaluated against the sandbox's
174///   gateway IPs (stored on [`SharedState`]) so `DestinationGroup::Host` rules match.
175/// * `dns_config` - DNS interception settings (block lists, upstreams, timeout).
176/// * `tls_state` - Optional TLS MITM state; drives interception of intercepted ports and DoT
177///   when present.
178/// * `published_ports` - Host → guest port publishes; the publisher accepts inbound
179///   connections on the host-bind address and forwards into the guest.
180/// * `max_connections` - Optional cap on concurrent guest connections tracked by
181///   [`ConnectionTracker`]; `None` uses the default.
182/// * `tokio_handle` - Runtime handle used for proxy tasks, DNS forwarding, port publishing,
183///   and ICMP relays.
184#[allow(clippy::too_many_arguments)]
185pub fn smoltcp_poll_loop(
186    shared: Arc<SharedState>,
187    config: PollLoopConfig,
188    network_policy: NetworkPolicy,
189    dns_config: DnsConfig,
190    tls_state: Option<Arc<TlsState>>,
191    published_ports: Vec<PublishedPort>,
192    max_connections: Option<usize>,
193    tokio_handle: tokio::runtime::Handle,
194) {
195    let mut device = SmoltcpDevice::new(shared.clone(), config.mtu);
196    let mut iface = create_interface(&mut device, &config);
197    let mut sockets = SocketSet::new(vec![]);
198    let mut conn_tracker = ConnectionTracker::new(max_connections);
199
200    // The DNS forwarder needs to know which IPs count as "the gateway"
201    // (so it routes guest queries to those addresses through the
202    // configured upstream) and a policy evaluator (so guest-chosen
203    // `@target` resolvers are gated by egress rules just like any
204    // other outbound).
205    let gateway_ips: Arc<HashSet<IpAddr>> = Arc::new(HashSet::from([
206        IpAddr::V4(config.gateway.ipv4),
207        IpAddr::V6(config.gateway.ipv6),
208    ]));
209    // Gateway IPs must be on SharedState before any egress evaluation runs,
210    // so `DestinationGroup::Host` rules can resolve to the right address.
211    shared.set_gateway_ips(config.gateway.ipv4, config.gateway.ipv6);
212    let network_policy = Arc::new(network_policy);
213
214    let (mut dns_interceptor, dns_forwarder_handle) = DnsInterceptor::new(
215        &mut sockets,
216        dns_config,
217        shared.clone(),
218        &tokio_handle,
219        gateway_ips,
220        network_policy.clone(),
221        config.gateway,
222    );
223    let mut port_publisher = PortPublisher::new(&published_ports, config.guest_ipv4, &tokio_handle);
224    let mut udp_relay = UdpRelay::new(
225        shared.clone(),
226        config.gateway_mac,
227        config.guest_mac,
228        tokio_handle.clone(),
229    );
230    let icmp_relay = IcmpRelay::new(
231        shared.clone(),
232        config.gateway_mac,
233        config.guest_mac,
234        tokio_handle.clone(),
235    );
236
237    // Rate-limit cleanup operations: run at most once per second.
238    let mut last_cleanup = std::time::Instant::now();
239
240    // poll(2) file descriptors for sleeping.
241    let mut poll_fds = [
242        libc::pollfd {
243            fd: shared.tx_wake.as_raw_fd(),
244            events: libc::POLLIN,
245            revents: 0,
246        },
247        libc::pollfd {
248            fd: shared.proxy_wake.as_raw_fd(),
249            events: libc::POLLIN,
250            revents: 0,
251        },
252    ];
253
254    loop {
255        let now = smoltcp_now();
256
257        // ── Phase 1: Drain all guest frames with pre-inspection ──────────
258        while let Some(frame) = device.stage_next_frame() {
259            if handle_gateway_icmp_echo(frame, &config, &shared) {
260                device.drop_staged_frame();
261                continue;
262            }
263
264            if icmp_relay.relay_outbound_if_echo(frame, &config, &network_policy) {
265                device.drop_staged_frame();
266                continue;
267            }
268
269            match classify_frame(frame) {
270                FrameAction::TcpSyn { src, dst } => {
271                    let allow = match DnsPortType::from_tcp(dst.port()) {
272                        // Plain DNS: the interceptor enforces policy at
273                        // the application layer (block list + rebind
274                        // protection); bypass the network egress check.
275                        DnsPortType::Dns => true,
276                        // DoT: intercept only when TLS MITM is
277                        // configured. Without it, the block list can't
278                        // apply (traffic is encrypted end-to-end), so
279                        // we refuse to force a fall-back to plain
280                        // TCP/53. When TLS MITM is configured, bypass
281                        // egress policy the same way plain DNS does —
282                        // policy for the upstream resolver is applied
283                        // per query by the forwarder.
284                        DnsPortType::EncryptedDns => {
285                            if tls_state.is_some() {
286                                true
287                            } else {
288                                tracing::debug!(%dst, "DoT port refused (TLS interception not configured); stub should fall back to TCP/53");
289                                false
290                            }
291                        }
292                        // Alternative DNS protocol we can't proxy:
293                        // refuse outright — no socket means smoltcp
294                        // emits RST, which the guest's stub treats as
295                        // "upstream unavailable" and falls back to
296                        // plain TCP/53.
297                        DnsPortType::AlternativeDns => {
298                            tracing::debug!(%dst, "alternative-DNS TCP port refused; stub should fall back to TCP/53");
299                            false
300                        }
301                        // Other: regular outbound — defer Domain rules to first-flight;
302                        // accept unless an IP-layer rule denies.
303                        DnsPortType::Other => match network_policy.evaluate_egress_with_source(
304                            dst,
305                            Protocol::Tcp,
306                            &shared,
307                            HostnameSource::Deferred,
308                        ) {
309                            EgressEvaluation::Allow | EgressEvaluation::DeferUntilHostname => true,
310                            EgressEvaluation::Deny => false,
311                        },
312                    };
313                    if allow && !conn_tracker.has_socket_for(&src, &dst) {
314                        conn_tracker.create_tcp_socket(src, dst, &mut sockets);
315                    }
316                    // Let smoltcp process — matching socket completes
317                    // handshake, no socket means auto-RST.
318                    iface.poll_ingress_single(now, &mut device, &mut sockets);
319                }
320
321                FrameAction::UdpRelay { src, dst } => {
322                    // QUIC blocking: drop UDP to intercepted ports when
323                    // TLS interception is active.
324                    if let Some(ref tls) = tls_state
325                        && tls.config.intercepted_ports.contains(&dst.port())
326                        && tls.config.block_quic_on_intercept
327                    {
328                        device.drop_staged_frame();
329                        continue;
330                    }
331
332                    match DnsPortType::from_udp(dst.port()) {
333                        // Dns: unreachable here — classify_transport
334                        // routes UDP/53 to FrameAction::Dns, not
335                        // UdpRelay. Defensive drop covers regressions.
336                        DnsPortType::Dns => {
337                            device.drop_staged_frame();
338                            continue;
339                        }
340                        // EncryptedDns: unreachable here —
341                        // `DnsPortType::from_udp` never returns it
342                        // today (DoT is TCP-only; UDP/853 is DoQ and
343                        // returns AlternativeDns). Defensive drop.
344                        DnsPortType::EncryptedDns => {
345                            device.drop_staged_frame();
346                            continue;
347                        }
348                        // Alternative DNS protocols on well-known UDP
349                        // ports are dropped — forces fall-back to UDP/53.
350                        DnsPortType::AlternativeDns => {
351                            tracing::debug!(%dst, "alternative-DNS UDP port dropped; stub should fall back to UDP/53");
352                            device.drop_staged_frame();
353                            continue;
354                        }
355                        DnsPortType::Other => {}
356                    }
357
358                    // Policy check.
359                    if network_policy
360                        .evaluate_egress(dst, Protocol::Udp, &shared)
361                        .is_deny()
362                    {
363                        device.drop_staged_frame();
364                        continue;
365                    }
366
367                    // Resolve the host-side destination for the dial.
368                    // `dst` stays unchanged so reply frames are stamped
369                    // with the IP the guest expects.
370                    let host_dst = resolve_host_dst(dst, config.gateway);
371                    udp_relay.relay_outbound(frame, src, dst, host_dst);
372                    device.drop_staged_frame();
373                }
374
375                FrameAction::Dns | FrameAction::Passthrough => {
376                    // ARP, ICMP, DNS (port 53), TCP data — smoltcp handles.
377                    iface.poll_ingress_single(now, &mut device, &mut sockets);
378                }
379            }
380        }
381
382        // ── Phase 2: Ingress egress + maintenance ─────────────────────────
383        // Flush frames generated by Phase 1 ingress (ACKs, SYN-ACKs, etc.)
384        // before relaying data so smoltcp has up-to-date state.
385        loop {
386            let result = iface.poll_egress(now, &mut device, &mut sockets);
387            if matches!(result, smoltcp::iface::PollResult::None) {
388                break;
389            }
390        }
391        iface.poll_maintenance(now);
392
393        // Coalesced wake: if Phase 1/2 emitted any frames, wake the
394        // NetWorker once instead of per-frame.
395        if device.frames_emitted.swap(false, Ordering::Relaxed) {
396            shared.rx_wake.wake();
397        }
398
399        // ── Phase 3: Service connections + relay data ────────────────────
400        // Relay proxy data INTO smoltcp sockets first, then a single egress
401        // pass flushes everything. This eliminates the former "Phase 2b"
402        // double-egress pattern.
403        conn_tracker.relay_data(&mut sockets);
404        dns_interceptor.process(&mut sockets);
405
406        // Accept queued inbound connections from published port listeners.
407        port_publisher.accept_inbound(&mut iface, &mut sockets, &shared, &tokio_handle);
408        port_publisher.relay_data(&mut sockets);
409
410        // Detect newly-established connections and spawn proxy tasks.
411        let new_conns = conn_tracker.take_new_connections(&mut sockets);
412        for conn in new_conns {
413            if let Some(ref tls_state) = tls_state
414                && tls_state
415                    .config
416                    .intercepted_ports
417                    .contains(&conn.dst.port())
418            {
419                // TLS-intercepted port — spawn TLS MITM proxy.
420                let conn_dst = resolve_host_dst(conn.dst, config.gateway);
421                tls_proxy::spawn_tls_proxy(
422                    &tokio_handle,
423                    conn_dst,
424                    conn.from_smoltcp,
425                    conn.to_smoltcp,
426                    shared.clone(),
427                    tls_state.clone(),
428                    network_policy.clone(),
429                );
430                continue;
431            }
432            if conn.dst.port() == 53 {
433                // DNS over TCP: route through the same forwarder the UDP
434                // path uses. The forwarder applies the domain block list
435                // and rebind protection to every query and routes
436                // upstream based on `conn.dst.ip()` — the configured
437                // upstream for queries to the gateway, direct forward
438                // to the chosen `@target` (subject to egress policy)
439                // otherwise. No gateway→loopback rewrite here: the
440                // forwarder dials the configured upstream, not the
441                // gateway.
442                DnsTcpProxy::spawn(
443                    &tokio_handle,
444                    conn.dst,
445                    conn.from_smoltcp,
446                    conn.to_smoltcp,
447                    dns_forwarder_handle.clone(),
448                    shared.clone(),
449                );
450                continue;
451            }
452            if conn.dst.port() == 853
453                && let Some(ref tls_state) = tls_state
454            {
455                // DNS over TLS: terminate TLS at the gateway with a
456                // per-domain cert, hand the inner DNS frames to the
457                // same forwarder plain DNS uses. Policy for the
458                // chosen `@target` resolver is applied per-query by
459                // the forwarder (block list + rebind + egress).
460                DotProxy::spawn(
461                    &tokio_handle,
462                    conn.dst,
463                    conn.from_smoltcp,
464                    conn.to_smoltcp,
465                    dns_forwarder_handle.clone(),
466                    tls_state.clone(),
467                    shared.clone(),
468                );
469                continue;
470            }
471            // Plain TCP proxy.
472            let connect_dst = resolve_host_dst(conn.dst, config.gateway);
473            proxy::spawn_tcp_proxy(
474                &tokio_handle,
475                conn.dst,
476                connect_dst,
477                conn.from_smoltcp,
478                conn.to_smoltcp,
479                shared.clone(),
480                network_policy.clone(),
481            );
482        }
483
484        // Rate-limited cleanup: TIME_WAIT is 60s, session timeout is 60s,
485        // so checking once per second is more than sufficient.
486        if last_cleanup.elapsed() >= std::time::Duration::from_secs(1) {
487            conn_tracker.cleanup_closed(&mut sockets);
488            port_publisher.cleanup_closed(&mut sockets);
489            udp_relay.cleanup_expired();
490            shared.cleanup_resolved_hostnames();
491            last_cleanup = std::time::Instant::now();
492        }
493
494        // ── Phase 4: Flush relay data + sleep ────────────────────────────
495        // Single egress pass flushes all data written by Phase 3.
496        loop {
497            let result = iface.poll_egress(now, &mut device, &mut sockets);
498            if matches!(result, smoltcp::iface::PollResult::None) {
499                break;
500            }
501        }
502
503        // Coalesced wake: if Phase 3/4 emitted any frames, wake once.
504        if device.frames_emitted.swap(false, Ordering::Relaxed) {
505            shared.rx_wake.wake();
506        }
507
508        let timeout_ms = iface
509            .poll_delay(now, &sockets)
510            .map(|d| d.total_millis().min(i32::MAX as u64) as i32)
511            .unwrap_or(100); // 100ms fallback when no timers pending.
512
513        // SAFETY: poll_fds is a valid array of pollfd structs with valid fds.
514        unsafe {
515            libc::poll(
516                poll_fds.as_mut_ptr(),
517                poll_fds.len() as libc::nfds_t,
518                timeout_ms,
519            );
520        }
521
522        // Conditional drain: only drain pipes that actually have data.
523        if poll_fds[0].revents & libc::POLLIN != 0 {
524            shared.tx_wake.drain();
525        }
526        if poll_fds[1].revents & libc::POLLIN != 0 {
527            shared.proxy_wake.drain();
528        }
529    }
530}
531
532//--------------------------------------------------------------------------------------------------
533// Functions: Helpers
534//--------------------------------------------------------------------------------------------------
535
536/// Map a guest-wire destination to its host-socket equivalent.
537///
538/// Gateway IPs rewrite to loopback (`127.0.0.1` / `::1`); everything else
539/// passes through. Shared by the TCP proxy dispatch and the UDP relay.
540///
541/// # Arguments
542///
543/// * `dst` - Destination from the guest's packet.
544/// * `gateway` - Per-sandbox gateway IPs that trigger the loopback rewrite.
545pub(crate) fn resolve_host_dst(dst: SocketAddr, gateway: GatewayIps) -> SocketAddr {
546    match dst.ip() {
547        IpAddr::V4(v4) if v4 == gateway.ipv4 => {
548            SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), dst.port())
549        }
550        IpAddr::V6(v6) if v6 == gateway.ipv6 => {
551            SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), dst.port())
552        }
553        _ => dst,
554    }
555}
556
557/// Get the current time as a smoltcp [`Instant`] using a monotonic clock.
558///
559/// Uses `std::time::Instant` (monotonic) instead of `SystemTime` (wall
560/// clock) to avoid issues with NTP clock step corrections that could
561/// cause smoltcp timers to misbehave.
562fn smoltcp_now() -> Instant {
563    static EPOCH: std::sync::OnceLock<std::time::Instant> = std::sync::OnceLock::new();
564    let epoch = EPOCH.get_or_init(std::time::Instant::now);
565    let elapsed = epoch.elapsed();
566    Instant::from_millis(elapsed.as_millis() as i64)
567}
568
569/// Reply locally to ICMP echo requests aimed at the sandbox gateway.
570///
571/// `any_ip` is required so smoltcp accepts guest traffic for arbitrary remote
572/// destinations, but that would make smoltcp's automatic ICMP echo replies
573/// spoof remote hosts. Handle only the real gateway IPs here and leave all
574/// other ICMP traffic untouched.
575fn handle_gateway_icmp_echo(frame: &[u8], config: &PollLoopConfig, shared: &SharedState) -> bool {
576    let Ok(eth) = EthernetFrame::new_checked(frame) else {
577        return false;
578    };
579
580    let reply = match eth.ethertype() {
581        EthernetProtocol::Ipv4 => gateway_icmpv4_echo_reply(&eth, config),
582        EthernetProtocol::Ipv6 => gateway_icmpv6_echo_reply(&eth, config),
583        _ => None,
584    };
585    let Some(reply) = reply else {
586        return false;
587    };
588
589    let reply_len = reply.len();
590    if shared.rx_ring.push(reply).is_ok() {
591        shared.add_rx_bytes(reply_len);
592        shared.rx_wake.wake();
593    }
594
595    true
596}
597
598/// Build an IPv4 ICMP echo reply when the guest pings the gateway IPv4.
599fn gateway_icmpv4_echo_reply(
600    eth: &EthernetFrame<&[u8]>,
601    config: &PollLoopConfig,
602) -> Option<Vec<u8>> {
603    let ipv4 = Ipv4Packet::new_checked(eth.payload()).ok()?;
604    if ipv4.dst_addr() != config.gateway.ipv4 || ipv4.next_header() != IpProtocol::Icmp {
605        return None;
606    }
607
608    let icmp = Icmpv4Packet::new_checked(ipv4.payload()).ok()?;
609    let Icmpv4Repr::EchoRequest {
610        ident,
611        seq_no,
612        data,
613    } = Icmpv4Repr::parse(&icmp, &smoltcp::phy::ChecksumCapabilities::default()).ok()?
614    else {
615        return None;
616    };
617
618    let ipv4_repr = Ipv4Repr {
619        src_addr: config.gateway.ipv4,
620        dst_addr: ipv4.src_addr(),
621        next_header: IpProtocol::Icmp,
622        payload_len: 8 + data.len(),
623        hop_limit: 64,
624    };
625    let icmp_repr = Icmpv4Repr::EchoReply {
626        ident,
627        seq_no,
628        data,
629    };
630    let mut reply = vec![0u8; 14 + ipv4_repr.buffer_len() + icmp_repr.buffer_len()];
631
632    let mut reply_eth = EthernetFrame::new_unchecked(&mut reply);
633    reply_eth.set_src_addr(EthernetAddress(config.gateway_mac));
634    reply_eth.set_dst_addr(eth.src_addr());
635    reply_eth.set_ethertype(EthernetProtocol::Ipv4);
636
637    ipv4_repr.emit(
638        &mut Ipv4Packet::new_unchecked(&mut reply[14..34]),
639        &smoltcp::phy::ChecksumCapabilities::default(),
640    );
641    icmp_repr.emit(
642        &mut Icmpv4Packet::new_unchecked(&mut reply[34..]),
643        &smoltcp::phy::ChecksumCapabilities::default(),
644    );
645
646    Some(reply)
647}
648
649/// Build an IPv6 ICMP echo reply when the guest pings the gateway IPv6.
650fn gateway_icmpv6_echo_reply(
651    eth: &EthernetFrame<&[u8]>,
652    config: &PollLoopConfig,
653) -> Option<Vec<u8>> {
654    let ipv6 = Ipv6Packet::new_checked(eth.payload()).ok()?;
655    if ipv6.dst_addr() != config.gateway.ipv6 || ipv6.next_header() != IpProtocol::Icmpv6 {
656        return None;
657    }
658
659    let icmp = Icmpv6Packet::new_checked(ipv6.payload()).ok()?;
660    let Icmpv6Repr::EchoRequest {
661        ident,
662        seq_no,
663        data,
664    } = Icmpv6Repr::parse(
665        &ipv6.src_addr(),
666        &ipv6.dst_addr(),
667        &icmp,
668        &smoltcp::phy::ChecksumCapabilities::default(),
669    )
670    .ok()?
671    else {
672        return None;
673    };
674
675    let ipv6_repr = Ipv6Repr {
676        src_addr: config.gateway.ipv6,
677        dst_addr: ipv6.src_addr(),
678        next_header: IpProtocol::Icmpv6,
679        payload_len: icmp_repr_buffer_len_v6(data),
680        hop_limit: 64,
681    };
682    let icmp_repr = Icmpv6Repr::EchoReply {
683        ident,
684        seq_no,
685        data,
686    };
687    let ipv6_hdr_len = 40;
688    let mut reply = vec![0u8; 14 + ipv6_hdr_len + icmp_repr.buffer_len()];
689
690    let mut reply_eth = EthernetFrame::new_unchecked(&mut reply);
691    reply_eth.set_src_addr(EthernetAddress(config.gateway_mac));
692    reply_eth.set_dst_addr(eth.src_addr());
693    reply_eth.set_ethertype(EthernetProtocol::Ipv6);
694
695    ipv6_repr.emit(&mut Ipv6Packet::new_unchecked(&mut reply[14..54]));
696    icmp_repr.emit(
697        &config.gateway.ipv6,
698        &ipv6.src_addr(),
699        &mut Icmpv6Packet::new_unchecked(&mut reply[54..]),
700        &smoltcp::phy::ChecksumCapabilities::default(),
701    );
702
703    Some(reply)
704}
705
706fn icmp_repr_buffer_len_v6(data: &[u8]) -> usize {
707    Icmpv6Repr::EchoReply {
708        ident: 0,
709        seq_no: 0,
710        data,
711    }
712    .buffer_len()
713}
714
715/// Classify an IPv4 packet payload (after stripping the Ethernet header).
716fn classify_ipv4(payload: &[u8]) -> FrameAction {
717    let Ok(ipv4) = Ipv4Packet::new_checked(payload) else {
718        return FrameAction::Passthrough;
719    };
720    classify_transport(
721        ipv4.next_header(),
722        ipv4.src_addr().into(),
723        ipv4.dst_addr().into(),
724        ipv4.payload(),
725    )
726}
727
728/// Classify an IPv6 packet payload (after stripping the Ethernet header).
729fn classify_ipv6(payload: &[u8]) -> FrameAction {
730    let Ok(ipv6) = Ipv6Packet::new_checked(payload) else {
731        return FrameAction::Passthrough;
732    };
733    classify_transport(
734        ipv6.next_header(),
735        ipv6.src_addr().into(),
736        ipv6.dst_addr().into(),
737        ipv6.payload(),
738    )
739}
740
741/// Classify the transport-layer protocol (shared by IPv4 and IPv6).
742fn classify_transport(
743    protocol: IpProtocol,
744    src_ip: std::net::IpAddr,
745    dst_ip: std::net::IpAddr,
746    transport_payload: &[u8],
747) -> FrameAction {
748    match protocol {
749        IpProtocol::Tcp => {
750            let Ok(tcp) = TcpPacket::new_checked(transport_payload) else {
751                return FrameAction::Passthrough;
752            };
753            if tcp.syn() && !tcp.ack() {
754                FrameAction::TcpSyn {
755                    src: SocketAddr::new(src_ip, tcp.src_port()),
756                    dst: SocketAddr::new(dst_ip, tcp.dst_port()),
757                }
758            } else {
759                FrameAction::Passthrough
760            }
761        }
762        IpProtocol::Udp => {
763            let Ok(udp) = UdpPacket::new_checked(transport_payload) else {
764                return FrameAction::Passthrough;
765            };
766            // The plain-DNS port (UDP/53) lives in dns::common::ports so
767            // the alternative-DNS refusal logic and this dispatcher
768            // share one source of truth for "which UDP ports are DNS".
769            if DnsPortType::from_udp(udp.dst_port()) == DnsPortType::Dns {
770                FrameAction::Dns
771            } else {
772                FrameAction::UdpRelay {
773                    src: SocketAddr::new(src_ip, udp.src_port()),
774                    dst: SocketAddr::new(dst_ip, udp.dst_port()),
775                }
776            }
777        }
778        _ => FrameAction::Passthrough, // ICMP, etc.
779    }
780}
781
782//--------------------------------------------------------------------------------------------------
783// Tests
784//--------------------------------------------------------------------------------------------------
785
786#[cfg(test)]
787mod tests {
788    use super::*;
789    use std::sync::Arc;
790
791    use smoltcp::phy::ChecksumCapabilities;
792    use smoltcp::wire::{
793        ArpOperation, ArpPacket, ArpRepr, EthernetRepr, Icmpv4Packet, Icmpv4Repr, Ipv4Repr,
794    };
795
796    use crate::device::SmoltcpDevice;
797    use crate::shared::SharedState;
798
799    /// Build a minimal Ethernet + IPv4 + TCP SYN frame.
800    fn build_tcp_syn_frame(
801        src_ip: [u8; 4],
802        dst_ip: [u8; 4],
803        src_port: u16,
804        dst_port: u16,
805    ) -> Vec<u8> {
806        let mut frame = vec![0u8; 14 + 20 + 20]; // eth + ipv4 + tcp
807
808        // Ethernet header.
809        frame[12] = 0x08; // EtherType: IPv4
810        frame[13] = 0x00;
811
812        // IPv4 header.
813        let ip = &mut frame[14..34];
814        ip[0] = 0x45; // Version + IHL
815        let total_len = 40u16; // 20 (IP) + 20 (TCP)
816        ip[2..4].copy_from_slice(&total_len.to_be_bytes());
817        ip[6] = 0x40; // Don't Fragment
818        ip[8] = 64; // TTL
819        ip[9] = 6; // Protocol: TCP
820        ip[12..16].copy_from_slice(&src_ip);
821        ip[16..20].copy_from_slice(&dst_ip);
822
823        // TCP header.
824        let tcp = &mut frame[34..54];
825        tcp[0..2].copy_from_slice(&src_port.to_be_bytes());
826        tcp[2..4].copy_from_slice(&dst_port.to_be_bytes());
827        tcp[12] = 0x50; // Data offset: 5 words
828        tcp[13] = 0x02; // SYN flag
829
830        frame
831    }
832
833    /// Build a minimal Ethernet + IPv4 + UDP frame.
834    fn build_udp_frame(src_ip: [u8; 4], dst_ip: [u8; 4], src_port: u16, dst_port: u16) -> Vec<u8> {
835        let mut frame = vec![0u8; 14 + 20 + 8]; // eth + ipv4 + udp
836
837        // Ethernet header.
838        frame[12] = 0x08;
839        frame[13] = 0x00;
840
841        // IPv4 header.
842        let ip = &mut frame[14..34];
843        ip[0] = 0x45;
844        let total_len = 28u16; // 20 (IP) + 8 (UDP)
845        ip[2..4].copy_from_slice(&total_len.to_be_bytes());
846        ip[8] = 64;
847        ip[9] = 17; // Protocol: UDP
848        ip[12..16].copy_from_slice(&src_ip);
849        ip[16..20].copy_from_slice(&dst_ip);
850
851        // UDP header.
852        let udp = &mut frame[34..42];
853        udp[0..2].copy_from_slice(&src_port.to_be_bytes());
854        udp[2..4].copy_from_slice(&dst_port.to_be_bytes());
855        let udp_len = 8u16;
856        udp[4..6].copy_from_slice(&udp_len.to_be_bytes());
857
858        frame
859    }
860
861    /// Build a minimal Ethernet + IPv4 + ICMP echo request frame.
862    fn build_icmpv4_echo_frame(
863        src_mac: [u8; 6],
864        dst_mac: [u8; 6],
865        src_ip: [u8; 4],
866        dst_ip: [u8; 4],
867        ident: u16,
868        seq_no: u16,
869        data: &[u8],
870    ) -> Vec<u8> {
871        let ipv4_repr = Ipv4Repr {
872            src_addr: Ipv4Addr::from(src_ip),
873            dst_addr: Ipv4Addr::from(dst_ip),
874            next_header: IpProtocol::Icmp,
875            payload_len: 8 + data.len(),
876            hop_limit: 64,
877        };
878        let icmp_repr = Icmpv4Repr::EchoRequest {
879            ident,
880            seq_no,
881            data,
882        };
883        let frame_len = 14 + ipv4_repr.buffer_len() + icmp_repr.buffer_len();
884        let mut frame = vec![0u8; frame_len];
885
886        let mut eth_frame = EthernetFrame::new_unchecked(&mut frame);
887        EthernetRepr {
888            src_addr: EthernetAddress(src_mac),
889            dst_addr: EthernetAddress(dst_mac),
890            ethertype: EthernetProtocol::Ipv4,
891        }
892        .emit(&mut eth_frame);
893
894        ipv4_repr.emit(
895            &mut Ipv4Packet::new_unchecked(&mut frame[14..34]),
896            &ChecksumCapabilities::default(),
897        );
898        icmp_repr.emit(
899            &mut Icmpv4Packet::new_unchecked(&mut frame[34..]),
900            &ChecksumCapabilities::default(),
901        );
902
903        frame
904    }
905
906    /// Build a minimal Ethernet + ARP request frame.
907    fn build_arp_request_frame(src_mac: [u8; 6], src_ip: [u8; 4], target_ip: [u8; 4]) -> Vec<u8> {
908        let mut frame = vec![0u8; 14 + 28];
909
910        let mut eth_frame = EthernetFrame::new_unchecked(&mut frame);
911        EthernetRepr {
912            src_addr: EthernetAddress(src_mac),
913            dst_addr: EthernetAddress([0xff; 6]),
914            ethertype: EthernetProtocol::Arp,
915        }
916        .emit(&mut eth_frame);
917
918        ArpRepr::EthernetIpv4 {
919            operation: ArpOperation::Request,
920            source_hardware_addr: EthernetAddress(src_mac),
921            source_protocol_addr: Ipv4Addr::from(src_ip),
922            target_hardware_addr: EthernetAddress([0x00; 6]),
923            target_protocol_addr: Ipv4Addr::from(target_ip),
924        }
925        .emit(&mut ArpPacket::new_unchecked(&mut frame[14..]));
926
927        frame
928    }
929
930    #[test]
931    fn classify_tcp_syn() {
932        let frame = build_tcp_syn_frame([10, 0, 0, 2], [93, 184, 216, 34], 54321, 443);
933        match classify_frame(&frame) {
934            FrameAction::TcpSyn { src, dst } => {
935                assert_eq!(
936                    src,
937                    SocketAddr::new(Ipv4Addr::new(10, 0, 0, 2).into(), 54321)
938                );
939                assert_eq!(
940                    dst,
941                    SocketAddr::new(Ipv4Addr::new(93, 184, 216, 34).into(), 443)
942                );
943            }
944            _ => panic!("expected TcpSyn"),
945        }
946    }
947
948    #[test]
949    fn classify_tcp_ack_is_passthrough() {
950        let mut frame = build_tcp_syn_frame([10, 0, 0, 2], [93, 184, 216, 34], 54321, 443);
951        // Change flags to ACK only (not SYN).
952        frame[34 + 13] = 0x10; // ACK flag
953        assert!(matches!(classify_frame(&frame), FrameAction::Passthrough));
954    }
955
956    #[test]
957    fn classify_udp_dns() {
958        let frame = build_udp_frame([10, 0, 0, 2], [10, 0, 0, 1], 12345, 53);
959        assert!(matches!(classify_frame(&frame), FrameAction::Dns));
960    }
961
962    #[test]
963    fn classify_udp_non_dns() {
964        let frame = build_udp_frame([10, 0, 0, 2], [8, 8, 8, 8], 12345, 443);
965        match classify_frame(&frame) {
966            FrameAction::UdpRelay { src, dst } => {
967                assert_eq!(src.port(), 12345);
968                assert_eq!(dst.port(), 443);
969            }
970            _ => panic!("expected UdpRelay"),
971        }
972    }
973
974    #[test]
975    fn classify_arp_is_passthrough() {
976        let mut frame = vec![0u8; 42]; // ARP frame
977        frame[12] = 0x08;
978        frame[13] = 0x06; // EtherType: ARP
979        assert!(matches!(classify_frame(&frame), FrameAction::Passthrough));
980    }
981
982    #[test]
983    fn classify_garbage_is_passthrough() {
984        assert!(matches!(classify_frame(&[]), FrameAction::Passthrough));
985        assert!(matches!(classify_frame(&[0; 5]), FrameAction::Passthrough));
986    }
987
988    #[test]
989    fn gateway_replies_to_icmp_echo_requests() {
990        fn drive_one_frame(
991            device: &mut SmoltcpDevice,
992            iface: &mut Interface,
993            sockets: &mut SocketSet<'_>,
994            shared: &Arc<SharedState>,
995            poll_config: &PollLoopConfig,
996            now: Instant,
997        ) {
998            let frame = device.stage_next_frame().expect("expected staged frame");
999            if handle_gateway_icmp_echo(frame, poll_config, shared) {
1000                device.drop_staged_frame();
1001                return;
1002            }
1003            let _ = iface.poll_ingress_single(now, device, sockets);
1004            let _ = iface.poll_egress(now, device, sockets);
1005        }
1006
1007        let shared = Arc::new(SharedState::new(4));
1008        let poll_config = PollLoopConfig {
1009            gateway_mac: [0x02, 0x00, 0x00, 0x00, 0x00, 0x01],
1010            guest_mac: [0x02, 0x00, 0x00, 0x00, 0x00, 0x02],
1011            gateway: GatewayIps {
1012                ipv4: Ipv4Addr::new(100, 96, 0, 1),
1013                ipv6: Ipv6Addr::LOCALHOST,
1014            },
1015            guest_ipv4: Ipv4Addr::new(100, 96, 0, 2),
1016            mtu: 1500,
1017        };
1018        let mut device = SmoltcpDevice::new(shared.clone(), poll_config.mtu);
1019        let mut iface = create_interface(&mut device, &poll_config);
1020        let mut sockets = SocketSet::new(vec![]);
1021        let now = smoltcp_now();
1022
1023        // Mirror the real guest flow: resolve the gateway MAC before sending
1024        // the ICMP echo request.
1025        shared
1026            .tx_ring
1027            .push(build_arp_request_frame(
1028                poll_config.guest_mac,
1029                poll_config.guest_ipv4.octets(),
1030                poll_config.gateway.ipv4.octets(),
1031            ))
1032            .unwrap();
1033        shared
1034            .tx_ring
1035            .push(build_icmpv4_echo_frame(
1036                poll_config.guest_mac,
1037                poll_config.gateway_mac,
1038                poll_config.guest_ipv4.octets(),
1039                poll_config.gateway.ipv4.octets(),
1040                0x1234,
1041                0xABCD,
1042                b"ping",
1043            ))
1044            .unwrap();
1045
1046        drive_one_frame(
1047            &mut device,
1048            &mut iface,
1049            &mut sockets,
1050            &shared,
1051            &poll_config,
1052            now,
1053        );
1054        let _ = shared.rx_ring.pop().expect("expected ARP reply");
1055
1056        drive_one_frame(
1057            &mut device,
1058            &mut iface,
1059            &mut sockets,
1060            &shared,
1061            &poll_config,
1062            now,
1063        );
1064
1065        let reply = shared.rx_ring.pop().expect("expected ICMP echo reply");
1066        let eth = EthernetFrame::new_checked(&reply).expect("valid ethernet frame");
1067        assert_eq!(eth.src_addr(), EthernetAddress(poll_config.gateway_mac));
1068        assert_eq!(eth.dst_addr(), EthernetAddress(poll_config.guest_mac));
1069        assert_eq!(eth.ethertype(), EthernetProtocol::Ipv4);
1070
1071        let ipv4 = Ipv4Packet::new_checked(eth.payload()).expect("valid IPv4 packet");
1072        assert_eq!(ipv4.src_addr(), poll_config.gateway.ipv4);
1073        assert_eq!(ipv4.dst_addr(), poll_config.guest_ipv4);
1074        assert_eq!(ipv4.next_header(), IpProtocol::Icmp);
1075
1076        let icmp = Icmpv4Packet::new_checked(ipv4.payload()).expect("valid ICMP packet");
1077        let icmp_repr = Icmpv4Repr::parse(&icmp, &ChecksumCapabilities::default())
1078            .expect("valid ICMP echo reply");
1079        assert_eq!(
1080            icmp_repr,
1081            Icmpv4Repr::EchoReply {
1082                ident: 0x1234,
1083                seq_no: 0xABCD,
1084                data: b"ping",
1085            }
1086        );
1087    }
1088
1089    fn test_gateway() -> GatewayIps {
1090        GatewayIps {
1091            ipv4: Ipv4Addr::new(100, 96, 0, 1),
1092            ipv6: "fd42:6d73:62::1".parse().unwrap(),
1093        }
1094    }
1095
1096    #[test]
1097    fn resolve_host_dst_matches_ipv4() {
1098        let gw = test_gateway();
1099        let dst = SocketAddr::new(IpAddr::V4(gw.ipv4), 8080);
1100        assert_eq!(
1101            resolve_host_dst(dst, gw),
1102            SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 8080)
1103        );
1104    }
1105
1106    #[test]
1107    fn resolve_host_dst_matches_ipv6() {
1108        let gw = test_gateway();
1109        let dst = SocketAddr::new(IpAddr::V6(gw.ipv6), 8080);
1110        assert_eq!(
1111            resolve_host_dst(dst, gw),
1112            SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), 8080)
1113        );
1114    }
1115
1116    #[test]
1117    fn resolve_host_dst_passes_through_non_gateway() {
1118        let gw = test_gateway();
1119        let dst = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(8, 8, 8, 8)), 443);
1120        assert_eq!(resolve_host_dst(dst, gw), dst);
1121    }
1122
1123    #[test]
1124    fn external_icmp_echo_requests_are_not_answered_locally() {
1125        fn drive_one_frame(
1126            device: &mut SmoltcpDevice,
1127            iface: &mut Interface,
1128            sockets: &mut SocketSet<'_>,
1129            shared: &Arc<SharedState>,
1130            poll_config: &PollLoopConfig,
1131            now: Instant,
1132        ) {
1133            let frame = device.stage_next_frame().expect("expected staged frame");
1134            if handle_gateway_icmp_echo(frame, poll_config, shared) {
1135                device.drop_staged_frame();
1136                return;
1137            }
1138            let _ = iface.poll_ingress_single(now, device, sockets);
1139            let _ = iface.poll_egress(now, device, sockets);
1140        }
1141
1142        let shared = Arc::new(SharedState::new(4));
1143        let poll_config = PollLoopConfig {
1144            gateway_mac: [0x02, 0x00, 0x00, 0x00, 0x00, 0x01],
1145            guest_mac: [0x02, 0x00, 0x00, 0x00, 0x00, 0x02],
1146            gateway: GatewayIps {
1147                ipv4: Ipv4Addr::new(100, 96, 0, 1),
1148                ipv6: Ipv6Addr::LOCALHOST,
1149            },
1150            guest_ipv4: Ipv4Addr::new(100, 96, 0, 2),
1151            mtu: 1500,
1152        };
1153        let mut device = SmoltcpDevice::new(shared.clone(), poll_config.mtu);
1154        let mut iface = create_interface(&mut device, &poll_config);
1155        let mut sockets = SocketSet::new(vec![]);
1156        let now = smoltcp_now();
1157
1158        shared
1159            .tx_ring
1160            .push(build_arp_request_frame(
1161                poll_config.guest_mac,
1162                poll_config.guest_ipv4.octets(),
1163                poll_config.gateway.ipv4.octets(),
1164            ))
1165            .unwrap();
1166        shared
1167            .tx_ring
1168            .push(build_icmpv4_echo_frame(
1169                poll_config.guest_mac,
1170                poll_config.gateway_mac,
1171                poll_config.guest_ipv4.octets(),
1172                [142, 251, 216, 46],
1173                0x1234,
1174                0xABCD,
1175                b"ping",
1176            ))
1177            .unwrap();
1178
1179        drive_one_frame(
1180            &mut device,
1181            &mut iface,
1182            &mut sockets,
1183            &shared,
1184            &poll_config,
1185            now,
1186        );
1187        let _ = shared.rx_ring.pop().expect("expected ARP reply");
1188
1189        drive_one_frame(
1190            &mut device,
1191            &mut iface,
1192            &mut sockets,
1193            &shared,
1194            &poll_config,
1195            now,
1196        );
1197        assert!(
1198            shared.rx_ring.pop().is_none(),
1199            "external ICMP should not be answered locally"
1200        );
1201    }
1202}