Skip to main content

zlayer_overlayd/
netlink.rs

1//! Rust netlink helpers that replace shell-outs to `ip`/`nsenter`/`sysctl`
2//! for per-container overlay network setup.
3//!
4//! This module is populated incrementally through a phased migration.
5//! Stage 1: `move_link_into_netns_and_rename` replaces the shell pair
6//!          `ip link set <name> netns <pid>` + `nsenter -t <pid> -n ip
7//!          link set <name> name <new>` with a single atomic RTNETLINK
8//!          `SetLink` carrying both `IFLA_NET_NS_FD` and `IFLA_IFNAME`.
9//!          This bypasses the `/proc/<pid>/ns/net` access problem caused
10//!          by libcontainer setting `PR_SET_DUMPABLE(false)` on the
11//!          container init process under `SELinux` enforcing.
12//! Stage 2: `create_veth_pair`, `delete_link_by_name`, and
13//!          `set_link_up_by_name` replace the host-side veth shell
14//!          commands (`ip link add ... type veth peer name ...`,
15//!          `ip link delete ...`, `ip link set ... up`) used by
16//!          `overlay_manager::attach_to_interface` and the orphan
17//!          sweeper. These helpers talk RTNETLINK directly via the
18//!          `rtnetlink` crate (async, tokio-backed).
19//! Stage 3: `with_netns`, `add_address_to_link_by_name`, and
20//!          `add_default_route_via_dev` replace the remaining
21//!          container-netns shell-outs in
22//!          `overlay_manager::attach_to_interface`. `with_netns`
23//!          runs a closure on a dedicated OS thread that has joined
24//!          the target container's network namespace via `setns(2)`,
25//!          while the two new RTNETLINK helpers operate on the
26//!          current netns (so they must be invoked from inside a
27//!          `with_netns` closure). This removes the last three
28//!          `nsenter -t <pid> -n ip ...` shell-outs used to assign
29//!          the container IP, bring `eth0` / `lo` up, and add the
30//!          default route.
31
32#![cfg_attr(
33    not(target_os = "linux"),
34    allow(clippy::missing_errors_doc, clippy::unused_async)
35)]
36
37use thiserror::Error;
38
39/// Errors returned by the netlink helpers in this module.
40#[derive(Debug, Error)]
41pub enum NetlinkError {
42    /// Failed to open or access a file (typically `/proc/<pid>/ns/net`).
43    #[error("io error: {0}")]
44    Io(#[from] std::io::Error),
45
46    /// The requested link was not found in the current network namespace.
47    #[error("link '{0}' not found in current netns")]
48    NotFound(String),
49
50    /// A netlink operation failed.
51    #[error("netlink operation failed: {0}")]
52    Netlink(String),
53}
54
55/// Move a link from the current network namespace into the network
56/// namespace referenced by `ns_fd`, renaming it in the same atomic
57/// operation.
58///
59/// This is the fd-based variant of [`move_link_into_netns_and_rename`].
60/// Callers that have already opened `/proc/<pid>/ns/net` (e.g. to pin
61/// the namespace across multiple operations and survive a racing
62/// container init exit) should use this form so we don't reopen the
63/// path and lose the race.
64///
65/// The single RTNETLINK `SetLink` request carries both `IFLA_NET_NS_FD`
66/// and `IFLA_IFNAME`, so the kernel performs the move and the rename
67/// atomically.
68///
69/// # Errors
70///
71/// Returns [`NetlinkError::NotFound`] if `link_name` does not exist in
72/// the current netns. Returns [`NetlinkError::Netlink`] for any other
73/// netlink-level failure (permission denied, name collision in the
74/// target netns, etc.).
75///
76/// Implemented directly against the `rtnetlink` crate (overlayd has no
77/// libcontainer dependency): a single `LinkSetRequest` carrying
78/// `setns_by_fd` + `name` performs the move and rename atomically.
79#[cfg(target_os = "linux")]
80pub fn move_link_into_netns_fd_and_rename(
81    link_name: &str,
82    ns_fd: std::os::fd::BorrowedFd<'_>,
83    new_name: &str,
84) -> Result<(), NetlinkError> {
85    use std::os::fd::AsRawFd;
86
87    // `setns` of the moved link must reference the fd while the request
88    // executes. We drive the rtnetlink sequence on a local current-thread
89    // runtime, but build+run that runtime on a DEDICATED OS thread (mirroring
90    // `with_netns_fd` below): the sole caller (`attach_to_interface`) invokes
91    // this from inside an async block, and building/blocking a runtime on a
92    // thread already driving tokio tasks panics with "Cannot start a runtime
93    // from within a runtime". A fresh OS thread has no ambient runtime, so
94    // `block_on` is legal there. The raw fd is process-global and the caller
95    // keeps `ns_fd`'s owner alive across the synchronous `join`, so the
96    // borrowed fd stays valid for the duration.
97    let raw_fd = ns_fd.as_raw_fd();
98    let link_name = link_name.to_string();
99    let new_name = new_name.to_string();
100
101    let join_handle = std::thread::spawn(move || -> Result<(), NetlinkError> {
102        let rt = tokio::runtime::Builder::new_current_thread()
103            .enable_all()
104            .build()
105            .map_err(|e| NetlinkError::Netlink(format!("local runtime build failed: {e}")))?;
106
107        rt.block_on(async move {
108            use futures_util::stream::TryStreamExt;
109
110            let (connection, handle, _) = rtnetlink::new_connection()
111                .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
112            tokio::spawn(connection);
113
114            // Resolve the host-side interface index. Treat "No such device"
115            // as our dedicated NotFound variant so callers can distinguish
116            // "nothing to move" from real failures.
117            let link = handle
118                .link()
119                .get()
120                .match_name(link_name.clone())
121                .execute()
122                .try_next()
123                .await
124                .map_err(|e| {
125                    let msg = e.to_string();
126                    if msg.contains("No such device") {
127                        NetlinkError::NotFound(link_name.clone())
128                    } else {
129                        NetlinkError::Netlink(format!("link lookup failed for {link_name}: {msg}"))
130                    }
131                })?
132                .ok_or_else(|| NetlinkError::NotFound(link_name.clone()))?;
133
134            let index = link.header.index;
135
136            // Atomically move the link into the target netns and rename it.
137            handle
138                .link()
139                .set(index)
140                .setns_by_fd(raw_fd)
141                .name(new_name.clone())
142                .execute()
143                .await
144                .map_err(|e| {
145                    NetlinkError::Netlink(format!(
146                        "setns_by_fd(index={index}, new_name={new_name}) failed: {e}"
147                    ))
148                })
149        })
150    });
151
152    join_handle
153        .join()
154        .map_err(|_| NetlinkError::Netlink("move_link_into_netns thread panicked".to_string()))?
155}
156
/// Placeholder for Unix targets other than Linux (macOS/BSD).
///
/// Deliberately not compiled on Windows: `attach_container` (the sole
/// caller chain) is itself gated `#[cfg(target_os = "linux")]` in
/// `server.rs`, so there are no Windows callers, and the `BorrowedFd`
/// parameter type only exists on Unix.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`], because the operation
/// is not implemented for the current target.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn move_link_into_netns_fd_and_rename(
    _link_name: &str,
    _ns_fd: std::os::fd::BorrowedFd<'_>,
    _new_name: &str,
) -> Result<(), NetlinkError> {
    let reason = String::from("move_link_into_netns_fd_and_rename is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
177
178/// Move a link from the current network namespace into the target PID's
179/// network namespace, renaming it in the same atomic operation.
180///
181/// Thin wrapper around [`move_link_into_netns_fd_and_rename`] that
182/// opens `/proc/<target_pid>/ns/net` then delegates. Kept for
183/// backward compatibility and for callers that only need a single
184/// operation on the target netns. Callers that need to perform
185/// multiple operations on the same netns (and want to survive a
186/// racing exit of the container init process) should open the fd
187/// themselves and call [`move_link_into_netns_fd_and_rename`]
188/// directly.
189///
190/// # Errors
191///
192/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot be
193/// opened (e.g. the container process is gone or is not dumpable and we
194/// lack `CAP_SYS_PTRACE`). Returns [`NetlinkError::NotFound`] if
195/// `link_name` does not exist in the current netns. Returns
196/// [`NetlinkError::Netlink`] for any other netlink-level failure
197/// (permission denied, name collision in the target netns, etc.).
198#[cfg(target_os = "linux")]
199pub fn move_link_into_netns_and_rename(
200    link_name: &str,
201    target_pid: u32,
202    new_name: &str,
203) -> Result<(), NetlinkError> {
204    use std::os::fd::{AsFd, OwnedFd};
205
206    let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
207    let ns_fd: OwnedFd = OwnedFd::from(ns_file);
208    move_link_into_netns_fd_and_rename(link_name, ns_fd.as_fd(), new_name)
209}
210
/// Placeholder for targets other than Linux.
///
/// Exists so that code referring to this function still compiles on every
/// platform; the operation itself is not implemented here.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub fn move_link_into_netns_and_rename(
    _link_name: &str,
    _target_pid: u32,
    _new_name: &str,
) -> Result<(), NetlinkError> {
    let reason = "move_link_into_netns_and_rename is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
224
225/// Create a veth pair with the two ends named `host_name` and `peer_name`.
226///
227/// Both ends start in the current network namespace. The caller is
228/// responsible for moving the peer end into the container netns (see
229/// [`move_link_into_netns_and_rename`]) and bringing the host end up
230/// (see [`set_link_up_by_name`]).
231///
232/// Replaces the shell-out:
233///   ip link add `<host_name>` type veth peer name `<peer_name>`
234///
235/// # Errors
236///
237/// Returns [`NetlinkError::Netlink`] if RTNETLINK fails for any
238/// reason. `EEXIST` / "File exists" is surfaced verbatim so the caller
239/// can distinguish a leaked endpoint (typically a sign the orphan
240/// sweeper missed something) from a permission or interface-name
241/// problem.
242#[cfg(target_os = "linux")]
243pub async fn create_veth_pair(host_name: &str, peer_name: &str) -> Result<(), NetlinkError> {
244    let (connection, handle, _) = rtnetlink::new_connection()
245        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
246    tokio::spawn(connection);
247
248    handle
249        .link()
250        .add()
251        .veth(host_name.to_string(), peer_name.to_string())
252        .execute()
253        .await
254        .map_err(|e| {
255            let msg = e.to_string();
256            if msg.contains("File exists") || msg.contains("EEXIST") {
257                NetlinkError::Netlink(format!(
258                    "veth pair already exists: host={host_name} peer={peer_name}: {msg}"
259                ))
260            } else {
261                NetlinkError::Netlink(format!(
262                    "veth create failed (host={host_name}, peer={peer_name}): {msg}"
263                ))
264            }
265        })
266}
267
/// Placeholder for targets other than Linux; the helper is not implemented
/// there.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn create_veth_pair(_host_name: &str, _peer_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("create_veth_pair is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
275
276/// Delete the link by name. Idempotent: returns `Ok(())` if the link
277/// does not exist. Any other error surfaces as
278/// [`NetlinkError::Netlink`].
279///
280/// Replaces the shell-out:
281///   ip link delete `<name>`
282///
283/// Used in `overlay_manager::attach_to_interface` pre-cleanup,
284/// cleanup-on-error, and the orphan-veth sweeper.
285///
286/// # Errors
287///
288/// Returns [`NetlinkError::Netlink`] if RTNETLINK reports a failure
289/// other than `ENODEV` / "No such device" (which are treated as
290/// success so this is safe to call unconditionally).
291#[cfg(target_os = "linux")]
292pub async fn delete_link_by_name(name: &str) -> Result<(), NetlinkError> {
293    use futures_util::stream::TryStreamExt;
294
295    let (connection, handle, _) = rtnetlink::new_connection()
296        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
297    tokio::spawn(connection);
298
299    // Look up the link by name. Treat "not found" as success so the
300    // helper is safe to call unconditionally in cleanup paths.
301    let lookup = handle
302        .link()
303        .get()
304        .match_name(name.to_string())
305        .execute()
306        .try_next()
307        .await;
308
309    let link = match lookup {
310        Ok(Some(link)) => link,
311        Ok(None) => return Ok(()),
312        Err(rtnetlink::Error::NetlinkError(err)) => {
313            // libc::ENODEV == 19. netlink-packet-core reports the raw
314            // errno as a negative i32 in `code`, but the exact type has
315            // moved between versions, so match by both numeric code and
316            // the human-readable message for belt-and-suspenders safety.
317            let msg = err.to_string();
318            let is_enodev = err
319                .code
320                .is_some_and(|c| c.get().unsigned_abs() == libc::ENODEV as u32);
321            if is_enodev || msg.contains("No such device") {
322                return Ok(());
323            }
324            return Err(NetlinkError::Netlink(format!(
325                "link lookup failed for {name}: {msg}"
326            )));
327        }
328        Err(e) => {
329            let msg = e.to_string();
330            if msg.contains("No such device") {
331                return Ok(());
332            }
333            return Err(NetlinkError::Netlink(format!(
334                "link lookup failed for {name}: {msg}"
335            )));
336        }
337    };
338
339    let index = link.header.index;
340
341    handle
342        .link()
343        .del(index)
344        .execute()
345        .await
346        .map_err(|e| NetlinkError::Netlink(format!("link delete failed for {name}: {e}")))
347}
348
/// Placeholder for targets other than Linux; the helper is not implemented
/// there.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn delete_link_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = "delete_link_by_name is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
356
357/// List all network interfaces in the current netns.
358///
359/// Returns a `Vec` of `(index, name)` tuples for every link the kernel
360/// reports. Used by the orphan veth sweeper to find `veth-<pid>` and
361/// `vc-<pid>` links whose owning PID is dead, so it can clean them up
362/// via [`delete_link_by_name`].
363///
364/// Replaces the shell-out:
365///   ip -br link
366///
367/// Issues a single RTNETLINK `RTM_GETLINK` dump request and iterates
368/// the resulting stream of `LinkMessage`s. Each message contributes
369/// one `(index, name)` tuple; messages without an `IFLA_IFNAME`
370/// attribute (extremely rare in practice — the kernel always emits
371/// one for configured devices) are silently skipped.
372///
373/// # Errors
374///
375/// Returns [`NetlinkError::Netlink`] if the rtnetlink socket cannot
376/// be created or if the dump stream itself reports a failure.
377#[cfg(target_os = "linux")]
378pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
379    use futures_util::stream::TryStreamExt;
380    use netlink_packet_route::link::LinkAttribute;
381
382    let (connection, handle, _) = rtnetlink::new_connection()
383        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
384    tokio::spawn(connection);
385
386    let mut stream = handle.link().get().execute();
387    let mut links = Vec::new();
388
389    while let Some(msg) = stream
390        .try_next()
391        .await
392        .map_err(|e| NetlinkError::Netlink(format!("link dump failed: {e}")))?
393    {
394        // LinkHeader.index is already u32 in netlink-packet-route
395        // 0.19 — no cast needed.
396        let index = msg.header.index;
397        let Some(name) = msg.attributes.iter().find_map(|a| match a {
398            LinkAttribute::IfName(n) => Some(n.clone()),
399            _ => None,
400        }) else {
401            continue;
402        };
403        links.push((index, name));
404    }
405
406    Ok(links)
407}
408
/// Placeholder for targets other than Linux; the helper is not implemented
/// there.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
    let reason = String::from("list_all_links is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
416
417/// Set the link identified by `name` to the "up" administrative state.
418///
419/// Replaces the shell-out:
420///   ip link set `<name>` up
421///
422/// Unlike [`delete_link_by_name`] this is *not* idempotent for missing
423/// links: if the link does not exist the caller almost certainly has a
424/// bug upstream (we only call this on a veth end we just created), so
425/// we return [`NetlinkError::NotFound`] rather than silently succeeding.
426///
427/// # Errors
428///
429/// Returns [`NetlinkError::NotFound`] if no link with the given name
430/// exists in the current netns. Returns [`NetlinkError::Netlink`] for
431/// any other RTNETLINK failure (permission denied, etc.).
432#[cfg(target_os = "linux")]
433pub async fn set_link_up_by_name(name: &str) -> Result<(), NetlinkError> {
434    use futures_util::stream::TryStreamExt;
435
436    let (connection, handle, _) = rtnetlink::new_connection()
437        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
438    tokio::spawn(connection);
439
440    let link = handle
441        .link()
442        .get()
443        .match_name(name.to_string())
444        .execute()
445        .try_next()
446        .await
447        .map_err(|e| {
448            let msg = e.to_string();
449            if msg.contains("No such device") {
450                NetlinkError::NotFound(name.to_string())
451            } else {
452                NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
453            }
454        })?
455        .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
456
457    let index = link.header.index;
458
459    handle
460        .link()
461        .set(index)
462        .up()
463        .execute()
464        .await
465        .map_err(|e| NetlinkError::Netlink(format!("link set up failed for {name}: {e}")))
466}
467
/// Placeholder for targets other than Linux; the helper is not implemented
/// there.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn set_link_up_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = "set_link_up_by_name is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
475
476/// Add an IP address to the link identified by `name` in the current
477/// network namespace.
478///
479/// Replaces (in combination with [`with_netns`]):
480///   nsenter -t `<pid>` -n ip \[-6\] addr add `<addr>/<prefix_len>` dev `<name>`
481///
482/// `addr` may be v4 or v6. `prefix_len` is the CIDR prefix length
483/// (24 for a `/24`, 64 for a `/64`, etc.).
484///
485/// This helper operates on the CURRENT network namespace — it looks
486/// up the interface index via a local rtnetlink socket. To target a
487/// container's netns, wrap the call inside [`with_netns`].
488///
489/// # Errors
490///
491/// Returns [`NetlinkError::NotFound`] if the link is missing. Returns
492/// [`NetlinkError::Netlink`] for any other rtnetlink failure
493/// (permission denied, EEXIST on a duplicate address, etc.).
494#[cfg(target_os = "linux")]
495pub async fn add_address_to_link_by_name(
496    name: &str,
497    addr: std::net::IpAddr,
498    prefix_len: u8,
499) -> Result<(), NetlinkError> {
500    use futures_util::stream::TryStreamExt;
501
502    let (connection, handle, _) = rtnetlink::new_connection()
503        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
504    tokio::spawn(connection);
505
506    let link = handle
507        .link()
508        .get()
509        .match_name(name.to_string())
510        .execute()
511        .try_next()
512        .await
513        .map_err(|e| {
514            let msg = e.to_string();
515            if msg.contains("No such device") {
516                NetlinkError::NotFound(name.to_string())
517            } else {
518                NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
519            }
520        })?
521        .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
522
523    let index = link.header.index;
524
525    handle
526        .address()
527        .add(index, addr, prefix_len)
528        .execute()
529        .await
530        .map_err(|e| {
531            NetlinkError::Netlink(format!(
532                "address add failed for {name} ({addr}/{prefix_len}): {e}"
533            ))
534        })
535}
536
/// Placeholder for targets other than Linux; the helper is not implemented
/// there.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_address_to_link_by_name(
    _name: &str,
    _addr: std::net::IpAddr,
    _prefix_len: u8,
) -> Result<(), NetlinkError> {
    let reason = String::from("add_address_to_link_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
548
549/// Remove ALL IPv4/IPv6 addresses currently assigned to the link named `name`.
550///
551/// Used to make service-bridge gateway assignment idempotent: a bridge that
552/// survives an overlayd/daemon restart is re-created (create is idempotent on
553/// EEXIST), and without flushing first, `add_address_to_link_by_name` would
554/// STACK the new gateway on top of the old one — exactly the observed bug where
555/// a bridge carried both a stale `/28` and the re-allocated `/26`. Flushing
556/// before re-adding guarantees a single address and self-heals such bridges.
557///
558/// No-ops (returns `Ok`) when the link is absent (ENODEV / "No such device").
559///
560/// # Errors
561/// Returns [`NetlinkError::Netlink`] on any rtnetlink failure other than a
562/// missing link (e.g. permission denied).
563#[cfg(target_os = "linux")]
564pub async fn flush_addresses_on_link_by_name(name: &str) -> Result<(), NetlinkError> {
565    use futures_util::stream::TryStreamExt;
566
567    let (connection, handle, _) = rtnetlink::new_connection()
568        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
569    tokio::spawn(connection);
570
571    let link = handle
572        .link()
573        .get()
574        .match_name(name.to_string())
575        .execute()
576        .try_next()
577        .await;
578
579    let link = match link {
580        Ok(Some(l)) => l,
581        // Absent link → nothing to flush.
582        Ok(None) => return Ok(()),
583        Err(e) => {
584            let msg = e.to_string();
585            if msg.contains("No such device") {
586                return Ok(());
587            }
588            return Err(NetlinkError::Netlink(format!(
589                "link lookup failed for {name}: {msg}"
590            )));
591        }
592    };
593
594    let index = link.header.index;
595
596    let addrs: Vec<_> = handle
597        .address()
598        .get()
599        .set_link_index_filter(index)
600        .execute()
601        .try_collect()
602        .await
603        .map_err(|e| NetlinkError::Netlink(format!("address list failed for {name}: {e}")))?;
604
605    for addr in addrs {
606        handle
607            .address()
608            .del(addr)
609            .execute()
610            .await
611            .map_err(|e| NetlinkError::Netlink(format!("address del failed for {name}: {e}")))?;
612    }
613
614    Ok(())
615}
616
/// Placeholder for targets other than Linux; the helper is not implemented
/// there.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn flush_addresses_on_link_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = "flush_addresses_on_link_by_name is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
624
625/// Add a default route via the given device name in the current
626/// network namespace.
627///
628/// Replaces (in combination with [`with_netns`]):
629///   nsenter -t `<pid>` -n ip \[-6\] route add default dev `<dev_name>`
630///
631/// The route is a direct, link-scope route: no gateway, the kernel
632/// ARPs / uses NDISC on the device for destination resolution. This
633/// is the correct form for a point-to-point veth link where the peer
634/// is reachable directly.
635///
636/// For IPv4 the destination prefix is `0.0.0.0/0`. For IPv6 it is
637/// `::/0`. Controlled by `is_v6`.
638///
639/// # Errors
640///
641/// Returns [`NetlinkError::NotFound`] if the device is missing.
642/// Returns [`NetlinkError::Netlink`] for any other rtnetlink failure.
643#[cfg(target_os = "linux")]
644pub async fn add_default_route_via_dev(dev_name: &str, is_v6: bool) -> Result<(), NetlinkError> {
645    use futures_util::stream::TryStreamExt;
646    use netlink_packet_route::route::RouteScope;
647
648    let (connection, handle, _) = rtnetlink::new_connection()
649        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
650    tokio::spawn(connection);
651
652    let link = handle
653        .link()
654        .get()
655        .match_name(dev_name.to_string())
656        .execute()
657        .try_next()
658        .await
659        .map_err(|e| {
660            let msg = e.to_string();
661            if msg.contains("No such device") {
662                NetlinkError::NotFound(dev_name.to_string())
663            } else {
664                NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
665            }
666        })?
667        .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
668
669    let oif_idx = link.header.index;
670
671    if is_v6 {
672        handle
673            .route()
674            .add()
675            .v6()
676            .destination_prefix(std::net::Ipv6Addr::UNSPECIFIED, 0)
677            .output_interface(oif_idx)
678            .scope(RouteScope::Link)
679            .execute()
680            .await
681            .map_err(|e| {
682                NetlinkError::Netlink(format!("default route add v6 via {dev_name} failed: {e}"))
683            })
684    } else {
685        handle
686            .route()
687            .add()
688            .v4()
689            .destination_prefix(std::net::Ipv4Addr::UNSPECIFIED, 0)
690            .output_interface(oif_idx)
691            .scope(RouteScope::Link)
692            .execute()
693            .await
694            .map_err(|e| {
695                NetlinkError::Netlink(format!("default route add v4 via {dev_name} failed: {e}"))
696            })
697    }
698}
699
/// Placeholder for targets other than Linux; the helper is not implemented
/// there.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_default_route_via_dev(_dev_name: &str, _is_v6: bool) -> Result<(), NetlinkError> {
    let reason = String::from("add_default_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
707
708/// Add a default route pointing at the given gateway IP in the current
709/// network namespace.
710///
711/// Replaces (in combination with [`with_netns`]):
712///   nsenter -t `<pid>` -n ip \[-6\] route add default via `<gateway>`
713///
714/// Used by the per-service bridge attach path: containers join the
715/// service bridge via a veth pair and reach the rest of the overlay
716/// through the bridge's L3 gateway IP. The address family of the route
717/// is inferred from `gateway`.
718///
719/// # Errors
720///
721/// Returns [`NetlinkError::Netlink`] for any rtnetlink failure.
722#[cfg(target_os = "linux")]
723pub async fn add_default_route_via_gateway(gateway: std::net::IpAddr) -> Result<(), NetlinkError> {
724    let (connection, handle, _) = rtnetlink::new_connection()
725        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
726    tokio::spawn(connection);
727
728    match gateway {
729        std::net::IpAddr::V4(gw) => handle
730            .route()
731            .add()
732            .v4()
733            .destination_prefix(std::net::Ipv4Addr::UNSPECIFIED, 0)
734            .gateway(gw)
735            .execute()
736            .await
737            .map_err(|e| {
738                NetlinkError::Netlink(format!("default route add v4 via gateway {gw} failed: {e}"))
739            }),
740        std::net::IpAddr::V6(gw) => handle
741            .route()
742            .add()
743            .v6()
744            .destination_prefix(std::net::Ipv6Addr::UNSPECIFIED, 0)
745            .gateway(gw)
746            .execute()
747            .await
748            .map_err(|e| {
749                NetlinkError::Netlink(format!("default route add v6 via gateway {gw} failed: {e}"))
750            }),
751    }
752}
753
/// Placeholder for targets other than Linux; the helper is not implemented
/// there.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_default_route_via_gateway(_gateway: std::net::IpAddr) -> Result<(), NetlinkError> {
    let reason = "add_default_route_via_gateway is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
761
762/// Add or replace a route to `dest/prefix_len` that forwards via the
763/// interface named `dev_name`. Optional `src` sets the preferred source
764/// address.
765///
766/// Replaces the shell-outs:
767///   ip route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
768///   ip -6 route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
769///
770/// Uses `NLM_F_REPLACE | NLM_F_CREATE` semantics (via rtnetlink's
771/// `.replace()` on the route add builder) so stale routes left behind
772/// by a previous daemon run don't cause `EEXIST`.
773///
774/// The route is installed with link scope (direct-via-dev, no
775/// gateway) which is the correct form for a per-container `/32` or
776/// `/128` pointing at a host-side veth endpoint.
777///
778/// `dest` and `src` (if provided) must have matching address families
779/// — passing a v4 `dest` with a v6 `src` returns
780/// [`NetlinkError::Netlink`] without touching the kernel.
781///
782/// # Errors
783///
784/// Returns [`NetlinkError::NotFound`] if `dev_name` does not exist in
785/// the current netns. Returns [`NetlinkError::Netlink`] on address
786/// family mismatch or any RTNETLINK failure.
787#[cfg(target_os = "linux")]
788pub async fn replace_route_via_dev(
789    dest: std::net::IpAddr,
790    prefix_len: u8,
791    dev_name: &str,
792    src: Option<std::net::IpAddr>,
793) -> Result<(), NetlinkError> {
794    use std::net::IpAddr;
795
796    use futures_util::stream::TryStreamExt;
797    use netlink_packet_route::route::RouteScope;
798
799    let (connection, handle, _) = rtnetlink::new_connection()
800        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
801    tokio::spawn(connection);
802
803    let link = handle
804        .link()
805        .get()
806        .match_name(dev_name.to_string())
807        .execute()
808        .try_next()
809        .await
810        .map_err(|e| {
811            let msg = e.to_string();
812            if msg.contains("No such device") {
813                NetlinkError::NotFound(dev_name.to_string())
814            } else {
815                NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
816            }
817        })?
818        .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
819
820    let oif_idx = link.header.index;
821
822    match (dest, src) {
823        (IpAddr::V4(d), Some(IpAddr::V4(s))) => handle
824            .route()
825            .add()
826            .v4()
827            .destination_prefix(d, prefix_len)
828            .output_interface(oif_idx)
829            .scope(RouteScope::Link)
830            .pref_source(s)
831            .replace()
832            .execute()
833            .await
834            .map_err(|e| {
835                NetlinkError::Netlink(format!(
836                    "route replace v4 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
837                ))
838            }),
839        (IpAddr::V4(d), None) => handle
840            .route()
841            .add()
842            .v4()
843            .destination_prefix(d, prefix_len)
844            .output_interface(oif_idx)
845            .scope(RouteScope::Link)
846            .replace()
847            .execute()
848            .await
849            .map_err(|e| {
850                NetlinkError::Netlink(format!(
851                    "route replace v4 {d}/{prefix_len} dev {dev_name} failed: {e}"
852                ))
853            }),
854        (IpAddr::V6(d), Some(IpAddr::V6(s))) => handle
855            .route()
856            .add()
857            .v6()
858            .destination_prefix(d, prefix_len)
859            .output_interface(oif_idx)
860            .scope(RouteScope::Link)
861            .pref_source(s)
862            .replace()
863            .execute()
864            .await
865            .map_err(|e| {
866                NetlinkError::Netlink(format!(
867                    "route replace v6 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
868                ))
869            }),
870        (IpAddr::V6(d), None) => handle
871            .route()
872            .add()
873            .v6()
874            .destination_prefix(d, prefix_len)
875            .output_interface(oif_idx)
876            .scope(RouteScope::Link)
877            .replace()
878            .execute()
879            .await
880            .map_err(|e| {
881                NetlinkError::Netlink(format!(
882                    "route replace v6 {d}/{prefix_len} dev {dev_name} failed: {e}"
883                ))
884            }),
885        (IpAddr::V4(_), Some(IpAddr::V6(_))) | (IpAddr::V6(_), Some(IpAddr::V4(_))) => Err(
886            NetlinkError::Netlink(format!("address family mismatch: dest={dest} src={src:?}")),
887        ),
888    }
889}
890
/// Fallback for non-Linux targets: route replacement is unavailable, so this
/// always fails with [`NetlinkError::Netlink`] and ignores every argument.
#[cfg(not(target_os = "linux"))]
pub async fn replace_route_via_dev(
    _dest: std::net::IpAddr,
    _prefix_len: u8,
    _dev_name: &str,
    _src: Option<std::net::IpAddr>,
) -> Result<(), NetlinkError> {
    let reason = String::from("replace_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
903
904/// Delete the link-scope route to `dest/prefix_len` that forwards via the
905/// interface named `dev_name` — the symmetric counterpart of
906/// [`replace_route_via_dev`] used by overlay teardown to revert the per-container
907/// host route.
908///
909/// Idempotent and best-effort: a missing device (`NotFound`) or a missing route
910/// (`ESRCH` / "No such process", which is how the kernel reports
911/// `RTM_DELROUTE` for a route that is already gone) is treated as success so
912/// teardown can call this unconditionally without aborting on a route a prior
913/// per-container detach already removed.
914///
915/// The route message is built with the *same* fields as the install path
916/// (link scope, destination prefix, output interface) so the kernel matches and
917/// removes exactly the route `replace_route_via_dev` installed. The `src`
918/// preferred-source is intentionally omitted from the match: the kernel keys the
919/// delete on dest + oif + scope, and including a stale src risks a false miss.
920///
921/// # Errors
922///
923/// Returns [`NetlinkError::Netlink`] only on a genuine RTNETLINK failure that is
924/// neither "device not found" nor "route not found".
925#[cfg(target_os = "linux")]
926pub async fn delete_route_via_dev(
927    dest: std::net::IpAddr,
928    prefix_len: u8,
929    dev_name: &str,
930) -> Result<(), NetlinkError> {
931    use std::net::IpAddr;
932
933    use futures_util::stream::TryStreamExt;
934    use netlink_packet_route::route::RouteScope;
935
936    let (connection, handle, _) = rtnetlink::new_connection()
937        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
938    tokio::spawn(connection);
939
940    // Resolve the output interface. A vanished device means the route went with
941    // it (deleting a link drops its routes), so treat NotFound as success.
942    let lookup = handle
943        .link()
944        .get()
945        .match_name(dev_name.to_string())
946        .execute()
947        .try_next()
948        .await;
949    let link = match lookup {
950        Ok(Some(link)) => link,
951        Ok(None) => return Ok(()),
952        Err(e) => {
953            let msg = e.to_string();
954            if msg.contains("No such device") {
955                return Ok(());
956            }
957            return Err(NetlinkError::Netlink(format!(
958                "link lookup failed for {dev_name}: {msg}"
959            )));
960        }
961    };
962    let oif_idx = link.header.index;
963
964    // Build the route message identically to the install path, then hand it to
965    // `del`. `message_mut().clone()` extracts the fully-formed RouteMessage from
966    // the add builder so the delete matches the exact route we installed.
967    let message = match dest {
968        IpAddr::V4(d) => {
969            let mut req = handle
970                .route()
971                .add()
972                .v4()
973                .destination_prefix(d, prefix_len)
974                .output_interface(oif_idx)
975                .scope(RouteScope::Link);
976            req.message_mut().clone()
977        }
978        IpAddr::V6(d) => {
979            let mut req = handle
980                .route()
981                .add()
982                .v6()
983                .destination_prefix(d, prefix_len)
984                .output_interface(oif_idx)
985                .scope(RouteScope::Link);
986            req.message_mut().clone()
987        }
988    };
989
990    match handle.route().del(message).execute().await {
991        Ok(()) => Ok(()),
992        Err(e) => {
993            let msg = e.to_string();
994            // ESRCH (3) "No such process" is the kernel's RTM_DELROUTE answer
995            // for an already-absent route; ENOENT likewise. Both mean "already
996            // gone" — success for an idempotent teardown.
997            if msg.contains("No such process")
998                || msg.contains("No such file")
999                || msg.contains("ESRCH")
1000                || msg.contains("ENOENT")
1001            {
1002                return Ok(());
1003            }
1004            Err(NetlinkError::Netlink(format!(
1005                "route delete {dest}/{prefix_len} dev {dev_name} failed: {msg}"
1006            )))
1007        }
1008    }
1009}
1010
/// Fallback for non-Linux targets: route deletion is unavailable, so this
/// always fails with [`NetlinkError::Netlink`] and ignores every argument.
#[cfg(not(target_os = "linux"))]
pub async fn delete_route_via_dev(
    _dest: std::net::IpAddr,
    _prefix_len: u8,
    _dev_name: &str,
) -> Result<(), NetlinkError> {
    let reason = String::from("delete_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1022
1023/// Set a sysctl via the `/proc/sys/...` filesystem.
1024///
1025/// `key` uses dotted form like `net.ipv4.ip_forward`; dots are
1026/// translated to path separators so the effective path is
1027/// `/proc/sys/net/ipv4/ip_forward`. Writes the string form of
1028/// `value` to the file.
1029///
1030/// Replaces the shell-outs:
1031///   sysctl -w `<key>`=`<value>`
1032///
1033/// Writing to `/proc/sys/...` is the kernel-standard way of setting
1034/// sysctls and works under any confinement that still allows write
1035/// access to `/proc/sys` (which the overlay manager needs anyway for
1036/// its other operations).
1037///
1038/// # Errors
1039///
1040/// Returns [`NetlinkError::Io`] if the write fails (e.g. permission
1041/// denied, file missing because the sysctl doesn't exist on this
1042/// kernel, etc.).
1043pub fn set_sysctl(key: &str, value: &str) -> Result<(), NetlinkError> {
1044    let path = format!("/proc/sys/{}", key.replace('.', "/"));
1045    std::fs::write(&path, value)?;
1046    Ok(())
1047}
1048
1049/// Read the current value of a sysctl via the `/proc/sys/...` filesystem.
1050///
1051/// `key` uses dotted form like `net.ipv4.ip_forward`; dots are translated
1052/// to path separators (`/proc/sys/net/ipv4/ip_forward`). The trailing
1053/// newline the kernel emits is trimmed.
1054///
1055/// Used by the overlay teardown path to learn whether a forwarding sysctl
1056/// was already `1` before the daemon touched it, so a clean shutdown only
1057/// reverts the bits the daemon itself turned on (never clobbering an
1058/// operator who deliberately enabled routing on the host).
1059///
1060/// # Errors
1061///
1062/// Returns [`NetlinkError::Io`] if the read fails (e.g. the sysctl does
1063/// not exist on this kernel, or `/proc/sys` is not readable under the
1064/// current confinement).
1065pub fn read_sysctl(key: &str) -> Result<String, NetlinkError> {
1066    let path = format!("/proc/sys/{}", key.replace('.', "/"));
1067    let raw = std::fs::read_to_string(&path)?;
1068    Ok(raw.trim().to_string())
1069}
1070
1071/// Run a synchronous closure inside the network namespace referenced
1072/// by the given `OwnedFd`.
1073///
1074/// This is the fd-based variant of [`with_netns`]. Callers that have
1075/// already opened `/proc/<pid>/ns/net` (e.g. to pin the namespace
1076/// across multiple operations) should use this form to reuse the
1077/// same fd and avoid re-opening the procfs path — the reopen would
1078/// fail with `ENOENT` if the container init process has exited in
1079/// the meantime, even though the namespace itself is still alive
1080/// because our pinned fd holds a reference.
1081///
1082/// The `OwnedFd` is moved into the dedicated worker thread and
1083/// closed when the thread exits. Spawns a fresh OS thread (not a
1084/// tokio blocking worker) because `setns` affects the whole thread
1085/// and we don't want to contaminate a shared worker.
1086///
1087/// # Errors
1088///
1089/// Returns [`NetlinkError::Netlink`] if `setns` fails or the
1090/// dedicated thread panics. Any error returned by the closure itself
1091/// is propagated verbatim.
1092#[cfg(target_os = "linux")]
1093pub fn with_netns_fd<F, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
1094where
1095    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
1096    T: Send + 'static,
1097{
1098    let join_handle = std::thread::spawn(move || -> Result<T, NetlinkError> {
1099        nix::sched::setns(&ns_fd, nix::sched::CloneFlags::CLONE_NEWNET)
1100            .map_err(|e| NetlinkError::Netlink(format!("setns(ns_fd) failed: {e}")))?;
1101        // Keep the fd alive for the duration of the closure even
1102        // though setns only needs it for the syscall itself. Dropping
1103        // it explicitly after the closure makes the lifetime obvious.
1104        let result = f();
1105        drop(ns_fd);
1106        result
1107    });
1108
1109    join_handle
1110        .join()
1111        .map_err(|_| NetlinkError::Netlink("with_netns_fd thread panicked".to_string()))?
1112}
1113
/// Fallback for Unix targets other than Linux (macOS/BSD): always fails with
/// [`NetlinkError::Netlink`]. It is not compiled on Windows, where `OwnedFd`
/// is unavailable and the only caller chain (`attach_to_interface` in
/// `overlay_manager.rs`) is gated on `#[cfg(target_os = "linux")]` anyway.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd<F, T>(_ns_fd: std::os::fd::OwnedFd, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = String::from("with_netns_fd is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1127
1128/// Run a synchronous closure inside the network namespace of the
1129/// given PID.
1130///
1131/// Thin wrapper around [`with_netns_fd`] that opens
1132/// `/proc/<target_pid>/ns/net` then delegates. Kept for backward
1133/// compatibility and for callers that only need a single operation
1134/// on the target netns. Callers that need to pin the namespace
1135/// across multiple operations (and survive a racing exit of the
1136/// container init) should open the fd themselves and call
1137/// [`with_netns_fd`] directly.
1138///
1139/// Because `setns` is synchronous and `rtnetlink` is async, the
1140/// typical usage pattern inside the closure is to build a local
1141/// current-thread tokio runtime and `block_on` the netlink calls.
1142/// See [`with_netns_async`] for a convenience wrapper that does
1143/// exactly this.
1144///
1145/// # Errors
1146///
1147/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot
1148/// be opened. Returns [`NetlinkError::Netlink`] if `setns` fails or
1149/// the dedicated thread panics. Any error returned by the closure
1150/// itself is propagated verbatim.
1151#[cfg(target_os = "linux")]
1152pub fn with_netns<F, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
1153where
1154    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
1155    T: Send + 'static,
1156{
1157    use std::os::fd::OwnedFd;
1158
1159    let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
1160    let ns_fd: OwnedFd = OwnedFd::from(ns_file);
1161    with_netns_fd(ns_fd, f)
1162}
1163
/// Fallback for non-Linux targets: joining a network namespace is
/// unavailable, so this always fails with [`NetlinkError::Netlink`] without
/// running the closure.
#[cfg(not(target_os = "linux"))]
pub fn with_netns<F, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = String::from("with_netns is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1175
1176/// Convenience wrapper around [`with_netns_fd`] that builds a local
1177/// current-thread tokio runtime inside the dedicated thread and
1178/// drives the provided async future to completion.
1179///
1180/// The future is produced by calling `f()` from inside the thread
1181/// that has already joined the target netns, so any rtnetlink
1182/// operations awaited inside the future will talk to the target
1183/// netns's kernel.
1184///
1185/// The local runtime is lightweight (single-thread, built per call)
1186/// and only drives a handful of netlink messages before being
1187/// dropped with the thread.
1188///
1189/// The `OwnedFd` is moved into the worker thread and closed when
1190/// the thread exits.
1191///
1192/// # Errors
1193///
1194/// Returns [`NetlinkError::Netlink`] per [`with_netns_fd`], plus
1195/// [`NetlinkError::Netlink`] if the local runtime fails to build.
1196/// Any error returned by the future is propagated verbatim.
1197#[cfg(target_os = "linux")]
1198pub fn with_netns_fd_async<F, Fut, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
1199where
1200    F: FnOnce() -> Fut + Send + 'static,
1201    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
1202    T: Send + 'static,
1203{
1204    with_netns_fd(ns_fd, move || {
1205        let rt = tokio::runtime::Builder::new_current_thread()
1206            .enable_all()
1207            .build()
1208            .map_err(|e| NetlinkError::Netlink(format!("local runtime build failed: {e}")))?;
1209        rt.block_on(f())
1210    })
1211}
1212
/// Fallback for Unix targets other than Linux (macOS/BSD): always fails with
/// [`NetlinkError::Netlink`]. It is not compiled on Windows, where `OwnedFd`
/// is unavailable and the only caller chain (`attach_to_interface` in
/// `overlay_manager.rs`) is gated on `#[cfg(target_os = "linux")]` anyway.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd_async<F, Fut, T>(
    _ns_fd: std::os::fd::OwnedFd,
    _f: F,
) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = String::from("with_netns_fd_async is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1230
1231/// Convenience wrapper around [`with_netns`] that builds a local
1232/// current-thread tokio runtime inside the dedicated thread and
1233/// drives the provided async future to completion.
1234///
1235/// Thin wrapper around [`with_netns_fd_async`] that opens
1236/// `/proc/<target_pid>/ns/net` then delegates.
1237///
1238/// # Errors
1239///
1240/// Returns [`NetlinkError::Io`] / [`NetlinkError::Netlink`] per
1241/// [`with_netns`], plus [`NetlinkError::Netlink`] if the local
1242/// runtime fails to build. Any error returned by the future is
1243/// propagated verbatim.
1244#[cfg(target_os = "linux")]
1245pub fn with_netns_async<F, Fut, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
1246where
1247    F: FnOnce() -> Fut + Send + 'static,
1248    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
1249    T: Send + 'static,
1250{
1251    use std::os::fd::OwnedFd;
1252
1253    let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
1254    let ns_fd: OwnedFd = OwnedFd::from(ns_file);
1255    with_netns_fd_async(ns_fd, f)
1256}
1257
/// Fallback for non-Linux targets: joining a network namespace is
/// unavailable, so this always fails with [`NetlinkError::Netlink`] without
/// invoking `_f`.
#[cfg(not(target_os = "linux"))]
pub fn with_netns_async<F, Fut, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = String::from("with_netns_async is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1270
1271/// Create a Linux bridge interface with the given name.
1272///
1273/// Replaces the shell-out:
1274///   ip link add name `<name>` type bridge
1275///
1276/// Idempotent: if a link with that name already exists this returns
1277/// `Ok(())`. This matches how the overlay manager's per-service bridge
1278/// creation path needs to behave — multiple containers landing on the
1279/// same service-on-node bridge must all see "bridge ready" after a
1280/// successful call without racing against existence checks.
1281///
1282/// The bridge is created in the current network namespace. Callers
1283/// that need a different netns should wrap with [`with_netns_async`].
1284/// The bridge is created in the administratively-down state — call
1285/// [`set_link_up_by_name`] separately once any other attributes
1286/// ([`set_bridge_stp`] etc.) have been applied.
1287///
1288/// # Errors
1289///
1290/// Returns [`NetlinkError::Netlink`] for any RTNETLINK failure other
1291/// than `EEXIST` (which is treated as success).
1292#[cfg(target_os = "linux")]
1293pub async fn create_bridge(name: &str) -> Result<(), NetlinkError> {
1294    let (connection, handle, _) = rtnetlink::new_connection()
1295        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
1296    tokio::spawn(connection);
1297
1298    match handle.link().add().bridge(name.to_string()).execute().await {
1299        Ok(()) => Ok(()),
1300        Err(rtnetlink::Error::NetlinkError(err)) => {
1301            // EEXIST means a link with this name already exists. We
1302            // intentionally do NOT verify that the existing link is
1303            // actually a bridge — callers using stable per-service
1304            // names own that invariant, and re-checking here would
1305            // require another rtnetlink round-trip on the hot path.
1306            let is_eexist = err
1307                .code
1308                .is_some_and(|c| c.get().unsigned_abs() == libc::EEXIST as u32);
1309            let msg = err.to_string();
1310            if is_eexist || msg.contains("File exists") {
1311                Ok(())
1312            } else {
1313                Err(NetlinkError::Netlink(format!(
1314                    "bridge create failed for {name}: {msg}"
1315                )))
1316            }
1317        }
1318        Err(e) => {
1319            let msg = e.to_string();
1320            if msg.contains("File exists") {
1321                Ok(())
1322            } else {
1323                Err(NetlinkError::Netlink(format!(
1324                    "bridge create failed for {name}: {msg}"
1325                )))
1326            }
1327        }
1328    }
1329}
1330
/// Fallback for non-Linux targets: bridges cannot be created, so this always
/// fails with [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn create_bridge(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("create_bridge is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1338
1339/// Delete the bridge interface with the given name.
1340///
1341/// Replaces the shell-out:
1342///   ip link delete `<name>` type bridge
1343///
1344/// Idempotent: returns `Ok(())` if the bridge does not exist.
1345/// Delegates to [`delete_link_by_name`] — from RTNETLINK's perspective
1346/// deleting a bridge is the same `RTM_DELLINK` as deleting any other
1347/// link, and `delete_link_by_name` already has the ENODEV-as-success
1348/// handling we want.
1349///
1350/// # Errors
1351///
1352/// Returns [`NetlinkError::Netlink`] for any RTNETLINK failure other
1353/// than `ENODEV` (which is treated as success).
1354#[cfg(target_os = "linux")]
1355pub async fn delete_bridge(name: &str) -> Result<(), NetlinkError> {
1356    delete_link_by_name(name).await
1357}
1358
/// Fallback for non-Linux targets: bridges cannot be deleted, so this always
/// fails with [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn delete_bridge(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("delete_bridge is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1366
1367/// Count the member links of a bridge by reading the kernel's canonical
1368/// bridge-port directory `/sys/class/net/<name>/brif/`. Returns 0 when the
1369/// directory is absent or unreadable — correct for a `-d` `WireGuard` device
1370/// (not a bridge, no `brif`) and for an empty bridge. Used by the orphan
1371/// prune's zero-member guard to reclaim only IDLE service bridges.
1372#[cfg(target_os = "linux")]
1373pub async fn bridge_member_count(name: &str) -> usize {
1374    let path = format!("/sys/class/net/{name}/brif");
1375    let Ok(mut entries) = tokio::fs::read_dir(&path).await else {
1376        return 0;
1377    };
1378    let mut count = 0usize;
1379    while let Ok(Some(_entry)) = entries.next_entry().await {
1380        count += 1;
1381    }
1382    count
1383}
1384
/// Fallback for non-Linux targets: per-service bridges only exist on Linux,
/// so there is never a `brif` member to count and the result is always 0.
#[cfg(not(target_os = "linux"))]
#[allow(clippy::unused_async)]
pub async fn bridge_member_count(_name: &str) -> usize {
    0_usize
}
1392
1393/// Attach `link` to `bridge` by setting the link's `IFLA_MASTER` to
1394/// the bridge's ifindex.
1395///
1396/// Replaces the shell-out:
1397///   ip link set `<link>` master `<bridge>`
1398///
1399/// Both interfaces must already exist in the current network
1400/// namespace. This is what the overlay manager will call to splice a
1401/// container's host-side veth end into the per-service bridge instead
1402/// of /32-routing it directly.
1403///
1404/// # Errors
1405///
1406/// Returns [`NetlinkError::NotFound`] if either `link` or `bridge`
1407/// does not exist in the current netns. Returns
1408/// [`NetlinkError::Netlink`] for any other RTNETLINK failure.
1409#[cfg(target_os = "linux")]
1410pub async fn add_link_to_bridge(link: &str, bridge: &str) -> Result<(), NetlinkError> {
1411    use futures_util::stream::TryStreamExt;
1412
1413    let (connection, handle, _) = rtnetlink::new_connection()
1414        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
1415    tokio::spawn(connection);
1416
1417    let bridge_link = handle
1418        .link()
1419        .get()
1420        .match_name(bridge.to_string())
1421        .execute()
1422        .try_next()
1423        .await
1424        .map_err(|e| {
1425            let msg = e.to_string();
1426            if msg.contains("No such device") {
1427                NetlinkError::NotFound(bridge.to_string())
1428            } else {
1429                NetlinkError::Netlink(format!("link lookup failed for {bridge}: {msg}"))
1430            }
1431        })?
1432        .ok_or_else(|| NetlinkError::NotFound(bridge.to_string()))?;
1433    let bridge_idx = bridge_link.header.index;
1434
1435    let member_link = handle
1436        .link()
1437        .get()
1438        .match_name(link.to_string())
1439        .execute()
1440        .try_next()
1441        .await
1442        .map_err(|e| {
1443            let msg = e.to_string();
1444            if msg.contains("No such device") {
1445                NetlinkError::NotFound(link.to_string())
1446            } else {
1447                NetlinkError::Netlink(format!("link lookup failed for {link}: {msg}"))
1448            }
1449        })?
1450        .ok_or_else(|| NetlinkError::NotFound(link.to_string()))?;
1451    let member_idx = member_link.header.index;
1452
1453    handle
1454        .link()
1455        .set(member_idx)
1456        .controller(bridge_idx)
1457        .execute()
1458        .await
1459        .map_err(|e| {
1460            NetlinkError::Netlink(format!(
1461                "set master failed: link={link} bridge={bridge}: {e}"
1462            ))
1463        })
1464}
1465
/// Fallback for non-Linux targets: bridge membership cannot be changed, so
/// this always fails with [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_link_to_bridge(_link: &str, _bridge: &str) -> Result<(), NetlinkError> {
    let reason = String::from("add_link_to_bridge is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1473
1474/// Enable or disable Spanning Tree Protocol (STP) on the named bridge.
1475///
1476/// STP is disabled by default on bridges created via [`create_bridge`]
1477/// (the kernel default for a freshly-created bridge is STP off), and
1478/// for `ZLayer`'s per-service bridges we want to keep it off: each
1479/// bridge is single-host, has no possibility of a loop, and STP's
1480/// initial 30s forwarding-delay would stall container traffic on
1481/// attach.
1482///
1483/// rtnetlink 0.14 does not expose a typed builder for `IFLA_BR_STP_STATE`
1484/// (it lives inside the nested `IFLA_LINKINFO` -> `IFLA_INFO_DATA` ->
1485/// `IFLA_BR_STP_STATE` attribute and the crate's bridge builder only
1486/// covers it at create-time, not as a post-create modification). The
1487/// portable kernel-supported alternative is the sysfs knob at
1488/// `/sys/class/net/<name>/bridge/stp_state`, which is what
1489/// `brctl stp <name> on|off` writes under the hood. We use the sysfs
1490/// path so the helper works on every kernel that has bridge support
1491/// without depending on an rtnetlink API surface that may move
1492/// between crate versions.
1493///
1494/// # Errors
1495///
1496/// Returns [`NetlinkError::NotFound`] if the bridge does not exist (no
1497/// `/sys/class/net/<name>/bridge` directory). Returns
1498/// [`NetlinkError::Io`] for any other write failure (permission
1499/// denied, the link exists but is not a bridge, etc.).
1500#[cfg(target_os = "linux")]
1501pub fn set_bridge_stp(name: &str, stp_on: bool) -> Result<(), NetlinkError> {
1502    let bridge_dir = format!("/sys/class/net/{name}/bridge");
1503    if !std::path::Path::new(&bridge_dir).exists() {
1504        return Err(NetlinkError::NotFound(name.to_string()));
1505    }
1506    let path = format!("{bridge_dir}/stp_state");
1507    let value = if stp_on { "1" } else { "0" };
1508    std::fs::write(&path, value)?;
1509    Ok(())
1510}
1511
/// Fallback for non-Linux targets: there is no bridge STP knob to write, so
/// this always fails with [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub fn set_bridge_stp(_name: &str, _stp_on: bool) -> Result<(), NetlinkError> {
    let reason = String::from("set_bridge_stp is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1519
1520#[cfg(test)]
1521mod tests {
1522    // The helpers and tests in this module are Linux-only (they require
1523    // netlink + CAP_NET_ADMIN). Keep imports/fixtures gated so the lib
1524    // tests still compile on Windows/macOS cross-checks.
1525    #[cfg(target_os = "linux")]
1526    use super::*;
1527
1528    /// Generate a short random-ish suffix for test interface names so
1529    /// parallel `cargo test` invocations don't collide. Bounded to 6
1530    /// chars so the full name (`zlb-` prefix + suffix) stays under the
1531    /// 15-char `IFNAMSIZ` limit.
1532    #[cfg(target_os = "linux")]
1533    fn rand_suffix() -> String {
1534        use std::time::{SystemTime, UNIX_EPOCH};
1535        const CHARS: &[u8] = b"0123456789abcdefghijklmnopqrstuvwxyz";
1536        let nanos = SystemTime::now()
1537            .duration_since(UNIX_EPOCH)
1538            .map_or(0, |d| d.subsec_nanos());
1539        // base36-ish, 6 chars
1540        let mut n = u64::from(nanos);
1541        let mut out = String::new();
1542        let base = CHARS.len() as u64;
1543        for _ in 0..6 {
1544            let idx = usize::try_from(n % base).unwrap_or(0);
1545            out.push(CHARS[idx] as char);
1546            n /= base;
1547        }
1548        out
1549    }
1550
1551    /// True when the process holds enough privilege to mutate netlink (root, or
1552    /// at least `CAP_NET_ADMIN`). The `#[ignore]`d root-gated tests below call
1553    /// this and return early (a skip, not a failure) when run via `--ignored` on
1554    /// an unprivileged host, mirroring the rest of the crate's "skip gracefully
1555    /// when not root" convention.
1556    #[cfg(target_os = "linux")]
1557    fn have_net_admin() -> bool {
1558        // SAFETY: `geteuid` is a pure read of the caller's effective uid with no
1559        // preconditions and no side effects.
1560        #[allow(unsafe_code)]
1561        let euid = unsafe { libc::geteuid() };
1562        if euid == 0 {
1563            return true;
1564        }
1565        // Non-root: probe whether netlink link creation actually works. A failure
1566        // to even open a netlink socket / create a link means no CAP_NET_ADMIN.
1567        // We don't leave anything behind on success (the caller's test does its
1568        // own create/cleanup); this is a cheap capability sniff via a throwaway
1569        // dummy that is immediately deleted.
1570        let probe = format!("zlcap-{}", rand_suffix());
1571        if probe.len() > 15 {
1572            return false;
1573        }
1574        let Ok(rt) = tokio::runtime::Builder::new_current_thread()
1575            .enable_all()
1576            .build()
1577        else {
1578            return false;
1579        };
1580        rt.block_on(async {
1581            if create_dummy(&probe).await.is_err() {
1582                return false;
1583            }
1584            let _ = delete_link_by_name(&probe).await;
1585            true
1586        })
1587    }
1588
1589    /// Query the kernel (via `ip route`) for whether a link-scope route to
1590    /// `dest/prefix` out of `dev` is present. Returns `false` when `ip` is
1591    /// missing or the route is absent. Used by the teardown round-trip tests to
1592    /// assert a route is actually installed/removed at the kernel level rather
1593    /// than trusting the helper's return value alone.
1594    #[cfg(target_os = "linux")]
1595    fn route_present(dest: &str, prefix: u8, dev: &str) -> bool {
1596        use std::process::Command;
1597        let target = format!("{dest}/{prefix}");
1598        let Ok(out) = Command::new("ip")
1599            .args(["route", "show", &target, "dev", dev])
1600            .output()
1601        else {
1602            return false;
1603        };
1604        out.status.success() && !out.stdout.is_empty()
1605    }
1606
1607    /// Create a dummy interface with the given name (used as a stand-in
1608    /// for a host-side veth end in `bridge_add_link_membership`).
1609    #[cfg(target_os = "linux")]
1610    async fn create_dummy(name: &str) -> Result<(), NetlinkError> {
1611        let (connection, handle, _) = rtnetlink::new_connection()
1612            .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
1613        tokio::spawn(connection);
1614        handle
1615            .link()
1616            .add()
1617            .dummy(name.to_string())
1618            .execute()
1619            .await
1620            .map_err(|e| NetlinkError::Netlink(format!("dummy create failed for {name}: {e}")))
1621    }
1622
1623    #[cfg(target_os = "linux")]
1624    #[tokio::test]
1625    #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1626    async fn bridge_create_idempotent() {
1627        let name = format!("zlb-{}", rand_suffix());
1628        assert!(name.len() <= 15, "interface name exceeds IFNAMSIZ: {name}");
1629
1630        // First create.
1631        create_bridge(&name).await.expect("first create_bridge");
1632        assert!(
1633            std::path::Path::new(&format!("/sys/class/net/{name}")).exists(),
1634            "bridge {name} should exist after create"
1635        );
1636
1637        // Second create on same name must be Ok.
1638        create_bridge(&name)
1639            .await
1640            .expect("second create_bridge should be idempotent");
1641
1642        // Delete and confirm gone.
1643        delete_bridge(&name).await.expect("delete_bridge");
1644        assert!(
1645            !std::path::Path::new(&format!("/sys/class/net/{name}")).exists(),
1646            "bridge {name} should be gone after delete"
1647        );
1648
1649        // Second delete on missing name must be Ok.
1650        delete_bridge(&name)
1651            .await
1652            .expect("second delete_bridge should be idempotent");
1653    }
1654
1655    /// Count the addresses currently assigned to a link (used by the flush test).
1656    #[cfg(target_os = "linux")]
1657    async fn count_addresses(name: &str) -> usize {
1658        use futures_util::stream::TryStreamExt;
1659        let (connection, handle, _) = rtnetlink::new_connection().expect("new_connection");
1660        tokio::spawn(connection);
1661        let link = handle
1662            .link()
1663            .get()
1664            .match_name(name.to_string())
1665            .execute()
1666            .try_next()
1667            .await
1668            .expect("link lookup")
1669            .expect("link present");
1670        let index = link.header.index;
1671        let addrs: Vec<_> = handle
1672            .address()
1673            .get()
1674            .set_link_index_filter(index)
1675            .execute()
1676            .try_collect()
1677            .await
1678            .expect("addr list");
1679        addrs.len()
1680    }
1681
1682    /// Regression for the dual `/28` + `/26` leak: a bridge re-created over a
1683    /// surviving link used to stack the new gateway on top of the stale one
1684    /// because nothing flushed first. `flush_addresses_on_link_by_name` must
1685    /// wipe all addresses so a re-add yields exactly one.
1686    #[cfg(target_os = "linux")]
1687    #[tokio::test]
1688    #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1689    async fn bridge_address_flush_removes_stale_then_single_readd() {
1690        use std::net::{IpAddr, Ipv4Addr};
1691        let name = format!("zlb-{}", rand_suffix());
1692        assert!(name.len() <= 15, "interface name exceeds IFNAMSIZ: {name}");
1693        create_bridge(&name).await.expect("create_bridge");
1694
1695        // Simulate the leak: a stale /28 plus a re-allocated /26 on one bridge.
1696        add_address_to_link_by_name(&name, IpAddr::V4(Ipv4Addr::new(10, 9, 0, 1)), 28)
1697            .await
1698            .expect("add /28");
1699        add_address_to_link_by_name(&name, IpAddr::V4(Ipv4Addr::new(10, 9, 1, 1)), 26)
1700            .await
1701            .expect("add /26");
1702        assert!(
1703            count_addresses(&name).await >= 2,
1704            "both addresses should be present before flush"
1705        );
1706
1707        // Flush wipes every address.
1708        flush_addresses_on_link_by_name(&name).await.expect("flush");
1709        assert_eq!(
1710            count_addresses(&name).await,
1711            0,
1712            "flush should remove all addresses"
1713        );
1714
1715        // Re-add exactly the gateway → exactly one address (the link stays down,
1716        // so no IPv6 link-local re-appears).
1717        add_address_to_link_by_name(&name, IpAddr::V4(Ipv4Addr::new(10, 9, 1, 1)), 26)
1718            .await
1719            .expect("re-add /26");
1720        assert_eq!(
1721            count_addresses(&name).await,
1722            1,
1723            "exactly one address after flush + re-add"
1724        );
1725
1726        delete_bridge(&name).await.expect("delete_bridge");
1727    }
1728
1729    /// The teardown-by-name and flush-before-add paths both call into the netlink
1730    /// helpers for links that may not exist; flushing an absent link must be a
1731    /// tolerant no-op, not an error.
1732    #[cfg(target_os = "linux")]
1733    #[tokio::test]
1734    #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1735    async fn flush_addresses_on_absent_link_is_ok() {
1736        let name = format!("zlb-{}", rand_suffix());
1737        flush_addresses_on_link_by_name(&name)
1738            .await
1739            .expect("flush on absent link should be Ok");
1740    }
1741
1742    #[cfg(target_os = "linux")]
1743    #[tokio::test]
1744    #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1745    async fn bridge_add_link_membership() {
1746        let suffix = rand_suffix();
1747        let bridge = format!("zlb-{suffix}");
1748        let dummy = format!("zld-{suffix}");
1749        assert!(bridge.len() <= 15);
1750        assert!(dummy.len() <= 15);
1751
1752        create_bridge(&bridge).await.expect("create_bridge");
1753        create_dummy(&dummy).await.expect("create_dummy");
1754
1755        add_link_to_bridge(&dummy, &bridge)
1756            .await
1757            .expect("add_link_to_bridge");
1758
1759        // The dummy's master/ifindex symlink should resolve to the
1760        // bridge's ifindex.
1761        let master_ifindex_path = format!("/sys/class/net/{dummy}/master/ifindex");
1762        let dummy_master_ifindex = std::fs::read_to_string(&master_ifindex_path)
1763            .expect("read dummy master ifindex")
1764            .trim()
1765            .parse::<u32>()
1766            .expect("parse dummy master ifindex");
1767
1768        let bridge_ifindex = std::fs::read_to_string(format!("/sys/class/net/{bridge}/ifindex"))
1769            .expect("read bridge ifindex")
1770            .trim()
1771            .parse::<u32>()
1772            .expect("parse bridge ifindex");
1773
1774        assert_eq!(
1775            dummy_master_ifindex, bridge_ifindex,
1776            "dummy's master ifindex should equal bridge's ifindex"
1777        );
1778
1779        // Cleanup.
1780        delete_link_by_name(&dummy).await.expect("delete dummy");
1781        delete_bridge(&bridge).await.expect("delete bridge");
1782    }
1783
1784    /// `bridge_member_count` must read the kernel's `brif` directory: an empty
1785    /// freshly-created bridge has 0 members, and attaching one dummy link yields
1786    /// exactly 1. This is the signal the orphan prune uses to decide a candidate
1787    /// bridge is idle and reclaimable.
1788    #[cfg(target_os = "linux")]
1789    #[tokio::test]
1790    #[ignore = "requires root or CAP_NET_ADMIN"]
1791    async fn bridge_member_count_counts_brif_entries() {
1792        let suffix = rand_suffix();
1793        let bridge = format!("zlb-{suffix}");
1794        let dummy = format!("zld-{suffix}");
1795        assert!(bridge.len() <= 15);
1796        assert!(dummy.len() <= 15);
1797
1798        // Fresh bridge: zero members.
1799        create_bridge(&bridge).await.expect("create_bridge");
1800        assert_eq!(
1801            bridge_member_count(&bridge).await,
1802            0,
1803            "freshly-created bridge should have 0 members"
1804        );
1805
1806        // Attach a dummy link → exactly one member.
1807        create_dummy(&dummy).await.expect("create_dummy");
1808        add_link_to_bridge(&dummy, &bridge)
1809            .await
1810            .expect("add_link_to_bridge");
1811        assert_eq!(
1812            bridge_member_count(&bridge).await,
1813            1,
1814            "bridge with one attached link should report 1 member"
1815        );
1816
1817        // Cleanup the links.
1818        delete_link_by_name(&dummy).await.expect("delete dummy");
1819        delete_bridge(&bridge).await.expect("delete bridge");
1820    }
1821
1822    #[cfg(target_os = "linux")]
1823    #[tokio::test]
1824    #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1825    async fn bridge_stp_off() {
1826        let name = format!("zlb-{}", rand_suffix());
1827        assert!(name.len() <= 15);
1828
1829        create_bridge(&name).await.expect("create_bridge");
1830
1831        set_bridge_stp(&name, false).expect("set_bridge_stp off");
1832        let stp_state = std::fs::read_to_string(format!("/sys/class/net/{name}/bridge/stp_state"))
1833            .expect("read stp_state")
1834            .trim()
1835            .to_string();
1836        assert_eq!(
1837            stp_state, "0",
1838            "stp_state should be 0 after set_bridge_stp(false)"
1839        );
1840
1841        // Cleanup.
1842        delete_bridge(&name).await.expect("delete_bridge");
1843    }
1844
1845    /// Full teardown round-trip for the host-side overlay netlink resources the
1846    /// daemon reverts on shutdown: create a veth pair + a bridge, install a host
1847    /// `/32` link-scope route via the host veth (the bridgeless per-container
1848    /// attach shape), then run the exact delete helpers
1849    /// `teardown_global_overlay` uses — `delete_route_via_dev`,
1850    /// `delete_link_by_name` — and assert each resource is actually gone at the
1851    /// kernel level (via `/sys/class/net` and `ip route`).
1852    ///
1853    /// This is the regression for the teardown fix: it validates the new
1854    /// `delete_route_via_dev` helper round-trips a real route AND that the delete
1855    /// idempotency holds (a second delete of an already-absent route/link must be
1856    /// `Ok`). Uses unique <=15-char names and tears everything down on every exit
1857    /// path *before* asserting, so a failed assertion still leaves the host clean.
1858    #[cfg(target_os = "linux")]
1859    #[tokio::test(flavor = "multi_thread")]
1860    #[ignore = "requires root or CAP_NET_ADMIN"]
1861    async fn teardown_deletes_route_veth_and_bridge() {
1862        use std::net::{IpAddr, Ipv4Addr};
1863
1864        if !have_net_admin() {
1865            eprintln!("skipping teardown_deletes_route_veth_and_bridge: no CAP_NET_ADMIN");
1866            return;
1867        }
1868
1869        let suffix = rand_suffix();
1870        let veth_host = format!("vh-{suffix}");
1871        let veth_peer = format!("vp-{suffix}");
1872        let bridge = format!("zlb-{suffix}");
1873        assert!(veth_host.len() <= 15, "veth host name exceeds IFNAMSIZ");
1874        assert!(veth_peer.len() <= 15, "veth peer name exceeds IFNAMSIZ");
1875        assert!(bridge.len() <= 15, "bridge name exceeds IFNAMSIZ");
1876
1877        let dest = IpAddr::V4(Ipv4Addr::new(10, 222, 0, 7));
1878        let dest_str = "10.222.0.7";
1879        let prefix: u8 = 32;
1880
1881        // --- setup: veth pair, bridge, and a /32 host route via the host veth ---
1882        create_veth_pair(&veth_host, &veth_peer)
1883            .await
1884            .expect("create_veth_pair");
1885        create_bridge(&bridge).await.expect("create_bridge");
1886        replace_route_via_dev(dest, prefix, &veth_host, None)
1887            .await
1888            .expect("replace_route_via_dev installs /32");
1889
1890        // Snapshot kernel presence BEFORE teardown.
1891        let route_was_present = route_present(dest_str, prefix, &veth_host);
1892        let veth_was_present =
1893            std::path::Path::new(&format!("/sys/class/net/{veth_host}")).exists();
1894        let bridge_was_present = std::path::Path::new(&format!("/sys/class/net/{bridge}")).exists();
1895
1896        // --- teardown: the exact helper sequence teardown_global_overlay drives,
1897        // in the same order (route first since it references the veth as its oif,
1898        // then the host veth, then the bridge). Collect results; the deletes are
1899        // best-effort so we capture them and run a belt-and-braces cleanup of any
1900        // straggler before asserting, keeping the host clean on a failed assert.
1901        let del_route = delete_route_via_dev(dest, prefix, &veth_host).await;
1902        let del_veth = delete_link_by_name(&veth_host).await;
1903        let del_bridge = delete_link_by_name(&bridge).await;
1904
1905        // Idempotency: a second delete of the now-absent route/links must be Ok
1906        // (this is what lets teardown run unconditionally over per-container
1907        // detach leftovers without aborting).
1908        let del_route_again = delete_route_via_dev(dest, prefix, &veth_host).await;
1909        let del_veth_again = delete_link_by_name(&veth_host).await;
1910        let del_bridge_again = delete_link_by_name(&bridge).await;
1911
1912        // Snapshot kernel absence AFTER teardown.
1913        let route_gone = !route_present(dest_str, prefix, &veth_host);
1914        let veth_gone = !std::path::Path::new(&format!("/sys/class/net/{veth_host}")).exists();
1915        let bridge_gone = !std::path::Path::new(&format!("/sys/class/net/{bridge}")).exists();
1916
1917        // Belt-and-braces: ensure nothing leaks even if an assertion below fails.
1918        let _ = delete_route_via_dev(dest, prefix, &veth_host).await;
1919        let _ = delete_link_by_name(&veth_host).await;
1920        let _ = delete_link_by_name(&veth_peer).await;
1921        let _ = delete_link_by_name(&bridge).await;
1922
1923        // --- assertions (all after cleanup) ---
1924        assert!(
1925            route_was_present,
1926            "the /32 route should exist after install"
1927        );
1928        assert!(veth_was_present, "the host veth should exist after create");
1929        assert!(bridge_was_present, "the bridge should exist after create");
1930
1931        del_route.expect("delete_route_via_dev should succeed");
1932        del_veth.expect("delete_link_by_name(veth) should succeed");
1933        del_bridge.expect("delete_link_by_name(bridge) should succeed");
1934
1935        del_route_again.expect("second delete_route_via_dev should be idempotent Ok");
1936        del_veth_again.expect("second delete_link_by_name(veth) should be idempotent Ok");
1937        del_bridge_again.expect("second delete_link_by_name(bridge) should be idempotent Ok");
1938
1939        assert!(route_gone, "the /32 route should be gone after teardown");
1940        assert!(veth_gone, "the host veth should be gone after teardown");
1941        assert!(bridge_gone, "the bridge should be gone after teardown");
1942    }
1943}