zlayer_overlayd/netlink.rs
1//! Rust netlink helpers that replace shell-outs to `ip`/`nsenter`/`sysctl`
2//! for per-container overlay network setup.
3//!
4//! This module is populated incrementally through a phased migration.
5//! Stage 1: `move_link_into_netns_and_rename` replaces the shell pair
6//! `ip link set <name> netns <pid>` + `nsenter -t <pid> -n ip
7//! link set <name> name <new>` with a single atomic RTNETLINK
8//! `SetLink` carrying both `IFLA_NET_NS_FD` and `IFLA_IFNAME`.
9//! This bypasses the `/proc/<pid>/ns/net` access problem caused
10//! by libcontainer setting `PR_SET_DUMPABLE(false)` on the
11//! container init process under `SELinux` enforcing.
12//! Stage 2: `create_veth_pair`, `delete_link_by_name`, and
13//! `set_link_up_by_name` replace the host-side veth shell
14//! commands (`ip link add ... type veth peer name ...`,
15//! `ip link delete ...`, `ip link set ... up`) used by
16//! `overlay_manager::attach_to_interface` and the orphan
17//! sweeper. These helpers talk RTNETLINK directly via the
18//! `rtnetlink` crate (async, tokio-backed).
19//! Stage 3: `with_netns`, `add_address_to_link_by_name`, and
20//! `add_default_route_via_dev` replace the remaining
21//! container-netns shell-outs in
22//! `overlay_manager::attach_to_interface`. `with_netns`
23//! runs a closure on a dedicated OS thread that has joined
24//! the target container's network namespace via `setns(2)`,
25//! while the two new RTNETLINK helpers operate on the
26//! current netns (so they must be invoked from inside a
27//! `with_netns` closure). This removes the last three
28//! `nsenter -t <pid> -n ip ...` shell-outs used to assign
29//! the container IP, bring `eth0` / `lo` up, and add the
30//! default route.
31
32#![cfg_attr(
33 not(target_os = "linux"),
34 allow(clippy::missing_errors_doc, clippy::unused_async)
35)]
36
37use thiserror::Error;
38
39/// Errors returned by the netlink helpers in this module.
40#[derive(Debug, Error)]
41pub enum NetlinkError {
42 /// Failed to open or access a file (typically `/proc/<pid>/ns/net`).
43 #[error("io error: {0}")]
44 Io(#[from] std::io::Error),
45
46 /// The requested link was not found in the current network namespace.
47 #[error("link '{0}' not found in current netns")]
48 NotFound(String),
49
50 /// A netlink operation failed.
51 #[error("netlink operation failed: {0}")]
52 Netlink(String),
53}
54
55/// Move a link from the current network namespace into the network
56/// namespace referenced by `ns_fd`, renaming it in the same atomic
57/// operation.
58///
59/// This is the fd-based variant of [`move_link_into_netns_and_rename`].
60/// Callers that have already opened `/proc/<pid>/ns/net` (e.g. to pin
61/// the namespace across multiple operations and survive a racing
62/// container init exit) should use this form so we don't reopen the
63/// path and lose the race.
64///
65/// The single RTNETLINK `SetLink` request carries both `IFLA_NET_NS_FD`
66/// and `IFLA_IFNAME`, so the kernel performs the move and the rename
67/// atomically.
68///
69/// # Errors
70///
71/// Returns [`NetlinkError::NotFound`] if `link_name` does not exist in
72/// the current netns. Returns [`NetlinkError::Netlink`] for any other
73/// netlink-level failure (permission denied, name collision in the
74/// target netns, etc.).
75///
76/// Implemented directly against the `rtnetlink` crate (overlayd has no
77/// libcontainer dependency): a single `LinkSetRequest` carrying
78/// `setns_by_fd` + `name` performs the move and rename atomically.
79#[cfg(target_os = "linux")]
80pub fn move_link_into_netns_fd_and_rename(
81 link_name: &str,
82 ns_fd: std::os::fd::BorrowedFd<'_>,
83 new_name: &str,
84) -> Result<(), NetlinkError> {
85 use std::os::fd::AsRawFd;
86
87 // `setns` of the moved link must reference the fd while the request
88 // executes. We drive the rtnetlink sequence on a local current-thread
89 // runtime, but build+run that runtime on a DEDICATED OS thread (mirroring
90 // `with_netns_fd` below): the sole caller (`attach_to_interface`) invokes
91 // this from inside an async block, and building/blocking a runtime on a
92 // thread already driving tokio tasks panics with "Cannot start a runtime
93 // from within a runtime". A fresh OS thread has no ambient runtime, so
94 // `block_on` is legal there. The raw fd is process-global and the caller
95 // keeps `ns_fd`'s owner alive across the synchronous `join`, so the
96 // borrowed fd stays valid for the duration.
97 let raw_fd = ns_fd.as_raw_fd();
98 let link_name = link_name.to_string();
99 let new_name = new_name.to_string();
100
101 let join_handle = std::thread::spawn(move || -> Result<(), NetlinkError> {
102 let rt = tokio::runtime::Builder::new_current_thread()
103 .enable_all()
104 .build()
105 .map_err(|e| NetlinkError::Netlink(format!("local runtime build failed: {e}")))?;
106
107 rt.block_on(async move {
108 use futures_util::stream::TryStreamExt;
109
110 let (connection, handle, _) = rtnetlink::new_connection()
111 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
112 tokio::spawn(connection);
113
114 // Resolve the host-side interface index. Treat "No such device"
115 // as our dedicated NotFound variant so callers can distinguish
116 // "nothing to move" from real failures.
117 let link = handle
118 .link()
119 .get()
120 .match_name(link_name.clone())
121 .execute()
122 .try_next()
123 .await
124 .map_err(|e| {
125 let msg = e.to_string();
126 if msg.contains("No such device") {
127 NetlinkError::NotFound(link_name.clone())
128 } else {
129 NetlinkError::Netlink(format!("link lookup failed for {link_name}: {msg}"))
130 }
131 })?
132 .ok_or_else(|| NetlinkError::NotFound(link_name.clone()))?;
133
134 let index = link.header.index;
135
136 // Atomically move the link into the target netns and rename it.
137 handle
138 .link()
139 .set(index)
140 .setns_by_fd(raw_fd)
141 .name(new_name.clone())
142 .execute()
143 .await
144 .map_err(|e| {
145 NetlinkError::Netlink(format!(
146 "setns_by_fd(index={index}, new_name={new_name}) failed: {e}"
147 ))
148 })
149 })
150 });
151
152 join_handle
153 .join()
154 .map_err(|_| NetlinkError::Netlink("move_link_into_netns thread panicked".to_string()))?
155}
156
/// Placeholder for Unix targets other than Linux (macOS/BSD).
///
/// Deliberately absent on Windows: `attach_container` (the sole caller
/// chain) is itself gated `#[cfg(target_os = "linux")]` in `server.rs`, so
/// no Windows caller exists, and the `BorrowedFd` parameter type is
/// Unix-only.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`]; the operation is not
/// available on this target.
#[cfg(all(unix, not(target_os = "linux")))]
pub fn move_link_into_netns_fd_and_rename(
    _link_name: &str,
    _ns_fd: std::os::fd::BorrowedFd<'_>,
    _new_name: &str,
) -> Result<(), NetlinkError> {
    let reason = String::from("move_link_into_netns_fd_and_rename is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
177
178/// Move a link from the current network namespace into the target PID's
179/// network namespace, renaming it in the same atomic operation.
180///
181/// Thin wrapper around [`move_link_into_netns_fd_and_rename`] that
182/// opens `/proc/<target_pid>/ns/net` then delegates. Kept for
183/// backward compatibility and for callers that only need a single
184/// operation on the target netns. Callers that need to perform
185/// multiple operations on the same netns (and want to survive a
186/// racing exit of the container init process) should open the fd
187/// themselves and call [`move_link_into_netns_fd_and_rename`]
188/// directly.
189///
190/// # Errors
191///
192/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot be
193/// opened (e.g. the container process is gone or is not dumpable and we
194/// lack `CAP_SYS_PTRACE`). Returns [`NetlinkError::NotFound`] if
195/// `link_name` does not exist in the current netns. Returns
196/// [`NetlinkError::Netlink`] for any other netlink-level failure
197/// (permission denied, name collision in the target netns, etc.).
198#[cfg(target_os = "linux")]
199pub fn move_link_into_netns_and_rename(
200 link_name: &str,
201 target_pid: u32,
202 new_name: &str,
203) -> Result<(), NetlinkError> {
204 use std::os::fd::{AsFd, OwnedFd};
205
206 let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
207 let ns_fd: OwnedFd = OwnedFd::from(ns_file);
208 move_link_into_netns_fd_and_rename(link_name, ns_fd.as_fd(), new_name)
209}
210
/// Non-Linux placeholder. Keeping the signature available on every
/// platform lets `overlay_manager.rs` stay platform-agnostic even though
/// the operation itself only exists on Linux.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub fn move_link_into_netns_and_rename(
    _link_name: &str,
    _target_pid: u32,
    _new_name: &str,
) -> Result<(), NetlinkError> {
    let reason = String::from("move_link_into_netns_and_rename is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
224
225/// Create a veth pair with the two ends named `host_name` and `peer_name`.
226///
227/// Both ends start in the current network namespace. The caller is
228/// responsible for moving the peer end into the container netns (see
229/// [`move_link_into_netns_and_rename`]) and bringing the host end up
230/// (see [`set_link_up_by_name`]).
231///
232/// Replaces the shell-out:
233/// ip link add `<host_name>` type veth peer name `<peer_name>`
234///
235/// # Errors
236///
237/// Returns [`NetlinkError::Netlink`] if RTNETLINK fails for any
238/// reason. `EEXIST` / "File exists" is surfaced verbatim so the caller
239/// can distinguish a leaked endpoint (typically a sign the orphan
240/// sweeper missed something) from a permission or interface-name
241/// problem.
242#[cfg(target_os = "linux")]
243pub async fn create_veth_pair(host_name: &str, peer_name: &str) -> Result<(), NetlinkError> {
244 let (connection, handle, _) = rtnetlink::new_connection()
245 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
246 tokio::spawn(connection);
247
248 handle
249 .link()
250 .add()
251 .veth(host_name.to_string(), peer_name.to_string())
252 .execute()
253 .await
254 .map_err(|e| {
255 let msg = e.to_string();
256 if msg.contains("File exists") || msg.contains("EEXIST") {
257 NetlinkError::Netlink(format!(
258 "veth pair already exists: host={host_name} peer={peer_name}: {msg}"
259 ))
260 } else {
261 NetlinkError::Netlink(format!(
262 "veth create failed (host={host_name}, peer={peer_name}): {msg}"
263 ))
264 }
265 })
266}
267
/// Non-Linux placeholder for `create_veth_pair`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn create_veth_pair(_host_name: &str, _peer_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("create_veth_pair is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
275
276/// Delete the link by name. Idempotent: returns `Ok(())` if the link
277/// does not exist. Any other error surfaces as
278/// [`NetlinkError::Netlink`].
279///
280/// Replaces the shell-out:
281/// ip link delete `<name>`
282///
283/// Used in `overlay_manager::attach_to_interface` pre-cleanup,
284/// cleanup-on-error, and the orphan-veth sweeper.
285///
286/// # Errors
287///
288/// Returns [`NetlinkError::Netlink`] if RTNETLINK reports a failure
289/// other than `ENODEV` / "No such device" (which are treated as
290/// success so this is safe to call unconditionally).
291#[cfg(target_os = "linux")]
292pub async fn delete_link_by_name(name: &str) -> Result<(), NetlinkError> {
293 use futures_util::stream::TryStreamExt;
294
295 let (connection, handle, _) = rtnetlink::new_connection()
296 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
297 tokio::spawn(connection);
298
299 // Look up the link by name. Treat "not found" as success so the
300 // helper is safe to call unconditionally in cleanup paths.
301 let lookup = handle
302 .link()
303 .get()
304 .match_name(name.to_string())
305 .execute()
306 .try_next()
307 .await;
308
309 let link = match lookup {
310 Ok(Some(link)) => link,
311 Ok(None) => return Ok(()),
312 Err(rtnetlink::Error::NetlinkError(err)) => {
313 // libc::ENODEV == 19. netlink-packet-core reports the raw
314 // errno as a negative i32 in `code`, but the exact type has
315 // moved between versions, so match by both numeric code and
316 // the human-readable message for belt-and-suspenders safety.
317 let msg = err.to_string();
318 let is_enodev = err
319 .code
320 .is_some_and(|c| c.get().unsigned_abs() == libc::ENODEV as u32);
321 if is_enodev || msg.contains("No such device") {
322 return Ok(());
323 }
324 return Err(NetlinkError::Netlink(format!(
325 "link lookup failed for {name}: {msg}"
326 )));
327 }
328 Err(e) => {
329 let msg = e.to_string();
330 if msg.contains("No such device") {
331 return Ok(());
332 }
333 return Err(NetlinkError::Netlink(format!(
334 "link lookup failed for {name}: {msg}"
335 )));
336 }
337 };
338
339 let index = link.header.index;
340
341 handle
342 .link()
343 .del(index)
344 .execute()
345 .await
346 .map_err(|e| NetlinkError::Netlink(format!("link delete failed for {name}: {e}")))
347}
348
/// Non-Linux placeholder for `delete_link_by_name`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn delete_link_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("delete_link_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
356
357/// List all network interfaces in the current netns.
358///
359/// Returns a `Vec` of `(index, name)` tuples for every link the kernel
360/// reports. Used by the orphan veth sweeper to find `veth-<pid>` and
361/// `vc-<pid>` links whose owning PID is dead, so it can clean them up
362/// via [`delete_link_by_name`].
363///
364/// Replaces the shell-out:
365/// ip -br link
366///
367/// Issues a single RTNETLINK `RTM_GETLINK` dump request and iterates
368/// the resulting stream of `LinkMessage`s. Each message contributes
369/// one `(index, name)` tuple; messages without an `IFLA_IFNAME`
370/// attribute (extremely rare in practice — the kernel always emits
371/// one for configured devices) are silently skipped.
372///
373/// # Errors
374///
375/// Returns [`NetlinkError::Netlink`] if the rtnetlink socket cannot
376/// be created or if the dump stream itself reports a failure.
377#[cfg(target_os = "linux")]
378pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
379 use futures_util::stream::TryStreamExt;
380 use netlink_packet_route::link::LinkAttribute;
381
382 let (connection, handle, _) = rtnetlink::new_connection()
383 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
384 tokio::spawn(connection);
385
386 let mut stream = handle.link().get().execute();
387 let mut links = Vec::new();
388
389 while let Some(msg) = stream
390 .try_next()
391 .await
392 .map_err(|e| NetlinkError::Netlink(format!("link dump failed: {e}")))?
393 {
394 // LinkHeader.index is already u32 in netlink-packet-route
395 // 0.19 — no cast needed.
396 let index = msg.header.index;
397 let Some(name) = msg.attributes.iter().find_map(|a| match a {
398 LinkAttribute::IfName(n) => Some(n.clone()),
399 _ => None,
400 }) else {
401 continue;
402 };
403 links.push((index, name));
404 }
405
406 Ok(links)
407}
408
/// Non-Linux placeholder for `list_all_links`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
    let reason = String::from("list_all_links is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
416
417/// Set the link identified by `name` to the "up" administrative state.
418///
419/// Replaces the shell-out:
420/// ip link set `<name>` up
421///
422/// Unlike [`delete_link_by_name`] this is *not* idempotent for missing
423/// links: if the link does not exist the caller almost certainly has a
424/// bug upstream (we only call this on a veth end we just created), so
425/// we return [`NetlinkError::NotFound`] rather than silently succeeding.
426///
427/// # Errors
428///
429/// Returns [`NetlinkError::NotFound`] if no link with the given name
430/// exists in the current netns. Returns [`NetlinkError::Netlink`] for
431/// any other RTNETLINK failure (permission denied, etc.).
432#[cfg(target_os = "linux")]
433pub async fn set_link_up_by_name(name: &str) -> Result<(), NetlinkError> {
434 use futures_util::stream::TryStreamExt;
435
436 let (connection, handle, _) = rtnetlink::new_connection()
437 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
438 tokio::spawn(connection);
439
440 let link = handle
441 .link()
442 .get()
443 .match_name(name.to_string())
444 .execute()
445 .try_next()
446 .await
447 .map_err(|e| {
448 let msg = e.to_string();
449 if msg.contains("No such device") {
450 NetlinkError::NotFound(name.to_string())
451 } else {
452 NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
453 }
454 })?
455 .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
456
457 let index = link.header.index;
458
459 handle
460 .link()
461 .set(index)
462 .up()
463 .execute()
464 .await
465 .map_err(|e| NetlinkError::Netlink(format!("link set up failed for {name}: {e}")))
466}
467
/// Non-Linux placeholder for `set_link_up_by_name`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn set_link_up_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("set_link_up_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
475
476/// Add an IP address to the link identified by `name` in the current
477/// network namespace.
478///
479/// Replaces (in combination with [`with_netns`]):
480/// nsenter -t `<pid>` -n ip \[-6\] addr add `<addr>/<prefix_len>` dev `<name>`
481///
482/// `addr` may be v4 or v6. `prefix_len` is the CIDR prefix length
483/// (24 for a `/24`, 64 for a `/64`, etc.).
484///
485/// This helper operates on the CURRENT network namespace — it looks
486/// up the interface index via a local rtnetlink socket. To target a
487/// container's netns, wrap the call inside [`with_netns`].
488///
489/// # Errors
490///
491/// Returns [`NetlinkError::NotFound`] if the link is missing. Returns
492/// [`NetlinkError::Netlink`] for any other rtnetlink failure
493/// (permission denied, EEXIST on a duplicate address, etc.).
494#[cfg(target_os = "linux")]
495pub async fn add_address_to_link_by_name(
496 name: &str,
497 addr: std::net::IpAddr,
498 prefix_len: u8,
499) -> Result<(), NetlinkError> {
500 use futures_util::stream::TryStreamExt;
501
502 let (connection, handle, _) = rtnetlink::new_connection()
503 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
504 tokio::spawn(connection);
505
506 let link = handle
507 .link()
508 .get()
509 .match_name(name.to_string())
510 .execute()
511 .try_next()
512 .await
513 .map_err(|e| {
514 let msg = e.to_string();
515 if msg.contains("No such device") {
516 NetlinkError::NotFound(name.to_string())
517 } else {
518 NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
519 }
520 })?
521 .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
522
523 let index = link.header.index;
524
525 handle
526 .address()
527 .add(index, addr, prefix_len)
528 .execute()
529 .await
530 .map_err(|e| {
531 NetlinkError::Netlink(format!(
532 "address add failed for {name} ({addr}/{prefix_len}): {e}"
533 ))
534 })
535}
536
/// Non-Linux placeholder for `add_address_to_link_by_name`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_address_to_link_by_name(
    _name: &str,
    _addr: std::net::IpAddr,
    _prefix_len: u8,
) -> Result<(), NetlinkError> {
    let reason = String::from("add_address_to_link_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
548
549/// Remove ALL IPv4/IPv6 addresses currently assigned to the link named `name`.
550///
551/// Used to make service-bridge gateway assignment idempotent: a bridge that
552/// survives an overlayd/daemon restart is re-created (create is idempotent on
553/// EEXIST), and without flushing first, `add_address_to_link_by_name` would
554/// STACK the new gateway on top of the old one — exactly the observed bug where
555/// a bridge carried both a stale `/28` and the re-allocated `/26`. Flushing
556/// before re-adding guarantees a single address and self-heals such bridges.
557///
558/// No-ops (returns `Ok`) when the link is absent (ENODEV / "No such device").
559///
560/// # Errors
561/// Returns [`NetlinkError::Netlink`] on any rtnetlink failure other than a
562/// missing link (e.g. permission denied).
563#[cfg(target_os = "linux")]
564pub async fn flush_addresses_on_link_by_name(name: &str) -> Result<(), NetlinkError> {
565 use futures_util::stream::TryStreamExt;
566
567 let (connection, handle, _) = rtnetlink::new_connection()
568 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
569 tokio::spawn(connection);
570
571 let link = handle
572 .link()
573 .get()
574 .match_name(name.to_string())
575 .execute()
576 .try_next()
577 .await;
578
579 let link = match link {
580 Ok(Some(l)) => l,
581 // Absent link → nothing to flush.
582 Ok(None) => return Ok(()),
583 Err(e) => {
584 let msg = e.to_string();
585 if msg.contains("No such device") {
586 return Ok(());
587 }
588 return Err(NetlinkError::Netlink(format!(
589 "link lookup failed for {name}: {msg}"
590 )));
591 }
592 };
593
594 let index = link.header.index;
595
596 let addrs: Vec<_> = handle
597 .address()
598 .get()
599 .set_link_index_filter(index)
600 .execute()
601 .try_collect()
602 .await
603 .map_err(|e| NetlinkError::Netlink(format!("address list failed for {name}: {e}")))?;
604
605 for addr in addrs {
606 handle
607 .address()
608 .del(addr)
609 .execute()
610 .await
611 .map_err(|e| NetlinkError::Netlink(format!("address del failed for {name}: {e}")))?;
612 }
613
614 Ok(())
615}
616
/// Non-Linux placeholder for `flush_addresses_on_link_by_name`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn flush_addresses_on_link_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("flush_addresses_on_link_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
624
625/// Add a default route via the given device name in the current
626/// network namespace.
627///
628/// Replaces (in combination with [`with_netns`]):
629/// nsenter -t `<pid>` -n ip \[-6\] route add default dev `<dev_name>`
630///
631/// The route is a direct, link-scope route: no gateway, the kernel
632/// ARPs / uses NDISC on the device for destination resolution. This
633/// is the correct form for a point-to-point veth link where the peer
634/// is reachable directly.
635///
636/// For IPv4 the destination prefix is `0.0.0.0/0`. For IPv6 it is
637/// `::/0`. Controlled by `is_v6`.
638///
639/// # Errors
640///
641/// Returns [`NetlinkError::NotFound`] if the device is missing.
642/// Returns [`NetlinkError::Netlink`] for any other rtnetlink failure.
643#[cfg(target_os = "linux")]
644pub async fn add_default_route_via_dev(dev_name: &str, is_v6: bool) -> Result<(), NetlinkError> {
645 use futures_util::stream::TryStreamExt;
646 use netlink_packet_route::route::RouteScope;
647
648 let (connection, handle, _) = rtnetlink::new_connection()
649 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
650 tokio::spawn(connection);
651
652 let link = handle
653 .link()
654 .get()
655 .match_name(dev_name.to_string())
656 .execute()
657 .try_next()
658 .await
659 .map_err(|e| {
660 let msg = e.to_string();
661 if msg.contains("No such device") {
662 NetlinkError::NotFound(dev_name.to_string())
663 } else {
664 NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
665 }
666 })?
667 .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
668
669 let oif_idx = link.header.index;
670
671 if is_v6 {
672 handle
673 .route()
674 .add()
675 .v6()
676 .destination_prefix(std::net::Ipv6Addr::UNSPECIFIED, 0)
677 .output_interface(oif_idx)
678 .scope(RouteScope::Link)
679 .execute()
680 .await
681 .map_err(|e| {
682 NetlinkError::Netlink(format!("default route add v6 via {dev_name} failed: {e}"))
683 })
684 } else {
685 handle
686 .route()
687 .add()
688 .v4()
689 .destination_prefix(std::net::Ipv4Addr::UNSPECIFIED, 0)
690 .output_interface(oif_idx)
691 .scope(RouteScope::Link)
692 .execute()
693 .await
694 .map_err(|e| {
695 NetlinkError::Netlink(format!("default route add v4 via {dev_name} failed: {e}"))
696 })
697 }
698}
699
/// Non-Linux placeholder for `add_default_route_via_dev`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_default_route_via_dev(_dev_name: &str, _is_v6: bool) -> Result<(), NetlinkError> {
    let reason = String::from("add_default_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
707
708/// Add a default route pointing at the given gateway IP in the current
709/// network namespace.
710///
711/// Replaces (in combination with [`with_netns`]):
712/// nsenter -t `<pid>` -n ip \[-6\] route add default via `<gateway>`
713///
714/// Used by the per-service bridge attach path: containers join the
715/// service bridge via a veth pair and reach the rest of the overlay
716/// through the bridge's L3 gateway IP. The address family of the route
717/// is inferred from `gateway`.
718///
719/// # Errors
720///
721/// Returns [`NetlinkError::Netlink`] for any rtnetlink failure.
722#[cfg(target_os = "linux")]
723pub async fn add_default_route_via_gateway(gateway: std::net::IpAddr) -> Result<(), NetlinkError> {
724 let (connection, handle, _) = rtnetlink::new_connection()
725 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
726 tokio::spawn(connection);
727
728 match gateway {
729 std::net::IpAddr::V4(gw) => handle
730 .route()
731 .add()
732 .v4()
733 .destination_prefix(std::net::Ipv4Addr::UNSPECIFIED, 0)
734 .gateway(gw)
735 .execute()
736 .await
737 .map_err(|e| {
738 NetlinkError::Netlink(format!("default route add v4 via gateway {gw} failed: {e}"))
739 }),
740 std::net::IpAddr::V6(gw) => handle
741 .route()
742 .add()
743 .v6()
744 .destination_prefix(std::net::Ipv6Addr::UNSPECIFIED, 0)
745 .gateway(gw)
746 .execute()
747 .await
748 .map_err(|e| {
749 NetlinkError::Netlink(format!("default route add v6 via gateway {gw} failed: {e}"))
750 }),
751 }
752}
753
/// Non-Linux placeholder for `add_default_route_via_gateway`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_default_route_via_gateway(_gateway: std::net::IpAddr) -> Result<(), NetlinkError> {
    let reason = String::from("add_default_route_via_gateway is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
761
762/// Add or replace a route to `dest/prefix_len` that forwards via the
763/// interface named `dev_name`. Optional `src` sets the preferred source
764/// address.
765///
766/// Replaces the shell-outs:
767/// ip route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
768/// ip -6 route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
769///
770/// Uses `NLM_F_REPLACE | NLM_F_CREATE` semantics (via rtnetlink's
771/// `.replace()` on the route add builder) so stale routes left behind
772/// by a previous daemon run don't cause `EEXIST`.
773///
774/// The route is installed with link scope (direct-via-dev, no
775/// gateway) which is the correct form for a per-container `/32` or
776/// `/128` pointing at a host-side veth endpoint.
777///
778/// `dest` and `src` (if provided) must have matching address families
779/// — passing a v4 `dest` with a v6 `src` returns
780/// [`NetlinkError::Netlink`] without touching the kernel.
781///
782/// # Errors
783///
784/// Returns [`NetlinkError::NotFound`] if `dev_name` does not exist in
785/// the current netns. Returns [`NetlinkError::Netlink`] on address
786/// family mismatch or any RTNETLINK failure.
787#[cfg(target_os = "linux")]
788pub async fn replace_route_via_dev(
789 dest: std::net::IpAddr,
790 prefix_len: u8,
791 dev_name: &str,
792 src: Option<std::net::IpAddr>,
793) -> Result<(), NetlinkError> {
794 use std::net::IpAddr;
795
796 use futures_util::stream::TryStreamExt;
797 use netlink_packet_route::route::RouteScope;
798
799 let (connection, handle, _) = rtnetlink::new_connection()
800 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
801 tokio::spawn(connection);
802
803 let link = handle
804 .link()
805 .get()
806 .match_name(dev_name.to_string())
807 .execute()
808 .try_next()
809 .await
810 .map_err(|e| {
811 let msg = e.to_string();
812 if msg.contains("No such device") {
813 NetlinkError::NotFound(dev_name.to_string())
814 } else {
815 NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
816 }
817 })?
818 .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
819
820 let oif_idx = link.header.index;
821
822 match (dest, src) {
823 (IpAddr::V4(d), Some(IpAddr::V4(s))) => handle
824 .route()
825 .add()
826 .v4()
827 .destination_prefix(d, prefix_len)
828 .output_interface(oif_idx)
829 .scope(RouteScope::Link)
830 .pref_source(s)
831 .replace()
832 .execute()
833 .await
834 .map_err(|e| {
835 NetlinkError::Netlink(format!(
836 "route replace v4 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
837 ))
838 }),
839 (IpAddr::V4(d), None) => handle
840 .route()
841 .add()
842 .v4()
843 .destination_prefix(d, prefix_len)
844 .output_interface(oif_idx)
845 .scope(RouteScope::Link)
846 .replace()
847 .execute()
848 .await
849 .map_err(|e| {
850 NetlinkError::Netlink(format!(
851 "route replace v4 {d}/{prefix_len} dev {dev_name} failed: {e}"
852 ))
853 }),
854 (IpAddr::V6(d), Some(IpAddr::V6(s))) => handle
855 .route()
856 .add()
857 .v6()
858 .destination_prefix(d, prefix_len)
859 .output_interface(oif_idx)
860 .scope(RouteScope::Link)
861 .pref_source(s)
862 .replace()
863 .execute()
864 .await
865 .map_err(|e| {
866 NetlinkError::Netlink(format!(
867 "route replace v6 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
868 ))
869 }),
870 (IpAddr::V6(d), None) => handle
871 .route()
872 .add()
873 .v6()
874 .destination_prefix(d, prefix_len)
875 .output_interface(oif_idx)
876 .scope(RouteScope::Link)
877 .replace()
878 .execute()
879 .await
880 .map_err(|e| {
881 NetlinkError::Netlink(format!(
882 "route replace v6 {d}/{prefix_len} dev {dev_name} failed: {e}"
883 ))
884 }),
885 (IpAddr::V4(_), Some(IpAddr::V6(_))) | (IpAddr::V6(_), Some(IpAddr::V4(_))) => Err(
886 NetlinkError::Netlink(format!("address family mismatch: dest={dest} src={src:?}")),
887 ),
888 }
889}
890
/// Non-Linux placeholder for `replace_route_via_dev`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn replace_route_via_dev(
    _dest: std::net::IpAddr,
    _prefix_len: u8,
    _dev_name: &str,
    _src: Option<std::net::IpAddr>,
) -> Result<(), NetlinkError> {
    let reason = String::from("replace_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
903
904/// Delete the link-scope route to `dest/prefix_len` that forwards via the
905/// interface named `dev_name` — the symmetric counterpart of
906/// [`replace_route_via_dev`] used by overlay teardown to revert the per-container
907/// host route.
908///
909/// Idempotent and best-effort: a missing device (`NotFound`) or a missing route
910/// (`ESRCH` / "No such process", which is how the kernel reports
911/// `RTM_DELROUTE` for a route that is already gone) is treated as success so
912/// teardown can call this unconditionally without aborting on a route a prior
913/// per-container detach already removed.
914///
915/// The route message is built with the *same* fields as the install path
916/// (link scope, destination prefix, output interface) so the kernel matches and
917/// removes exactly the route `replace_route_via_dev` installed. The `src`
918/// preferred-source is intentionally omitted from the match: the kernel keys the
919/// delete on dest + oif + scope, and including a stale src risks a false miss.
920///
921/// # Errors
922///
923/// Returns [`NetlinkError::Netlink`] only on a genuine RTNETLINK failure that is
924/// neither "device not found" nor "route not found".
925#[cfg(target_os = "linux")]
926pub async fn delete_route_via_dev(
927 dest: std::net::IpAddr,
928 prefix_len: u8,
929 dev_name: &str,
930) -> Result<(), NetlinkError> {
931 use std::net::IpAddr;
932
933 use futures_util::stream::TryStreamExt;
934 use netlink_packet_route::route::RouteScope;
935
936 let (connection, handle, _) = rtnetlink::new_connection()
937 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
938 tokio::spawn(connection);
939
940 // Resolve the output interface. A vanished device means the route went with
941 // it (deleting a link drops its routes), so treat NotFound as success.
942 let lookup = handle
943 .link()
944 .get()
945 .match_name(dev_name.to_string())
946 .execute()
947 .try_next()
948 .await;
949 let link = match lookup {
950 Ok(Some(link)) => link,
951 Ok(None) => return Ok(()),
952 Err(e) => {
953 let msg = e.to_string();
954 if msg.contains("No such device") {
955 return Ok(());
956 }
957 return Err(NetlinkError::Netlink(format!(
958 "link lookup failed for {dev_name}: {msg}"
959 )));
960 }
961 };
962 let oif_idx = link.header.index;
963
964 // Build the route message identically to the install path, then hand it to
965 // `del`. `message_mut().clone()` extracts the fully-formed RouteMessage from
966 // the add builder so the delete matches the exact route we installed.
967 let message = match dest {
968 IpAddr::V4(d) => {
969 let mut req = handle
970 .route()
971 .add()
972 .v4()
973 .destination_prefix(d, prefix_len)
974 .output_interface(oif_idx)
975 .scope(RouteScope::Link);
976 req.message_mut().clone()
977 }
978 IpAddr::V6(d) => {
979 let mut req = handle
980 .route()
981 .add()
982 .v6()
983 .destination_prefix(d, prefix_len)
984 .output_interface(oif_idx)
985 .scope(RouteScope::Link);
986 req.message_mut().clone()
987 }
988 };
989
990 match handle.route().del(message).execute().await {
991 Ok(()) => Ok(()),
992 Err(e) => {
993 let msg = e.to_string();
994 // ESRCH (3) "No such process" is the kernel's RTM_DELROUTE answer
995 // for an already-absent route; ENOENT likewise. Both mean "already
996 // gone" — success for an idempotent teardown.
997 if msg.contains("No such process")
998 || msg.contains("No such file")
999 || msg.contains("ESRCH")
1000 || msg.contains("ENOENT")
1001 {
1002 return Ok(());
1003 }
1004 Err(NetlinkError::Netlink(format!(
1005 "route delete {dest}/{prefix_len} dev {dev_name} failed: {msg}"
1006 )))
1007 }
1008 }
1009}
1010
/// Placeholder for targets other than Linux, where route deletion is not
/// implemented.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; all arguments are ignored.
#[cfg(not(target_os = "linux"))]
pub async fn delete_route_via_dev(
    _dest: std::net::IpAddr,
    _prefix_len: u8,
    _dev_name: &str,
) -> Result<(), NetlinkError> {
    let reason = String::from("delete_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1022
1023/// Set a sysctl via the `/proc/sys/...` filesystem.
1024///
1025/// `key` uses dotted form like `net.ipv4.ip_forward`; dots are
1026/// translated to path separators so the effective path is
1027/// `/proc/sys/net/ipv4/ip_forward`. Writes the string form of
1028/// `value` to the file.
1029///
1030/// Replaces the shell-outs:
1031/// sysctl -w `<key>`=`<value>`
1032///
1033/// Writing to `/proc/sys/...` is the kernel-standard way of setting
1034/// sysctls and works under any confinement that still allows write
1035/// access to `/proc/sys` (which the overlay manager needs anyway for
1036/// its other operations).
1037///
1038/// # Errors
1039///
1040/// Returns [`NetlinkError::Io`] if the write fails (e.g. permission
1041/// denied, file missing because the sysctl doesn't exist on this
1042/// kernel, etc.).
1043pub fn set_sysctl(key: &str, value: &str) -> Result<(), NetlinkError> {
1044 let path = format!("/proc/sys/{}", key.replace('.', "/"));
1045 std::fs::write(&path, value)?;
1046 Ok(())
1047}
1048
1049/// Read the current value of a sysctl via the `/proc/sys/...` filesystem.
1050///
1051/// `key` uses dotted form like `net.ipv4.ip_forward`; dots are translated
1052/// to path separators (`/proc/sys/net/ipv4/ip_forward`). The trailing
1053/// newline the kernel emits is trimmed.
1054///
1055/// Used by the overlay teardown path to learn whether a forwarding sysctl
1056/// was already `1` before the daemon touched it, so a clean shutdown only
1057/// reverts the bits the daemon itself turned on (never clobbering an
1058/// operator who deliberately enabled routing on the host).
1059///
1060/// # Errors
1061///
1062/// Returns [`NetlinkError::Io`] if the read fails (e.g. the sysctl does
1063/// not exist on this kernel, or `/proc/sys` is not readable under the
1064/// current confinement).
1065pub fn read_sysctl(key: &str) -> Result<String, NetlinkError> {
1066 let path = format!("/proc/sys/{}", key.replace('.', "/"));
1067 let raw = std::fs::read_to_string(&path)?;
1068 Ok(raw.trim().to_string())
1069}
1070
1071/// Run a synchronous closure inside the network namespace referenced
1072/// by the given `OwnedFd`.
1073///
1074/// This is the fd-based variant of [`with_netns`]. Callers that have
1075/// already opened `/proc/<pid>/ns/net` (e.g. to pin the namespace
1076/// across multiple operations) should use this form to reuse the
1077/// same fd and avoid re-opening the procfs path — the reopen would
1078/// fail with `ENOENT` if the container init process has exited in
1079/// the meantime, even though the namespace itself is still alive
1080/// because our pinned fd holds a reference.
1081///
1082/// The `OwnedFd` is moved into the dedicated worker thread and
1083/// closed when the thread exits. Spawns a fresh OS thread (not a
1084/// tokio blocking worker) because `setns` affects the whole thread
1085/// and we don't want to contaminate a shared worker.
1086///
1087/// # Errors
1088///
1089/// Returns [`NetlinkError::Netlink`] if `setns` fails or the
1090/// dedicated thread panics. Any error returned by the closure itself
1091/// is propagated verbatim.
1092#[cfg(target_os = "linux")]
1093pub fn with_netns_fd<F, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
1094where
1095 F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
1096 T: Send + 'static,
1097{
1098 let join_handle = std::thread::spawn(move || -> Result<T, NetlinkError> {
1099 nix::sched::setns(&ns_fd, nix::sched::CloneFlags::CLONE_NEWNET)
1100 .map_err(|e| NetlinkError::Netlink(format!("setns(ns_fd) failed: {e}")))?;
1101 // Keep the fd alive for the duration of the closure even
1102 // though setns only needs it for the syscall itself. Dropping
1103 // it explicitly after the closure makes the lifetime obvious.
1104 let result = f();
1105 drop(ns_fd);
1106 result
1107 });
1108
1109 join_handle
1110 .join()
1111 .map_err(|_| NetlinkError::Netlink("with_netns_fd thread panicked".to_string()))?
1112}
1113
/// Placeholder compiled only on Unix targets other than Linux (the `cfg`
/// excludes Linux and requires `unix`). There is no Windows counterpart:
/// `OwnedFd` is Unix-only and the caller chain (`attach_to_interface` in
/// `overlay_manager.rs`) is gated to Linux.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the fd and closure are ignored.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd<F, T>(_ns_fd: std::os::fd::OwnedFd, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = String::from("with_netns_fd is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1127
1128/// Run a synchronous closure inside the network namespace of the
1129/// given PID.
1130///
1131/// Thin wrapper around [`with_netns_fd`] that opens
1132/// `/proc/<target_pid>/ns/net` then delegates. Kept for backward
1133/// compatibility and for callers that only need a single operation
1134/// on the target netns. Callers that need to pin the namespace
1135/// across multiple operations (and survive a racing exit of the
1136/// container init) should open the fd themselves and call
1137/// [`with_netns_fd`] directly.
1138///
1139/// Because `setns` is synchronous and `rtnetlink` is async, the
1140/// typical usage pattern inside the closure is to build a local
1141/// current-thread tokio runtime and `block_on` the netlink calls.
1142/// See [`with_netns_async`] for a convenience wrapper that does
1143/// exactly this.
1144///
1145/// # Errors
1146///
1147/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot
1148/// be opened. Returns [`NetlinkError::Netlink`] if `setns` fails or
1149/// the dedicated thread panics. Any error returned by the closure
1150/// itself is propagated verbatim.
1151#[cfg(target_os = "linux")]
1152pub fn with_netns<F, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
1153where
1154 F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
1155 T: Send + 'static,
1156{
1157 use std::os::fd::OwnedFd;
1158
1159 let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
1160 let ns_fd: OwnedFd = OwnedFd::from(ns_file);
1161 with_netns_fd(ns_fd, f)
1162}
1163
/// Placeholder for targets other than Linux, where joining a network
/// namespace is not implemented.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the pid and closure are ignored.
#[cfg(not(target_os = "linux"))]
pub fn with_netns<F, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = String::from("with_netns is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1175
1176/// Convenience wrapper around [`with_netns_fd`] that builds a local
1177/// current-thread tokio runtime inside the dedicated thread and
1178/// drives the provided async future to completion.
1179///
1180/// The future is produced by calling `f()` from inside the thread
1181/// that has already joined the target netns, so any rtnetlink
1182/// operations awaited inside the future will talk to the target
1183/// netns's kernel.
1184///
1185/// The local runtime is lightweight (single-thread, built per call)
1186/// and only drives a handful of netlink messages before being
1187/// dropped with the thread.
1188///
1189/// The `OwnedFd` is moved into the worker thread and closed when
1190/// the thread exits.
1191///
1192/// # Errors
1193///
1194/// Returns [`NetlinkError::Netlink`] per [`with_netns_fd`], plus
1195/// [`NetlinkError::Netlink`] if the local runtime fails to build.
1196/// Any error returned by the future is propagated verbatim.
1197#[cfg(target_os = "linux")]
1198pub fn with_netns_fd_async<F, Fut, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
1199where
1200 F: FnOnce() -> Fut + Send + 'static,
1201 Fut: std::future::Future<Output = Result<T, NetlinkError>>,
1202 T: Send + 'static,
1203{
1204 with_netns_fd(ns_fd, move || {
1205 let rt = tokio::runtime::Builder::new_current_thread()
1206 .enable_all()
1207 .build()
1208 .map_err(|e| NetlinkError::Netlink(format!("local runtime build failed: {e}")))?;
1209 rt.block_on(f())
1210 })
1211}
1212
/// Placeholder compiled only on Unix targets other than Linux (the `cfg`
/// excludes Linux and requires `unix`). There is no Windows counterpart:
/// `OwnedFd` is Unix-only and the caller chain (`attach_to_interface` in
/// `overlay_manager.rs`) is gated to Linux.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the fd and closure are ignored.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd_async<F, Fut, T>(
    _ns_fd: std::os::fd::OwnedFd,
    _f: F,
) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = String::from("with_netns_fd_async is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1230
1231/// Convenience wrapper around [`with_netns`] that builds a local
1232/// current-thread tokio runtime inside the dedicated thread and
1233/// drives the provided async future to completion.
1234///
1235/// Thin wrapper around [`with_netns_fd_async`] that opens
1236/// `/proc/<target_pid>/ns/net` then delegates.
1237///
1238/// # Errors
1239///
1240/// Returns [`NetlinkError::Io`] / [`NetlinkError::Netlink`] per
1241/// [`with_netns`], plus [`NetlinkError::Netlink`] if the local
1242/// runtime fails to build. Any error returned by the future is
1243/// propagated verbatim.
1244#[cfg(target_os = "linux")]
1245pub fn with_netns_async<F, Fut, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
1246where
1247 F: FnOnce() -> Fut + Send + 'static,
1248 Fut: std::future::Future<Output = Result<T, NetlinkError>>,
1249 T: Send + 'static,
1250{
1251 use std::os::fd::OwnedFd;
1252
1253 let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
1254 let ns_fd: OwnedFd = OwnedFd::from(ns_file);
1255 with_netns_fd_async(ns_fd, f)
1256}
1257
/// Placeholder for targets other than Linux, where joining a network
/// namespace is not implemented.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the pid and closure are ignored.
#[cfg(not(target_os = "linux"))]
pub fn with_netns_async<F, Fut, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = String::from("with_netns_async is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1270
1271/// Create a Linux bridge interface with the given name.
1272///
1273/// Replaces the shell-out:
1274/// ip link add name `<name>` type bridge
1275///
1276/// Idempotent: if a link with that name already exists this returns
1277/// `Ok(())`. This matches how the overlay manager's per-service bridge
1278/// creation path needs to behave — multiple containers landing on the
1279/// same service-on-node bridge must all see "bridge ready" after a
1280/// successful call without racing against existence checks.
1281///
1282/// The bridge is created in the current network namespace. Callers
1283/// that need a different netns should wrap with [`with_netns_async`].
1284/// The bridge is created in the administratively-down state — call
1285/// [`set_link_up_by_name`] separately once any other attributes
1286/// ([`set_bridge_stp`] etc.) have been applied.
1287///
1288/// # Errors
1289///
1290/// Returns [`NetlinkError::Netlink`] for any RTNETLINK failure other
1291/// than `EEXIST` (which is treated as success).
1292#[cfg(target_os = "linux")]
1293pub async fn create_bridge(name: &str) -> Result<(), NetlinkError> {
1294 let (connection, handle, _) = rtnetlink::new_connection()
1295 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
1296 tokio::spawn(connection);
1297
1298 match handle.link().add().bridge(name.to_string()).execute().await {
1299 Ok(()) => Ok(()),
1300 Err(rtnetlink::Error::NetlinkError(err)) => {
1301 // EEXIST means a link with this name already exists. We
1302 // intentionally do NOT verify that the existing link is
1303 // actually a bridge — callers using stable per-service
1304 // names own that invariant, and re-checking here would
1305 // require another rtnetlink round-trip on the hot path.
1306 let is_eexist = err
1307 .code
1308 .is_some_and(|c| c.get().unsigned_abs() == libc::EEXIST as u32);
1309 let msg = err.to_string();
1310 if is_eexist || msg.contains("File exists") {
1311 Ok(())
1312 } else {
1313 Err(NetlinkError::Netlink(format!(
1314 "bridge create failed for {name}: {msg}"
1315 )))
1316 }
1317 }
1318 Err(e) => {
1319 let msg = e.to_string();
1320 if msg.contains("File exists") {
1321 Ok(())
1322 } else {
1323 Err(NetlinkError::Netlink(format!(
1324 "bridge create failed for {name}: {msg}"
1325 )))
1326 }
1327 }
1328 }
1329}
1330
/// Placeholder for targets other than Linux, where bridges cannot be created
/// by this module.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the name is ignored.
#[cfg(not(target_os = "linux"))]
pub async fn create_bridge(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("create_bridge is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1338
1339/// Delete the bridge interface with the given name.
1340///
1341/// Replaces the shell-out:
1342/// ip link delete `<name>` type bridge
1343///
1344/// Idempotent: returns `Ok(())` if the bridge does not exist.
1345/// Delegates to [`delete_link_by_name`] — from RTNETLINK's perspective
1346/// deleting a bridge is the same `RTM_DELLINK` as deleting any other
1347/// link, and `delete_link_by_name` already has the ENODEV-as-success
1348/// handling we want.
1349///
1350/// # Errors
1351///
1352/// Returns [`NetlinkError::Netlink`] for any RTNETLINK failure other
1353/// than `ENODEV` (which is treated as success).
1354#[cfg(target_os = "linux")]
1355pub async fn delete_bridge(name: &str) -> Result<(), NetlinkError> {
1356 delete_link_by_name(name).await
1357}
1358
/// Placeholder for targets other than Linux, where bridges cannot be deleted
/// by this module.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the name is ignored.
#[cfg(not(target_os = "linux"))]
pub async fn delete_bridge(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("delete_bridge is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1366
1367/// Count the member links of a bridge by reading the kernel's canonical
1368/// bridge-port directory `/sys/class/net/<name>/brif/`. Returns 0 when the
1369/// directory is absent or unreadable — correct for a `-d` `WireGuard` device
1370/// (not a bridge, no `brif`) and for an empty bridge. Used by the orphan
1371/// prune's zero-member guard to reclaim only IDLE service bridges.
1372#[cfg(target_os = "linux")]
1373pub async fn bridge_member_count(name: &str) -> usize {
1374 let path = format!("/sys/class/net/{name}/brif");
1375 let Ok(mut entries) = tokio::fs::read_dir(&path).await else {
1376 return 0;
1377 };
1378 let mut count = 0usize;
1379 while let Ok(Some(_entry)) = entries.next_entry().await {
1380 count += 1;
1381 }
1382 count
1383}
1384
1385/// Non-Linux: per-service bridges are a Linux-only mechanic, so there are no
1386/// `brif` members to count.
1387#[cfg(not(target_os = "linux"))]
1388#[allow(clippy::unused_async)]
1389pub async fn bridge_member_count(_name: &str) -> usize {
1390 0
1391}
1392
1393/// Attach `link` to `bridge` by setting the link's `IFLA_MASTER` to
1394/// the bridge's ifindex.
1395///
1396/// Replaces the shell-out:
1397/// ip link set `<link>` master `<bridge>`
1398///
1399/// Both interfaces must already exist in the current network
1400/// namespace. This is what the overlay manager will call to splice a
1401/// container's host-side veth end into the per-service bridge instead
1402/// of /32-routing it directly.
1403///
1404/// # Errors
1405///
1406/// Returns [`NetlinkError::NotFound`] if either `link` or `bridge`
1407/// does not exist in the current netns. Returns
1408/// [`NetlinkError::Netlink`] for any other RTNETLINK failure.
1409#[cfg(target_os = "linux")]
1410pub async fn add_link_to_bridge(link: &str, bridge: &str) -> Result<(), NetlinkError> {
1411 use futures_util::stream::TryStreamExt;
1412
1413 let (connection, handle, _) = rtnetlink::new_connection()
1414 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
1415 tokio::spawn(connection);
1416
1417 let bridge_link = handle
1418 .link()
1419 .get()
1420 .match_name(bridge.to_string())
1421 .execute()
1422 .try_next()
1423 .await
1424 .map_err(|e| {
1425 let msg = e.to_string();
1426 if msg.contains("No such device") {
1427 NetlinkError::NotFound(bridge.to_string())
1428 } else {
1429 NetlinkError::Netlink(format!("link lookup failed for {bridge}: {msg}"))
1430 }
1431 })?
1432 .ok_or_else(|| NetlinkError::NotFound(bridge.to_string()))?;
1433 let bridge_idx = bridge_link.header.index;
1434
1435 let member_link = handle
1436 .link()
1437 .get()
1438 .match_name(link.to_string())
1439 .execute()
1440 .try_next()
1441 .await
1442 .map_err(|e| {
1443 let msg = e.to_string();
1444 if msg.contains("No such device") {
1445 NetlinkError::NotFound(link.to_string())
1446 } else {
1447 NetlinkError::Netlink(format!("link lookup failed for {link}: {msg}"))
1448 }
1449 })?
1450 .ok_or_else(|| NetlinkError::NotFound(link.to_string()))?;
1451 let member_idx = member_link.header.index;
1452
1453 handle
1454 .link()
1455 .set(member_idx)
1456 .controller(bridge_idx)
1457 .execute()
1458 .await
1459 .map_err(|e| {
1460 NetlinkError::Netlink(format!(
1461 "set master failed: link={link} bridge={bridge}: {e}"
1462 ))
1463 })
1464}
1465
/// Placeholder for targets other than Linux, where bridge membership cannot
/// be changed by this module.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; both names are ignored.
#[cfg(not(target_os = "linux"))]
pub async fn add_link_to_bridge(_link: &str, _bridge: &str) -> Result<(), NetlinkError> {
    let reason = String::from("add_link_to_bridge is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1473
1474/// Enable or disable Spanning Tree Protocol (STP) on the named bridge.
1475///
1476/// STP is disabled by default on bridges created via [`create_bridge`]
1477/// (the kernel default for a freshly-created bridge is STP off), and
1478/// for `ZLayer`'s per-service bridges we want to keep it off: each
1479/// bridge is single-host, has no possibility of a loop, and STP's
1480/// initial 30s forwarding-delay would stall container traffic on
1481/// attach.
1482///
1483/// rtnetlink 0.14 does not expose a typed builder for `IFLA_BR_STP_STATE`
1484/// (it lives inside the nested `IFLA_LINKINFO` -> `IFLA_INFO_DATA` ->
1485/// `IFLA_BR_STP_STATE` attribute and the crate's bridge builder only
1486/// covers it at create-time, not as a post-create modification). The
1487/// portable kernel-supported alternative is the sysfs knob at
1488/// `/sys/class/net/<name>/bridge/stp_state`, which is what
1489/// `brctl stp <name> on|off` writes under the hood. We use the sysfs
1490/// path so the helper works on every kernel that has bridge support
1491/// without depending on an rtnetlink API surface that may move
1492/// between crate versions.
1493///
1494/// # Errors
1495///
1496/// Returns [`NetlinkError::NotFound`] if the bridge does not exist (no
1497/// `/sys/class/net/<name>/bridge` directory). Returns
1498/// [`NetlinkError::Io`] for any other write failure (permission
1499/// denied, the link exists but is not a bridge, etc.).
1500#[cfg(target_os = "linux")]
1501pub fn set_bridge_stp(name: &str, stp_on: bool) -> Result<(), NetlinkError> {
1502 let bridge_dir = format!("/sys/class/net/{name}/bridge");
1503 if !std::path::Path::new(&bridge_dir).exists() {
1504 return Err(NetlinkError::NotFound(name.to_string()));
1505 }
1506 let path = format!("{bridge_dir}/stp_state");
1507 let value = if stp_on { "1" } else { "0" };
1508 std::fs::write(&path, value)?;
1509 Ok(())
1510}
1511
/// Placeholder for targets other than Linux, where bridge STP cannot be
/// configured by this module.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; both arguments are ignored.
#[cfg(not(target_os = "linux"))]
pub fn set_bridge_stp(_name: &str, _stp_on: bool) -> Result<(), NetlinkError> {
    let reason = String::from("set_bridge_stp is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1519
1520#[cfg(test)]
1521mod tests {
1522 // The helpers and tests in this module are Linux-only (they require
1523 // netlink + CAP_NET_ADMIN). Keep imports/fixtures gated so the lib
1524 // tests still compile on Windows/macOS cross-checks.
1525 #[cfg(target_os = "linux")]
1526 use super::*;
1527
1528 /// Generate a short random-ish suffix for test interface names so
1529 /// parallel `cargo test` invocations don't collide. Bounded to 6
1530 /// chars so the full name (`zlb-` prefix + suffix) stays under the
1531 /// 15-char `IFNAMSIZ` limit.
1532 #[cfg(target_os = "linux")]
1533 fn rand_suffix() -> String {
1534 use std::time::{SystemTime, UNIX_EPOCH};
1535 const CHARS: &[u8] = b"0123456789abcdefghijklmnopqrstuvwxyz";
1536 let nanos = SystemTime::now()
1537 .duration_since(UNIX_EPOCH)
1538 .map_or(0, |d| d.subsec_nanos());
1539 // base36-ish, 6 chars
1540 let mut n = u64::from(nanos);
1541 let mut out = String::new();
1542 let base = CHARS.len() as u64;
1543 for _ in 0..6 {
1544 let idx = usize::try_from(n % base).unwrap_or(0);
1545 out.push(CHARS[idx] as char);
1546 n /= base;
1547 }
1548 out
1549 }
1550
1551 /// True when the process holds enough privilege to mutate netlink (root, or
1552 /// at least `CAP_NET_ADMIN`). The `#[ignore]`d root-gated tests below call
1553 /// this and return early (a skip, not a failure) when run via `--ignored` on
1554 /// an unprivileged host, mirroring the rest of the crate's "skip gracefully
1555 /// when not root" convention.
1556 #[cfg(target_os = "linux")]
1557 fn have_net_admin() -> bool {
1558 // SAFETY: `geteuid` is a pure read of the caller's effective uid with no
1559 // preconditions and no side effects.
1560 #[allow(unsafe_code)]
1561 let euid = unsafe { libc::geteuid() };
1562 if euid == 0 {
1563 return true;
1564 }
1565 // Non-root: probe whether netlink link creation actually works. A failure
1566 // to even open a netlink socket / create a link means no CAP_NET_ADMIN.
1567 // We don't leave anything behind on success (the caller's test does its
1568 // own create/cleanup); this is a cheap capability sniff via a throwaway
1569 // dummy that is immediately deleted.
1570 let probe = format!("zlcap-{}", rand_suffix());
1571 if probe.len() > 15 {
1572 return false;
1573 }
1574 let Ok(rt) = tokio::runtime::Builder::new_current_thread()
1575 .enable_all()
1576 .build()
1577 else {
1578 return false;
1579 };
1580 rt.block_on(async {
1581 if create_dummy(&probe).await.is_err() {
1582 return false;
1583 }
1584 let _ = delete_link_by_name(&probe).await;
1585 true
1586 })
1587 }
1588
1589 /// Query the kernel (via `ip route`) for whether a link-scope route to
1590 /// `dest/prefix` out of `dev` is present. Returns `false` when `ip` is
1591 /// missing or the route is absent. Used by the teardown round-trip tests to
1592 /// assert a route is actually installed/removed at the kernel level rather
1593 /// than trusting the helper's return value alone.
1594 #[cfg(target_os = "linux")]
1595 fn route_present(dest: &str, prefix: u8, dev: &str) -> bool {
1596 use std::process::Command;
1597 let target = format!("{dest}/{prefix}");
1598 let Ok(out) = Command::new("ip")
1599 .args(["route", "show", &target, "dev", dev])
1600 .output()
1601 else {
1602 return false;
1603 };
1604 out.status.success() && !out.stdout.is_empty()
1605 }
1606
1607 /// Create a dummy interface with the given name (used as a stand-in
1608 /// for a host-side veth end in `bridge_add_link_membership`).
1609 #[cfg(target_os = "linux")]
1610 async fn create_dummy(name: &str) -> Result<(), NetlinkError> {
1611 let (connection, handle, _) = rtnetlink::new_connection()
1612 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
1613 tokio::spawn(connection);
1614 handle
1615 .link()
1616 .add()
1617 .dummy(name.to_string())
1618 .execute()
1619 .await
1620 .map_err(|e| NetlinkError::Netlink(format!("dummy create failed for {name}: {e}")))
1621 }
1622
1623 #[cfg(target_os = "linux")]
1624 #[tokio::test]
1625 #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1626 async fn bridge_create_idempotent() {
1627 let name = format!("zlb-{}", rand_suffix());
1628 assert!(name.len() <= 15, "interface name exceeds IFNAMSIZ: {name}");
1629
1630 // First create.
1631 create_bridge(&name).await.expect("first create_bridge");
1632 assert!(
1633 std::path::Path::new(&format!("/sys/class/net/{name}")).exists(),
1634 "bridge {name} should exist after create"
1635 );
1636
1637 // Second create on same name must be Ok.
1638 create_bridge(&name)
1639 .await
1640 .expect("second create_bridge should be idempotent");
1641
1642 // Delete and confirm gone.
1643 delete_bridge(&name).await.expect("delete_bridge");
1644 assert!(
1645 !std::path::Path::new(&format!("/sys/class/net/{name}")).exists(),
1646 "bridge {name} should be gone after delete"
1647 );
1648
1649 // Second delete on missing name must be Ok.
1650 delete_bridge(&name)
1651 .await
1652 .expect("second delete_bridge should be idempotent");
1653 }
1654
1655 /// Count the addresses currently assigned to a link (used by the flush test).
1656 #[cfg(target_os = "linux")]
1657 async fn count_addresses(name: &str) -> usize {
1658 use futures_util::stream::TryStreamExt;
1659 let (connection, handle, _) = rtnetlink::new_connection().expect("new_connection");
1660 tokio::spawn(connection);
1661 let link = handle
1662 .link()
1663 .get()
1664 .match_name(name.to_string())
1665 .execute()
1666 .try_next()
1667 .await
1668 .expect("link lookup")
1669 .expect("link present");
1670 let index = link.header.index;
1671 let addrs: Vec<_> = handle
1672 .address()
1673 .get()
1674 .set_link_index_filter(index)
1675 .execute()
1676 .try_collect()
1677 .await
1678 .expect("addr list");
1679 addrs.len()
1680 }
1681
1682 /// Regression for the dual `/28` + `/26` leak: a bridge re-created over a
1683 /// surviving link used to stack the new gateway on top of the stale one
1684 /// because nothing flushed first. `flush_addresses_on_link_by_name` must
1685 /// wipe all addresses so a re-add yields exactly one.
1686 #[cfg(target_os = "linux")]
1687 #[tokio::test]
1688 #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1689 async fn bridge_address_flush_removes_stale_then_single_readd() {
1690 use std::net::{IpAddr, Ipv4Addr};
1691 let name = format!("zlb-{}", rand_suffix());
1692 assert!(name.len() <= 15, "interface name exceeds IFNAMSIZ: {name}");
1693 create_bridge(&name).await.expect("create_bridge");
1694
1695 // Simulate the leak: a stale /28 plus a re-allocated /26 on one bridge.
1696 add_address_to_link_by_name(&name, IpAddr::V4(Ipv4Addr::new(10, 9, 0, 1)), 28)
1697 .await
1698 .expect("add /28");
1699 add_address_to_link_by_name(&name, IpAddr::V4(Ipv4Addr::new(10, 9, 1, 1)), 26)
1700 .await
1701 .expect("add /26");
1702 assert!(
1703 count_addresses(&name).await >= 2,
1704 "both addresses should be present before flush"
1705 );
1706
1707 // Flush wipes every address.
1708 flush_addresses_on_link_by_name(&name).await.expect("flush");
1709 assert_eq!(
1710 count_addresses(&name).await,
1711 0,
1712 "flush should remove all addresses"
1713 );
1714
1715 // Re-add exactly the gateway → exactly one address (the link stays down,
1716 // so no IPv6 link-local re-appears).
1717 add_address_to_link_by_name(&name, IpAddr::V4(Ipv4Addr::new(10, 9, 1, 1)), 26)
1718 .await
1719 .expect("re-add /26");
1720 assert_eq!(
1721 count_addresses(&name).await,
1722 1,
1723 "exactly one address after flush + re-add"
1724 );
1725
1726 delete_bridge(&name).await.expect("delete_bridge");
1727 }
1728
1729 /// The teardown-by-name and flush-before-add paths both call into the netlink
1730 /// helpers for links that may not exist; flushing an absent link must be a
1731 /// tolerant no-op, not an error.
1732 #[cfg(target_os = "linux")]
1733 #[tokio::test]
1734 #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1735 async fn flush_addresses_on_absent_link_is_ok() {
1736 let name = format!("zlb-{}", rand_suffix());
1737 flush_addresses_on_link_by_name(&name)
1738 .await
1739 .expect("flush on absent link should be Ok");
1740 }
1741
1742 #[cfg(target_os = "linux")]
1743 #[tokio::test]
1744 #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1745 async fn bridge_add_link_membership() {
1746 let suffix = rand_suffix();
1747 let bridge = format!("zlb-{suffix}");
1748 let dummy = format!("zld-{suffix}");
1749 assert!(bridge.len() <= 15);
1750 assert!(dummy.len() <= 15);
1751
1752 create_bridge(&bridge).await.expect("create_bridge");
1753 create_dummy(&dummy).await.expect("create_dummy");
1754
1755 add_link_to_bridge(&dummy, &bridge)
1756 .await
1757 .expect("add_link_to_bridge");
1758
1759 // The dummy's master/ifindex symlink should resolve to the
1760 // bridge's ifindex.
1761 let master_ifindex_path = format!("/sys/class/net/{dummy}/master/ifindex");
1762 let dummy_master_ifindex = std::fs::read_to_string(&master_ifindex_path)
1763 .expect("read dummy master ifindex")
1764 .trim()
1765 .parse::<u32>()
1766 .expect("parse dummy master ifindex");
1767
1768 let bridge_ifindex = std::fs::read_to_string(format!("/sys/class/net/{bridge}/ifindex"))
1769 .expect("read bridge ifindex")
1770 .trim()
1771 .parse::<u32>()
1772 .expect("parse bridge ifindex");
1773
1774 assert_eq!(
1775 dummy_master_ifindex, bridge_ifindex,
1776 "dummy's master ifindex should equal bridge's ifindex"
1777 );
1778
1779 // Cleanup.
1780 delete_link_by_name(&dummy).await.expect("delete dummy");
1781 delete_bridge(&bridge).await.expect("delete bridge");
1782 }
1783
1784 /// `bridge_member_count` must read the kernel's `brif` directory: an empty
1785 /// freshly-created bridge has 0 members, and attaching one dummy link yields
1786 /// exactly 1. This is the signal the orphan prune uses to decide a candidate
1787 /// bridge is idle and reclaimable.
1788 #[cfg(target_os = "linux")]
1789 #[tokio::test]
1790 #[ignore = "requires root or CAP_NET_ADMIN"]
1791 async fn bridge_member_count_counts_brif_entries() {
1792 let suffix = rand_suffix();
1793 let bridge = format!("zlb-{suffix}");
1794 let dummy = format!("zld-{suffix}");
1795 assert!(bridge.len() <= 15);
1796 assert!(dummy.len() <= 15);
1797
1798 // Fresh bridge: zero members.
1799 create_bridge(&bridge).await.expect("create_bridge");
1800 assert_eq!(
1801 bridge_member_count(&bridge).await,
1802 0,
1803 "freshly-created bridge should have 0 members"
1804 );
1805
1806 // Attach a dummy link → exactly one member.
1807 create_dummy(&dummy).await.expect("create_dummy");
1808 add_link_to_bridge(&dummy, &bridge)
1809 .await
1810 .expect("add_link_to_bridge");
1811 assert_eq!(
1812 bridge_member_count(&bridge).await,
1813 1,
1814 "bridge with one attached link should report 1 member"
1815 );
1816
1817 // Cleanup the links.
1818 delete_link_by_name(&dummy).await.expect("delete dummy");
1819 delete_bridge(&bridge).await.expect("delete bridge");
1820 }
1821
1822 #[cfg(target_os = "linux")]
1823 #[tokio::test]
1824 #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
1825 async fn bridge_stp_off() {
1826 let name = format!("zlb-{}", rand_suffix());
1827 assert!(name.len() <= 15);
1828
1829 create_bridge(&name).await.expect("create_bridge");
1830
1831 set_bridge_stp(&name, false).expect("set_bridge_stp off");
1832 let stp_state = std::fs::read_to_string(format!("/sys/class/net/{name}/bridge/stp_state"))
1833 .expect("read stp_state")
1834 .trim()
1835 .to_string();
1836 assert_eq!(
1837 stp_state, "0",
1838 "stp_state should be 0 after set_bridge_stp(false)"
1839 );
1840
1841 // Cleanup.
1842 delete_bridge(&name).await.expect("delete_bridge");
1843 }
1844
1845 /// Full teardown round-trip for the host-side overlay netlink resources the
1846 /// daemon reverts on shutdown: create a veth pair + a bridge, install a host
1847 /// `/32` link-scope route via the host veth (the bridgeless per-container
1848 /// attach shape), then run the exact delete helpers
1849 /// `teardown_global_overlay` uses — `delete_route_via_dev`,
1850 /// `delete_link_by_name` — and assert each resource is actually gone at the
1851 /// kernel level (via `/sys/class/net` and `ip route`).
1852 ///
1853 /// This is the regression for the teardown fix: it validates the new
1854 /// `delete_route_via_dev` helper round-trips a real route AND that the delete
1855 /// idempotency holds (a second delete of an already-absent route/link must be
1856 /// `Ok`). Uses unique <=15-char names and tears everything down on every exit
1857 /// path *before* asserting, so a failed assertion still leaves the host clean.
1858 #[cfg(target_os = "linux")]
1859 #[tokio::test(flavor = "multi_thread")]
1860 #[ignore = "requires root or CAP_NET_ADMIN"]
1861 async fn teardown_deletes_route_veth_and_bridge() {
1862 use std::net::{IpAddr, Ipv4Addr};
1863
1864 if !have_net_admin() {
1865 eprintln!("skipping teardown_deletes_route_veth_and_bridge: no CAP_NET_ADMIN");
1866 return;
1867 }
1868
1869 let suffix = rand_suffix();
1870 let veth_host = format!("vh-{suffix}");
1871 let veth_peer = format!("vp-{suffix}");
1872 let bridge = format!("zlb-{suffix}");
1873 assert!(veth_host.len() <= 15, "veth host name exceeds IFNAMSIZ");
1874 assert!(veth_peer.len() <= 15, "veth peer name exceeds IFNAMSIZ");
1875 assert!(bridge.len() <= 15, "bridge name exceeds IFNAMSIZ");
1876
1877 let dest = IpAddr::V4(Ipv4Addr::new(10, 222, 0, 7));
1878 let dest_str = "10.222.0.7";
1879 let prefix: u8 = 32;
1880
1881 // --- setup: veth pair, bridge, and a /32 host route via the host veth ---
1882 create_veth_pair(&veth_host, &veth_peer)
1883 .await
1884 .expect("create_veth_pair");
1885 create_bridge(&bridge).await.expect("create_bridge");
1886 replace_route_via_dev(dest, prefix, &veth_host, None)
1887 .await
1888 .expect("replace_route_via_dev installs /32");
1889
1890 // Snapshot kernel presence BEFORE teardown.
1891 let route_was_present = route_present(dest_str, prefix, &veth_host);
1892 let veth_was_present =
1893 std::path::Path::new(&format!("/sys/class/net/{veth_host}")).exists();
1894 let bridge_was_present = std::path::Path::new(&format!("/sys/class/net/{bridge}")).exists();
1895
1896 // --- teardown: the exact helper sequence teardown_global_overlay drives,
1897 // in the same order (route first since it references the veth as its oif,
1898 // then the host veth, then the bridge). Collect results; the deletes are
1899 // best-effort so we capture them and run a belt-and-braces cleanup of any
1900 // straggler before asserting, keeping the host clean on a failed assert.
1901 let del_route = delete_route_via_dev(dest, prefix, &veth_host).await;
1902 let del_veth = delete_link_by_name(&veth_host).await;
1903 let del_bridge = delete_link_by_name(&bridge).await;
1904
1905 // Idempotency: a second delete of the now-absent route/links must be Ok
1906 // (this is what lets teardown run unconditionally over per-container
1907 // detach leftovers without aborting).
1908 let del_route_again = delete_route_via_dev(dest, prefix, &veth_host).await;
1909 let del_veth_again = delete_link_by_name(&veth_host).await;
1910 let del_bridge_again = delete_link_by_name(&bridge).await;
1911
1912 // Snapshot kernel absence AFTER teardown.
1913 let route_gone = !route_present(dest_str, prefix, &veth_host);
1914 let veth_gone = !std::path::Path::new(&format!("/sys/class/net/{veth_host}")).exists();
1915 let bridge_gone = !std::path::Path::new(&format!("/sys/class/net/{bridge}")).exists();
1916
1917 // Belt-and-braces: ensure nothing leaks even if an assertion below fails.
1918 let _ = delete_route_via_dev(dest, prefix, &veth_host).await;
1919 let _ = delete_link_by_name(&veth_host).await;
1920 let _ = delete_link_by_name(&veth_peer).await;
1921 let _ = delete_link_by_name(&bridge).await;
1922
1923 // --- assertions (all after cleanup) ---
1924 assert!(
1925 route_was_present,
1926 "the /32 route should exist after install"
1927 );
1928 assert!(veth_was_present, "the host veth should exist after create");
1929 assert!(bridge_was_present, "the bridge should exist after create");
1930
1931 del_route.expect("delete_route_via_dev should succeed");
1932 del_veth.expect("delete_link_by_name(veth) should succeed");
1933 del_bridge.expect("delete_link_by_name(bridge) should succeed");
1934
1935 del_route_again.expect("second delete_route_via_dev should be idempotent Ok");
1936 del_veth_again.expect("second delete_link_by_name(veth) should be idempotent Ok");
1937 del_bridge_again.expect("second delete_link_by_name(bridge) should be idempotent Ok");
1938
1939 assert!(route_gone, "the /32 route should be gone after teardown");
1940 assert!(veth_gone, "the host veth should be gone after teardown");
1941 assert!(bridge_gone, "the bridge should be gone after teardown");
1942 }
1943}