Skip to main content

zlayer_overlayd/
netlink.rs

1//! Rust netlink helpers that replace shell-outs to `ip`/`nsenter`/`sysctl`
2//! for per-container overlay network setup.
3//!
4//! This module is populated incrementally through a phased migration.
5//! Stage 1: `move_link_into_netns_and_rename` replaces the shell pair
6//!          `ip link set <name> netns <pid>` + `nsenter -t <pid> -n ip
7//!          link set <name> name <new>` with a single atomic RTNETLINK
8//!          `SetLink` carrying both `IFLA_NET_NS_FD` and `IFLA_IFNAME`.
9//!          This bypasses the `/proc/<pid>/ns/net` access problem caused
10//!          by libcontainer setting `PR_SET_DUMPABLE(false)` on the
11//!          container init process under `SELinux` enforcing.
12//! Stage 2: `create_veth_pair`, `delete_link_by_name`, and
13//!          `set_link_up_by_name` replace the host-side veth shell
14//!          commands (`ip link add ... type veth peer name ...`,
15//!          `ip link delete ...`, `ip link set ... up`) used by
16//!          `overlay_manager::attach_to_interface` and the orphan
17//!          sweeper. These helpers talk RTNETLINK directly via the
18//!          `rtnetlink` crate (async, tokio-backed).
19//! Stage 3: `with_netns`, `add_address_to_link_by_name`, and
20//!          `add_default_route_via_dev` replace the remaining
21//!          container-netns shell-outs in
22//!          `overlay_manager::attach_to_interface`. `with_netns`
23//!          runs a closure on a dedicated OS thread that has joined
24//!          the target container's network namespace via `setns(2)`,
25//!          while the two new RTNETLINK helpers operate on the
26//!          current netns (so they must be invoked from inside a
27//!          `with_netns` closure). This removes the last three
28//!          `nsenter -t <pid> -n ip ...` shell-outs used to assign
29//!          the container IP, bring `eth0` / `lo` up, and add the
30//!          default route.
31
32#![cfg_attr(
33    not(target_os = "linux"),
34    allow(clippy::missing_errors_doc, clippy::unused_async)
35)]
36
37use thiserror::Error;
38
39/// Errors returned by the netlink helpers in this module.
40#[derive(Debug, Error)]
41pub enum NetlinkError {
42    /// Failed to open or access a file (typically `/proc/<pid>/ns/net`).
43    #[error("io error: {0}")]
44    Io(#[from] std::io::Error),
45
46    /// The requested link was not found in the current network namespace.
47    #[error("link '{0}' not found in current netns")]
48    NotFound(String),
49
50    /// A netlink operation failed.
51    #[error("netlink operation failed: {0}")]
52    Netlink(String),
53}
54
55/// Move a link from the current network namespace into the network
56/// namespace referenced by `ns_fd`, renaming it in the same atomic
57/// operation.
58///
59/// This is the fd-based variant of [`move_link_into_netns_and_rename`].
60/// Callers that have already opened `/proc/<pid>/ns/net` (e.g. to pin
61/// the namespace across multiple operations and survive a racing
62/// container init exit) should use this form so we don't reopen the
63/// path and lose the race.
64///
65/// The single RTNETLINK `SetLink` request carries both `IFLA_NET_NS_FD`
66/// and `IFLA_IFNAME`, so the kernel performs the move and the rename
67/// atomically.
68///
69/// # Errors
70///
71/// Returns [`NetlinkError::NotFound`] if `link_name` does not exist in
72/// the current netns. Returns [`NetlinkError::Netlink`] for any other
73/// netlink-level failure (permission denied, name collision in the
74/// target netns, etc.).
75///
76/// Implemented directly against the `rtnetlink` crate (overlayd has no
77/// libcontainer dependency): a single `LinkSetRequest` carrying
78/// `setns_by_fd` + `name` performs the move and rename atomically.
79#[cfg(target_os = "linux")]
80pub fn move_link_into_netns_fd_and_rename(
81    link_name: &str,
82    ns_fd: std::os::fd::BorrowedFd<'_>,
83    new_name: &str,
84) -> Result<(), NetlinkError> {
85    use std::os::fd::AsRawFd;
86
87    // `setns` of the moved link must reference the fd while the request
88    // executes, so we drive the whole sequence on a local current-thread
89    // runtime rather than requiring an ambient tokio context. The raw fd
90    // is borrowed (the caller retains ownership of `ns_fd`).
91    let raw_fd = ns_fd.as_raw_fd();
92    let link_name = link_name.to_string();
93    let new_name = new_name.to_string();
94
95    let rt = tokio::runtime::Builder::new_current_thread()
96        .enable_all()
97        .build()
98        .map_err(|e| NetlinkError::Netlink(format!("local runtime build failed: {e}")))?;
99
100    rt.block_on(async move {
101        use futures_util::stream::TryStreamExt;
102
103        let (connection, handle, _) = rtnetlink::new_connection()
104            .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
105        tokio::spawn(connection);
106
107        // Resolve the host-side interface index. Treat "No such device"
108        // as our dedicated NotFound variant so callers can distinguish
109        // "nothing to move" from real failures.
110        let link = handle
111            .link()
112            .get()
113            .match_name(link_name.clone())
114            .execute()
115            .try_next()
116            .await
117            .map_err(|e| {
118                let msg = e.to_string();
119                if msg.contains("No such device") {
120                    NetlinkError::NotFound(link_name.clone())
121                } else {
122                    NetlinkError::Netlink(format!("link lookup failed for {link_name}: {msg}"))
123                }
124            })?
125            .ok_or_else(|| NetlinkError::NotFound(link_name.clone()))?;
126
127        let index = link.header.index;
128
129        // Atomically move the link into the target netns and rename it.
130        handle
131            .link()
132            .set(index)
133            .setns_by_fd(raw_fd)
134            .name(new_name.clone())
135            .execute()
136            .await
137            .map_err(|e| {
138                NetlinkError::Netlink(format!(
139                    "setns_by_fd(index={index}, new_name={new_name}) failed: {e}"
140                ))
141            })
142    })
143}
144
/// Placeholder for Unix targets other than Linux (macOS, the BSDs).
///
/// The item is compiled only when `unix` is set and the target is not
/// Linux; it is absent on Windows, where the `BorrowedFd` parameter
/// type is unavailable.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`]: the operation is
/// not supported on this target.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn move_link_into_netns_fd_and_rename(
    _link_name: &str,
    _ns_fd: std::os::fd::BorrowedFd<'_>,
    _new_name: &str,
) -> Result<(), NetlinkError> {
    let reason = String::from("move_link_into_netns_fd_and_rename is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
165
166/// Move a link from the current network namespace into the target PID's
167/// network namespace, renaming it in the same atomic operation.
168///
169/// Thin wrapper around [`move_link_into_netns_fd_and_rename`] that
170/// opens `/proc/<target_pid>/ns/net` then delegates. Kept for
171/// backward compatibility and for callers that only need a single
172/// operation on the target netns. Callers that need to perform
173/// multiple operations on the same netns (and want to survive a
174/// racing exit of the container init process) should open the fd
175/// themselves and call [`move_link_into_netns_fd_and_rename`]
176/// directly.
177///
178/// # Errors
179///
180/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot be
181/// opened (e.g. the container process is gone or is not dumpable and we
182/// lack `CAP_SYS_PTRACE`). Returns [`NetlinkError::NotFound`] if
183/// `link_name` does not exist in the current netns. Returns
184/// [`NetlinkError::Netlink`] for any other netlink-level failure
185/// (permission denied, name collision in the target netns, etc.).
186#[cfg(target_os = "linux")]
187pub fn move_link_into_netns_and_rename(
188    link_name: &str,
189    target_pid: u32,
190    new_name: &str,
191) -> Result<(), NetlinkError> {
192    use std::os::fd::{AsFd, OwnedFd};
193
194    let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
195    let ns_fd: OwnedFd = OwnedFd::from(ns_file);
196    move_link_into_netns_fd_and_rename(link_name, ns_fd.as_fd(), new_name)
197}
198
/// Non-Linux placeholder that keeps the signature available so callers
/// compile on every platform.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`]: the operation is
/// not supported on this target.
#[cfg(not(target_os = "linux"))]
pub fn move_link_into_netns_and_rename(
    _link_name: &str,
    _target_pid: u32,
    _new_name: &str,
) -> Result<(), NetlinkError> {
    let reason = String::from("move_link_into_netns_and_rename is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
212
213/// Create a veth pair with the two ends named `host_name` and `peer_name`.
214///
215/// Both ends start in the current network namespace. The caller is
216/// responsible for moving the peer end into the container netns (see
217/// [`move_link_into_netns_and_rename`]) and bringing the host end up
218/// (see [`set_link_up_by_name`]).
219///
220/// Replaces the shell-out:
221///   ip link add `<host_name>` type veth peer name `<peer_name>`
222///
223/// # Errors
224///
225/// Returns [`NetlinkError::Netlink`] if RTNETLINK fails for any
226/// reason. `EEXIST` / "File exists" is surfaced verbatim so the caller
227/// can distinguish a leaked endpoint (typically a sign the orphan
228/// sweeper missed something) from a permission or interface-name
229/// problem.
230#[cfg(target_os = "linux")]
231pub async fn create_veth_pair(host_name: &str, peer_name: &str) -> Result<(), NetlinkError> {
232    let (connection, handle, _) = rtnetlink::new_connection()
233        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
234    tokio::spawn(connection);
235
236    handle
237        .link()
238        .add()
239        .veth(host_name.to_string(), peer_name.to_string())
240        .execute()
241        .await
242        .map_err(|e| {
243            let msg = e.to_string();
244            if msg.contains("File exists") || msg.contains("EEXIST") {
245                NetlinkError::Netlink(format!(
246                    "veth pair already exists: host={host_name} peer={peer_name}: {msg}"
247                ))
248            } else {
249                NetlinkError::Netlink(format!(
250                    "veth create failed (host={host_name}, peer={peer_name}): {msg}"
251                ))
252            }
253        })
254}
255
/// Non-Linux placeholder for `create_veth_pair`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn create_veth_pair(_host_name: &str, _peer_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("create_veth_pair is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
263
264/// Delete the link by name. Idempotent: returns `Ok(())` if the link
265/// does not exist. Any other error surfaces as
266/// [`NetlinkError::Netlink`].
267///
268/// Replaces the shell-out:
269///   ip link delete `<name>`
270///
271/// Used in `overlay_manager::attach_to_interface` pre-cleanup,
272/// cleanup-on-error, and the orphan-veth sweeper.
273///
274/// # Errors
275///
276/// Returns [`NetlinkError::Netlink`] if RTNETLINK reports a failure
277/// other than `ENODEV` / "No such device" (which are treated as
278/// success so this is safe to call unconditionally).
279#[cfg(target_os = "linux")]
280pub async fn delete_link_by_name(name: &str) -> Result<(), NetlinkError> {
281    use futures_util::stream::TryStreamExt;
282
283    let (connection, handle, _) = rtnetlink::new_connection()
284        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
285    tokio::spawn(connection);
286
287    // Look up the link by name. Treat "not found" as success so the
288    // helper is safe to call unconditionally in cleanup paths.
289    let lookup = handle
290        .link()
291        .get()
292        .match_name(name.to_string())
293        .execute()
294        .try_next()
295        .await;
296
297    let link = match lookup {
298        Ok(Some(link)) => link,
299        Ok(None) => return Ok(()),
300        Err(rtnetlink::Error::NetlinkError(err)) => {
301            // libc::ENODEV == 19. netlink-packet-core reports the raw
302            // errno as a negative i32 in `code`, but the exact type has
303            // moved between versions, so match by both numeric code and
304            // the human-readable message for belt-and-suspenders safety.
305            let msg = err.to_string();
306            let is_enodev = err
307                .code
308                .is_some_and(|c| c.get().unsigned_abs() == libc::ENODEV as u32);
309            if is_enodev || msg.contains("No such device") {
310                return Ok(());
311            }
312            return Err(NetlinkError::Netlink(format!(
313                "link lookup failed for {name}: {msg}"
314            )));
315        }
316        Err(e) => {
317            let msg = e.to_string();
318            if msg.contains("No such device") {
319                return Ok(());
320            }
321            return Err(NetlinkError::Netlink(format!(
322                "link lookup failed for {name}: {msg}"
323            )));
324        }
325    };
326
327    let index = link.header.index;
328
329    handle
330        .link()
331        .del(index)
332        .execute()
333        .await
334        .map_err(|e| NetlinkError::Netlink(format!("link delete failed for {name}: {e}")))
335}
336
/// Non-Linux placeholder for `delete_link_by_name`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn delete_link_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("delete_link_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
344
345/// List all network interfaces in the current netns.
346///
347/// Returns a `Vec` of `(index, name)` tuples for every link the kernel
348/// reports. Used by the orphan veth sweeper to find `veth-<pid>` and
349/// `vc-<pid>` links whose owning PID is dead, so it can clean them up
350/// via [`delete_link_by_name`].
351///
352/// Replaces the shell-out:
353///   ip -br link
354///
355/// Issues a single RTNETLINK `RTM_GETLINK` dump request and iterates
356/// the resulting stream of `LinkMessage`s. Each message contributes
357/// one `(index, name)` tuple; messages without an `IFLA_IFNAME`
358/// attribute (extremely rare in practice — the kernel always emits
359/// one for configured devices) are silently skipped.
360///
361/// # Errors
362///
363/// Returns [`NetlinkError::Netlink`] if the rtnetlink socket cannot
364/// be created or if the dump stream itself reports a failure.
365#[cfg(target_os = "linux")]
366pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
367    use futures_util::stream::TryStreamExt;
368    use netlink_packet_route::link::LinkAttribute;
369
370    let (connection, handle, _) = rtnetlink::new_connection()
371        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
372    tokio::spawn(connection);
373
374    let mut stream = handle.link().get().execute();
375    let mut links = Vec::new();
376
377    while let Some(msg) = stream
378        .try_next()
379        .await
380        .map_err(|e| NetlinkError::Netlink(format!("link dump failed: {e}")))?
381    {
382        // LinkHeader.index is already u32 in netlink-packet-route
383        // 0.19 — no cast needed.
384        let index = msg.header.index;
385        let Some(name) = msg.attributes.iter().find_map(|a| match a {
386            LinkAttribute::IfName(n) => Some(n.clone()),
387            _ => None,
388        }) else {
389            continue;
390        };
391        links.push((index, name));
392    }
393
394    Ok(links)
395}
396
/// Non-Linux placeholder for `list_all_links`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
    let reason = String::from("list_all_links is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
404
405/// Set the link identified by `name` to the "up" administrative state.
406///
407/// Replaces the shell-out:
408///   ip link set `<name>` up
409///
410/// Unlike [`delete_link_by_name`] this is *not* idempotent for missing
411/// links: if the link does not exist the caller almost certainly has a
412/// bug upstream (we only call this on a veth end we just created), so
413/// we return [`NetlinkError::NotFound`] rather than silently succeeding.
414///
415/// # Errors
416///
417/// Returns [`NetlinkError::NotFound`] if no link with the given name
418/// exists in the current netns. Returns [`NetlinkError::Netlink`] for
419/// any other RTNETLINK failure (permission denied, etc.).
420#[cfg(target_os = "linux")]
421pub async fn set_link_up_by_name(name: &str) -> Result<(), NetlinkError> {
422    use futures_util::stream::TryStreamExt;
423
424    let (connection, handle, _) = rtnetlink::new_connection()
425        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
426    tokio::spawn(connection);
427
428    let link = handle
429        .link()
430        .get()
431        .match_name(name.to_string())
432        .execute()
433        .try_next()
434        .await
435        .map_err(|e| {
436            let msg = e.to_string();
437            if msg.contains("No such device") {
438                NetlinkError::NotFound(name.to_string())
439            } else {
440                NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
441            }
442        })?
443        .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
444
445    let index = link.header.index;
446
447    handle
448        .link()
449        .set(index)
450        .up()
451        .execute()
452        .await
453        .map_err(|e| NetlinkError::Netlink(format!("link set up failed for {name}: {e}")))
454}
455
/// Non-Linux placeholder for `set_link_up_by_name`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn set_link_up_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("set_link_up_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
463
464/// Add an IP address to the link identified by `name` in the current
465/// network namespace.
466///
467/// Replaces (in combination with [`with_netns`]):
468///   nsenter -t `<pid>` -n ip \[-6\] addr add `<addr>/<prefix_len>` dev `<name>`
469///
470/// `addr` may be v4 or v6. `prefix_len` is the CIDR prefix length
471/// (24 for a `/24`, 64 for a `/64`, etc.).
472///
473/// This helper operates on the CURRENT network namespace — it looks
474/// up the interface index via a local rtnetlink socket. To target a
475/// container's netns, wrap the call inside [`with_netns`].
476///
477/// # Errors
478///
479/// Returns [`NetlinkError::NotFound`] if the link is missing. Returns
480/// [`NetlinkError::Netlink`] for any other rtnetlink failure
481/// (permission denied, EEXIST on a duplicate address, etc.).
482#[cfg(target_os = "linux")]
483pub async fn add_address_to_link_by_name(
484    name: &str,
485    addr: std::net::IpAddr,
486    prefix_len: u8,
487) -> Result<(), NetlinkError> {
488    use futures_util::stream::TryStreamExt;
489
490    let (connection, handle, _) = rtnetlink::new_connection()
491        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
492    tokio::spawn(connection);
493
494    let link = handle
495        .link()
496        .get()
497        .match_name(name.to_string())
498        .execute()
499        .try_next()
500        .await
501        .map_err(|e| {
502            let msg = e.to_string();
503            if msg.contains("No such device") {
504                NetlinkError::NotFound(name.to_string())
505            } else {
506                NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
507            }
508        })?
509        .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
510
511    let index = link.header.index;
512
513    handle
514        .address()
515        .add(index, addr, prefix_len)
516        .execute()
517        .await
518        .map_err(|e| {
519            NetlinkError::Netlink(format!(
520                "address add failed for {name} ({addr}/{prefix_len}): {e}"
521            ))
522        })
523}
524
/// Non-Linux placeholder for `add_address_to_link_by_name`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_address_to_link_by_name(
    _name: &str,
    _addr: std::net::IpAddr,
    _prefix_len: u8,
) -> Result<(), NetlinkError> {
    let reason = String::from("add_address_to_link_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
536
537/// Add a default route via the given device name in the current
538/// network namespace.
539///
540/// Replaces (in combination with [`with_netns`]):
541///   nsenter -t `<pid>` -n ip \[-6\] route add default dev `<dev_name>`
542///
543/// The route is a direct, link-scope route: no gateway, the kernel
544/// ARPs / uses NDISC on the device for destination resolution. This
545/// is the correct form for a point-to-point veth link where the peer
546/// is reachable directly.
547///
548/// For IPv4 the destination prefix is `0.0.0.0/0`. For IPv6 it is
549/// `::/0`. Controlled by `is_v6`.
550///
551/// # Errors
552///
553/// Returns [`NetlinkError::NotFound`] if the device is missing.
554/// Returns [`NetlinkError::Netlink`] for any other rtnetlink failure.
555#[cfg(target_os = "linux")]
556pub async fn add_default_route_via_dev(dev_name: &str, is_v6: bool) -> Result<(), NetlinkError> {
557    use futures_util::stream::TryStreamExt;
558    use netlink_packet_route::route::RouteScope;
559
560    let (connection, handle, _) = rtnetlink::new_connection()
561        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
562    tokio::spawn(connection);
563
564    let link = handle
565        .link()
566        .get()
567        .match_name(dev_name.to_string())
568        .execute()
569        .try_next()
570        .await
571        .map_err(|e| {
572            let msg = e.to_string();
573            if msg.contains("No such device") {
574                NetlinkError::NotFound(dev_name.to_string())
575            } else {
576                NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
577            }
578        })?
579        .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
580
581    let oif_idx = link.header.index;
582
583    if is_v6 {
584        handle
585            .route()
586            .add()
587            .v6()
588            .destination_prefix(std::net::Ipv6Addr::UNSPECIFIED, 0)
589            .output_interface(oif_idx)
590            .scope(RouteScope::Link)
591            .execute()
592            .await
593            .map_err(|e| {
594                NetlinkError::Netlink(format!("default route add v6 via {dev_name} failed: {e}"))
595            })
596    } else {
597        handle
598            .route()
599            .add()
600            .v4()
601            .destination_prefix(std::net::Ipv4Addr::UNSPECIFIED, 0)
602            .output_interface(oif_idx)
603            .scope(RouteScope::Link)
604            .execute()
605            .await
606            .map_err(|e| {
607                NetlinkError::Netlink(format!("default route add v4 via {dev_name} failed: {e}"))
608            })
609    }
610}
611
/// Non-Linux placeholder for `add_default_route_via_dev`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_default_route_via_dev(_dev_name: &str, _is_v6: bool) -> Result<(), NetlinkError> {
    let reason = String::from("add_default_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
619
620/// Add a default route pointing at the given gateway IP in the current
621/// network namespace.
622///
623/// Replaces (in combination with [`with_netns`]):
624///   nsenter -t `<pid>` -n ip \[-6\] route add default via `<gateway>`
625///
626/// Used by the per-service bridge attach path: containers join the
627/// service bridge via a veth pair and reach the rest of the overlay
628/// through the bridge's L3 gateway IP. The address family of the route
629/// is inferred from `gateway`.
630///
631/// # Errors
632///
633/// Returns [`NetlinkError::Netlink`] for any rtnetlink failure.
634#[cfg(target_os = "linux")]
635pub async fn add_default_route_via_gateway(gateway: std::net::IpAddr) -> Result<(), NetlinkError> {
636    let (connection, handle, _) = rtnetlink::new_connection()
637        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
638    tokio::spawn(connection);
639
640    match gateway {
641        std::net::IpAddr::V4(gw) => handle
642            .route()
643            .add()
644            .v4()
645            .destination_prefix(std::net::Ipv4Addr::UNSPECIFIED, 0)
646            .gateway(gw)
647            .execute()
648            .await
649            .map_err(|e| {
650                NetlinkError::Netlink(format!("default route add v4 via gateway {gw} failed: {e}"))
651            }),
652        std::net::IpAddr::V6(gw) => handle
653            .route()
654            .add()
655            .v6()
656            .destination_prefix(std::net::Ipv6Addr::UNSPECIFIED, 0)
657            .gateway(gw)
658            .execute()
659            .await
660            .map_err(|e| {
661                NetlinkError::Netlink(format!("default route add v6 via gateway {gw} failed: {e}"))
662            }),
663    }
664}
665
/// Non-Linux placeholder for `add_default_route_via_gateway`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_default_route_via_gateway(_gateway: std::net::IpAddr) -> Result<(), NetlinkError> {
    let reason = String::from("add_default_route_via_gateway is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
673
674/// Add or replace a route to `dest/prefix_len` that forwards via the
675/// interface named `dev_name`. Optional `src` sets the preferred source
676/// address.
677///
678/// Replaces the shell-outs:
679///   ip route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
680///   ip -6 route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
681///
682/// Uses `NLM_F_REPLACE | NLM_F_CREATE` semantics (via rtnetlink's
683/// `.replace()` on the route add builder) so stale routes left behind
684/// by a previous daemon run don't cause `EEXIST`.
685///
686/// The route is installed with link scope (direct-via-dev, no
687/// gateway) which is the correct form for a per-container `/32` or
688/// `/128` pointing at a host-side veth endpoint.
689///
690/// `dest` and `src` (if provided) must have matching address families
691/// — passing a v4 `dest` with a v6 `src` returns
692/// [`NetlinkError::Netlink`] without touching the kernel.
693///
694/// # Errors
695///
696/// Returns [`NetlinkError::NotFound`] if `dev_name` does not exist in
697/// the current netns. Returns [`NetlinkError::Netlink`] on address
698/// family mismatch or any RTNETLINK failure.
699#[cfg(target_os = "linux")]
700pub async fn replace_route_via_dev(
701    dest: std::net::IpAddr,
702    prefix_len: u8,
703    dev_name: &str,
704    src: Option<std::net::IpAddr>,
705) -> Result<(), NetlinkError> {
706    use std::net::IpAddr;
707
708    use futures_util::stream::TryStreamExt;
709    use netlink_packet_route::route::RouteScope;
710
711    let (connection, handle, _) = rtnetlink::new_connection()
712        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
713    tokio::spawn(connection);
714
715    let link = handle
716        .link()
717        .get()
718        .match_name(dev_name.to_string())
719        .execute()
720        .try_next()
721        .await
722        .map_err(|e| {
723            let msg = e.to_string();
724            if msg.contains("No such device") {
725                NetlinkError::NotFound(dev_name.to_string())
726            } else {
727                NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
728            }
729        })?
730        .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
731
732    let oif_idx = link.header.index;
733
734    match (dest, src) {
735        (IpAddr::V4(d), Some(IpAddr::V4(s))) => handle
736            .route()
737            .add()
738            .v4()
739            .destination_prefix(d, prefix_len)
740            .output_interface(oif_idx)
741            .scope(RouteScope::Link)
742            .pref_source(s)
743            .replace()
744            .execute()
745            .await
746            .map_err(|e| {
747                NetlinkError::Netlink(format!(
748                    "route replace v4 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
749                ))
750            }),
751        (IpAddr::V4(d), None) => handle
752            .route()
753            .add()
754            .v4()
755            .destination_prefix(d, prefix_len)
756            .output_interface(oif_idx)
757            .scope(RouteScope::Link)
758            .replace()
759            .execute()
760            .await
761            .map_err(|e| {
762                NetlinkError::Netlink(format!(
763                    "route replace v4 {d}/{prefix_len} dev {dev_name} failed: {e}"
764                ))
765            }),
766        (IpAddr::V6(d), Some(IpAddr::V6(s))) => handle
767            .route()
768            .add()
769            .v6()
770            .destination_prefix(d, prefix_len)
771            .output_interface(oif_idx)
772            .scope(RouteScope::Link)
773            .pref_source(s)
774            .replace()
775            .execute()
776            .await
777            .map_err(|e| {
778                NetlinkError::Netlink(format!(
779                    "route replace v6 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
780                ))
781            }),
782        (IpAddr::V6(d), None) => handle
783            .route()
784            .add()
785            .v6()
786            .destination_prefix(d, prefix_len)
787            .output_interface(oif_idx)
788            .scope(RouteScope::Link)
789            .replace()
790            .execute()
791            .await
792            .map_err(|e| {
793                NetlinkError::Netlink(format!(
794                    "route replace v6 {d}/{prefix_len} dev {dev_name} failed: {e}"
795                ))
796            }),
797        (IpAddr::V4(_), Some(IpAddr::V6(_))) | (IpAddr::V6(_), Some(IpAddr::V4(_))) => Err(
798            NetlinkError::Netlink(format!("address family mismatch: dest={dest} src={src:?}")),
799        ),
800    }
801}
802
/// Non-Linux placeholder for `replace_route_via_dev`.
///
/// # Errors
///
/// Unconditionally returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn replace_route_via_dev(
    _dest: std::net::IpAddr,
    _prefix_len: u8,
    _dev_name: &str,
    _src: Option<std::net::IpAddr>,
) -> Result<(), NetlinkError> {
    let reason = String::from("replace_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
815
816/// Set a sysctl via the `/proc/sys/...` filesystem.
817///
818/// `key` uses dotted form like `net.ipv4.ip_forward`; dots are
819/// translated to path separators so the effective path is
820/// `/proc/sys/net/ipv4/ip_forward`. Writes the string form of
821/// `value` to the file.
822///
823/// Replaces the shell-outs:
824///   sysctl -w `<key>`=`<value>`
825///
826/// Writing to `/proc/sys/...` is the kernel-standard way of setting
827/// sysctls and works under any confinement that still allows write
828/// access to `/proc/sys` (which the overlay manager needs anyway for
829/// its other operations).
830///
831/// # Errors
832///
833/// Returns [`NetlinkError::Io`] if the write fails (e.g. permission
834/// denied, file missing because the sysctl doesn't exist on this
835/// kernel, etc.).
836pub fn set_sysctl(key: &str, value: &str) -> Result<(), NetlinkError> {
837    let path = format!("/proc/sys/{}", key.replace('.', "/"));
838    std::fs::write(&path, value)?;
839    Ok(())
840}
841
842/// Run a synchronous closure inside the network namespace referenced
843/// by the given `OwnedFd`.
844///
845/// This is the fd-based variant of [`with_netns`]. Callers that have
846/// already opened `/proc/<pid>/ns/net` (e.g. to pin the namespace
847/// across multiple operations) should use this form to reuse the
848/// same fd and avoid re-opening the procfs path — the reopen would
849/// fail with `ENOENT` if the container init process has exited in
850/// the meantime, even though the namespace itself is still alive
851/// because our pinned fd holds a reference.
852///
853/// The `OwnedFd` is moved into the dedicated worker thread and
854/// closed when the thread exits. Spawns a fresh OS thread (not a
855/// tokio blocking worker) because `setns` affects the whole thread
856/// and we don't want to contaminate a shared worker.
857///
858/// # Errors
859///
860/// Returns [`NetlinkError::Netlink`] if `setns` fails or the
861/// dedicated thread panics. Any error returned by the closure itself
862/// is propagated verbatim.
863#[cfg(target_os = "linux")]
864pub fn with_netns_fd<F, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
865where
866    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
867    T: Send + 'static,
868{
869    let join_handle = std::thread::spawn(move || -> Result<T, NetlinkError> {
870        nix::sched::setns(&ns_fd, nix::sched::CloneFlags::CLONE_NEWNET)
871            .map_err(|e| NetlinkError::Netlink(format!("setns(ns_fd) failed: {e}")))?;
872        // Keep the fd alive for the duration of the closure even
873        // though setns only needs it for the syscall itself. Dropping
874        // it explicitly after the closure makes the lifetime obvious.
875        let result = f();
876        drop(ns_fd);
877        result
878    });
879
880    join_handle
881        .join()
882        .map_err(|_| NetlinkError::Netlink("with_netns_fd thread panicked".to_string()))?
883}
884
/// Placeholder for Unix targets other than Linux (macOS/BSD).
///
/// Deliberately absent on Windows: the only caller chain
/// (`attach_to_interface` in `overlay_manager.rs`) is gated on
/// `#[cfg(target_os = "linux")]`, and `OwnedFd` exists only on Unix.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; neither the fd nor the closure
/// is used.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd<F, T>(_ns_fd: std::os::fd::OwnedFd, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = String::from("with_netns_fd is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
898
899/// Run a synchronous closure inside the network namespace of the
900/// given PID.
901///
902/// Thin wrapper around [`with_netns_fd`] that opens
903/// `/proc/<target_pid>/ns/net` then delegates. Kept for backward
904/// compatibility and for callers that only need a single operation
905/// on the target netns. Callers that need to pin the namespace
906/// across multiple operations (and survive a racing exit of the
907/// container init) should open the fd themselves and call
908/// [`with_netns_fd`] directly.
909///
910/// Because `setns` is synchronous and `rtnetlink` is async, the
911/// typical usage pattern inside the closure is to build a local
912/// current-thread tokio runtime and `block_on` the netlink calls.
913/// See [`with_netns_async`] for a convenience wrapper that does
914/// exactly this.
915///
916/// # Errors
917///
918/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot
919/// be opened. Returns [`NetlinkError::Netlink`] if `setns` fails or
920/// the dedicated thread panics. Any error returned by the closure
921/// itself is propagated verbatim.
922#[cfg(target_os = "linux")]
923pub fn with_netns<F, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
924where
925    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
926    T: Send + 'static,
927{
928    use std::os::fd::OwnedFd;
929
930    let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
931    let ns_fd: OwnedFd = OwnedFd::from(ns_file);
932    with_netns_fd(ns_fd, f)
933}
934
/// Fallback compiled on every target other than Linux.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the PID and the closure are
/// ignored.
#[cfg(not(target_os = "linux"))]
pub fn with_netns<F, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = String::from("with_netns is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
946
947/// Convenience wrapper around [`with_netns_fd`] that builds a local
948/// current-thread tokio runtime inside the dedicated thread and
949/// drives the provided async future to completion.
950///
951/// The future is produced by calling `f()` from inside the thread
952/// that has already joined the target netns, so any rtnetlink
953/// operations awaited inside the future will talk to the target
954/// netns's kernel.
955///
956/// The local runtime is lightweight (single-thread, built per call)
957/// and only drives a handful of netlink messages before being
958/// dropped with the thread.
959///
960/// The `OwnedFd` is moved into the worker thread and closed when
961/// the thread exits.
962///
963/// # Errors
964///
965/// Returns [`NetlinkError::Netlink`] per [`with_netns_fd`], plus
966/// [`NetlinkError::Netlink`] if the local runtime fails to build.
967/// Any error returned by the future is propagated verbatim.
968#[cfg(target_os = "linux")]
969pub fn with_netns_fd_async<F, Fut, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
970where
971    F: FnOnce() -> Fut + Send + 'static,
972    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
973    T: Send + 'static,
974{
975    with_netns_fd(ns_fd, move || {
976        let rt = tokio::runtime::Builder::new_current_thread()
977            .enable_all()
978            .build()
979            .map_err(|e| NetlinkError::Netlink(format!("local runtime build failed: {e}")))?;
980        rt.block_on(f())
981    })
982}
983
/// Placeholder for Unix targets other than Linux (macOS/BSD).
///
/// Deliberately absent on Windows: the only caller chain
/// (`attach_to_interface` in `overlay_manager.rs`) is gated on
/// `#[cfg(target_os = "linux")]`, and `OwnedFd` exists only on Unix.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; neither the fd nor the closure
/// is used.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd_async<F, Fut, T>(
    _ns_fd: std::os::fd::OwnedFd,
    _f: F,
) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = String::from("with_netns_fd_async is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1001
1002/// Convenience wrapper around [`with_netns`] that builds a local
1003/// current-thread tokio runtime inside the dedicated thread and
1004/// drives the provided async future to completion.
1005///
1006/// Thin wrapper around [`with_netns_fd_async`] that opens
1007/// `/proc/<target_pid>/ns/net` then delegates.
1008///
1009/// # Errors
1010///
1011/// Returns [`NetlinkError::Io`] / [`NetlinkError::Netlink`] per
1012/// [`with_netns`], plus [`NetlinkError::Netlink`] if the local
1013/// runtime fails to build. Any error returned by the future is
1014/// propagated verbatim.
1015#[cfg(target_os = "linux")]
1016pub fn with_netns_async<F, Fut, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
1017where
1018    F: FnOnce() -> Fut + Send + 'static,
1019    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
1020    T: Send + 'static,
1021{
1022    use std::os::fd::OwnedFd;
1023
1024    let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
1025    let ns_fd: OwnedFd = OwnedFd::from(ns_file);
1026    with_netns_fd_async(ns_fd, f)
1027}
1028
/// Fallback compiled on every target other than Linux.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the PID and the closure are
/// ignored.
#[cfg(not(target_os = "linux"))]
pub fn with_netns_async<F, Fut, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = String::from("with_netns_async is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1041
1042/// Create a Linux bridge interface with the given name.
1043///
1044/// Replaces the shell-out:
1045///   ip link add name `<name>` type bridge
1046///
1047/// Idempotent: if a link with that name already exists this returns
1048/// `Ok(())`. This matches how the overlay manager's per-service bridge
1049/// creation path needs to behave — multiple containers landing on the
1050/// same service-on-node bridge must all see "bridge ready" after a
1051/// successful call without racing against existence checks.
1052///
1053/// The bridge is created in the current network namespace. Callers
1054/// that need a different netns should wrap with [`with_netns_async`].
1055/// The bridge is created in the administratively-down state — call
1056/// [`set_link_up_by_name`] separately once any other attributes
1057/// ([`set_bridge_stp`] etc.) have been applied.
1058///
1059/// # Errors
1060///
1061/// Returns [`NetlinkError::Netlink`] for any RTNETLINK failure other
1062/// than `EEXIST` (which is treated as success).
1063#[cfg(target_os = "linux")]
1064pub async fn create_bridge(name: &str) -> Result<(), NetlinkError> {
1065    let (connection, handle, _) = rtnetlink::new_connection()
1066        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
1067    tokio::spawn(connection);
1068
1069    match handle.link().add().bridge(name.to_string()).execute().await {
1070        Ok(()) => Ok(()),
1071        Err(rtnetlink::Error::NetlinkError(err)) => {
1072            // EEXIST means a link with this name already exists. We
1073            // intentionally do NOT verify that the existing link is
1074            // actually a bridge — callers using stable per-service
1075            // names own that invariant, and re-checking here would
1076            // require another rtnetlink round-trip on the hot path.
1077            let is_eexist = err
1078                .code
1079                .is_some_and(|c| c.get().unsigned_abs() == libc::EEXIST as u32);
1080            let msg = err.to_string();
1081            if is_eexist || msg.contains("File exists") {
1082                Ok(())
1083            } else {
1084                Err(NetlinkError::Netlink(format!(
1085                    "bridge create failed for {name}: {msg}"
1086                )))
1087            }
1088        }
1089        Err(e) => {
1090            let msg = e.to_string();
1091            if msg.contains("File exists") {
1092                Ok(())
1093            } else {
1094                Err(NetlinkError::Netlink(format!(
1095                    "bridge create failed for {name}: {msg}"
1096                )))
1097            }
1098        }
1099    }
1100}
1101
/// Fallback compiled on every target other than Linux.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the name is ignored.
#[cfg(not(target_os = "linux"))]
pub async fn create_bridge(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("create_bridge is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1109
1110/// Delete the bridge interface with the given name.
1111///
1112/// Replaces the shell-out:
1113///   ip link delete `<name>` type bridge
1114///
1115/// Idempotent: returns `Ok(())` if the bridge does not exist.
1116/// Delegates to [`delete_link_by_name`] — from RTNETLINK's perspective
1117/// deleting a bridge is the same `RTM_DELLINK` as deleting any other
1118/// link, and `delete_link_by_name` already has the ENODEV-as-success
1119/// handling we want.
1120///
1121/// # Errors
1122///
1123/// Returns [`NetlinkError::Netlink`] for any RTNETLINK failure other
1124/// than `ENODEV` (which is treated as success).
1125#[cfg(target_os = "linux")]
1126pub async fn delete_bridge(name: &str) -> Result<(), NetlinkError> {
1127    delete_link_by_name(name).await
1128}
1129
/// Fallback compiled on every target other than Linux.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the name is ignored.
#[cfg(not(target_os = "linux"))]
pub async fn delete_bridge(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("delete_bridge is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1137
1138/// Attach `link` to `bridge` by setting the link's `IFLA_MASTER` to
1139/// the bridge's ifindex.
1140///
1141/// Replaces the shell-out:
1142///   ip link set `<link>` master `<bridge>`
1143///
1144/// Both interfaces must already exist in the current network
1145/// namespace. This is what the overlay manager will call to splice a
1146/// container's host-side veth end into the per-service bridge instead
1147/// of /32-routing it directly.
1148///
1149/// # Errors
1150///
1151/// Returns [`NetlinkError::NotFound`] if either `link` or `bridge`
1152/// does not exist in the current netns. Returns
1153/// [`NetlinkError::Netlink`] for any other RTNETLINK failure.
1154#[cfg(target_os = "linux")]
1155pub async fn add_link_to_bridge(link: &str, bridge: &str) -> Result<(), NetlinkError> {
1156    use futures_util::stream::TryStreamExt;
1157
1158    let (connection, handle, _) = rtnetlink::new_connection()
1159        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
1160    tokio::spawn(connection);
1161
1162    let bridge_link = handle
1163        .link()
1164        .get()
1165        .match_name(bridge.to_string())
1166        .execute()
1167        .try_next()
1168        .await
1169        .map_err(|e| {
1170            let msg = e.to_string();
1171            if msg.contains("No such device") {
1172                NetlinkError::NotFound(bridge.to_string())
1173            } else {
1174                NetlinkError::Netlink(format!("link lookup failed for {bridge}: {msg}"))
1175            }
1176        })?
1177        .ok_or_else(|| NetlinkError::NotFound(bridge.to_string()))?;
1178    let bridge_idx = bridge_link.header.index;
1179
1180    let member_link = handle
1181        .link()
1182        .get()
1183        .match_name(link.to_string())
1184        .execute()
1185        .try_next()
1186        .await
1187        .map_err(|e| {
1188            let msg = e.to_string();
1189            if msg.contains("No such device") {
1190                NetlinkError::NotFound(link.to_string())
1191            } else {
1192                NetlinkError::Netlink(format!("link lookup failed for {link}: {msg}"))
1193            }
1194        })?
1195        .ok_or_else(|| NetlinkError::NotFound(link.to_string()))?;
1196    let member_idx = member_link.header.index;
1197
1198    handle
1199        .link()
1200        .set(member_idx)
1201        .controller(bridge_idx)
1202        .execute()
1203        .await
1204        .map_err(|e| {
1205            NetlinkError::Netlink(format!(
1206                "set master failed: link={link} bridge={bridge}: {e}"
1207            ))
1208        })
1209}
1210
/// Fallback compiled on every target other than Linux.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; both names are ignored.
#[cfg(not(target_os = "linux"))]
pub async fn add_link_to_bridge(_link: &str, _bridge: &str) -> Result<(), NetlinkError> {
    let reason = String::from("add_link_to_bridge is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1218
1219/// Enable or disable Spanning Tree Protocol (STP) on the named bridge.
1220///
1221/// STP is disabled by default on bridges created via [`create_bridge`]
1222/// (the kernel default for a freshly-created bridge is STP off), and
1223/// for `ZLayer`'s per-service bridges we want to keep it off: each
1224/// bridge is single-host, has no possibility of a loop, and STP's
1225/// initial 30s forwarding-delay would stall container traffic on
1226/// attach.
1227///
1228/// rtnetlink 0.14 does not expose a typed builder for `IFLA_BR_STP_STATE`
1229/// (it lives inside the nested `IFLA_LINKINFO` -> `IFLA_INFO_DATA` ->
1230/// `IFLA_BR_STP_STATE` attribute and the crate's bridge builder only
1231/// covers it at create-time, not as a post-create modification). The
1232/// portable kernel-supported alternative is the sysfs knob at
1233/// `/sys/class/net/<name>/bridge/stp_state`, which is what
1234/// `brctl stp <name> on|off` writes under the hood. We use the sysfs
1235/// path so the helper works on every kernel that has bridge support
1236/// without depending on an rtnetlink API surface that may move
1237/// between crate versions.
1238///
1239/// # Errors
1240///
1241/// Returns [`NetlinkError::NotFound`] if the bridge does not exist (no
1242/// `/sys/class/net/<name>/bridge` directory). Returns
1243/// [`NetlinkError::Io`] for any other write failure (permission
1244/// denied, the link exists but is not a bridge, etc.).
1245#[cfg(target_os = "linux")]
1246pub fn set_bridge_stp(name: &str, stp_on: bool) -> Result<(), NetlinkError> {
1247    let bridge_dir = format!("/sys/class/net/{name}/bridge");
1248    if !std::path::Path::new(&bridge_dir).exists() {
1249        return Err(NetlinkError::NotFound(name.to_string()));
1250    }
1251    let path = format!("{bridge_dir}/stp_state");
1252    let value = if stp_on { "1" } else { "0" };
1253    std::fs::write(&path, value)?;
1254    Ok(())
1255}
1256
/// Fallback compiled on every target other than Linux.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; both arguments are ignored.
#[cfg(not(target_os = "linux"))]
pub fn set_bridge_stp(_name: &str, _stp_on: bool) -> Result<(), NetlinkError> {
    let reason = String::from("set_bridge_stp is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
1264
// Every helper and test in this module is Linux-only (netlink plus
// CAP_NET_ADMIN), so the whole module is gated on Linux test builds. On
// Windows/macOS cross-checks nothing in here is compiled, which keeps the
// lib tests building there.
#[cfg(all(test, target_os = "linux"))]
mod tests {
    use super::*;

    /// Produce a 6-character, base-36-style suffix derived from the
    /// sub-second nanoseconds of the current time, so that parallel
    /// `cargo test` invocations are unlikely to pick the same interface
    /// name. Six characters keep `zlb-`/`zld-` + suffix below the 15-char
    /// `IFNAMSIZ` limit. Falls back to all zeros if the clock reads before
    /// the Unix epoch.
    fn rand_suffix() -> String {
        use std::time::{SystemTime, UNIX_EPOCH};
        const CHARS: &[u8] = b"0123456789abcdefghijklmnopqrstuvwxyz";

        let base = CHARS.len() as u64;
        let mut remaining = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map_or(0, |elapsed| u64::from(elapsed.subsec_nanos()));

        // Emit digits least-significant first, exactly six of them.
        std::iter::repeat_with(|| {
            let digit = usize::try_from(remaining % base).unwrap_or(0);
            remaining /= base;
            char::from(CHARS[digit])
        })
        .take(6)
        .collect()
    }

    /// Report whether sysfs currently lists an interface called `name`.
    fn link_in_sysfs(name: &str) -> bool {
        std::path::Path::new(&format!("/sys/class/net/{name}")).exists()
    }

    /// Add a dummy interface called `name`; it plays the role of a
    /// host-side veth end in `bridge_add_link_membership`.
    async fn create_dummy(name: &str) -> Result<(), NetlinkError> {
        let (connection, handle, _) = rtnetlink::new_connection()
            .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
        tokio::spawn(connection);

        let created = handle.link().add().dummy(name.to_string()).execute().await;
        created.map_err(|e| NetlinkError::Netlink(format!("dummy create failed for {name}: {e}")))
    }

    /// Creating twice and deleting twice must all succeed, and sysfs must
    /// reflect the bridge appearing and disappearing.
    #[tokio::test]
    #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
    async fn bridge_create_idempotent() {
        let name = format!("zlb-{}", rand_suffix());
        assert!(name.len() <= 15, "interface name exceeds IFNAMSIZ: {name}");

        // Initial creation makes the link visible in sysfs.
        create_bridge(&name).await.expect("first create_bridge");
        assert!(
            link_in_sysfs(&name),
            "bridge {name} should exist after create"
        );

        // Creating the same name again is not an error.
        create_bridge(&name)
            .await
            .expect("second create_bridge should be idempotent");

        // Deletion removes it from sysfs.
        delete_bridge(&name).await.expect("delete_bridge");
        assert!(
            !link_in_sysfs(&name),
            "bridge {name} should be gone after delete"
        );

        // Deleting a name that no longer exists is not an error either.
        delete_bridge(&name)
            .await
            .expect("second delete_bridge should be idempotent");
    }

    /// After `add_link_to_bridge`, the member's master ifindex in sysfs
    /// must equal the bridge's own ifindex.
    #[tokio::test]
    #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
    async fn bridge_add_link_membership() {
        let suffix = rand_suffix();
        let bridge = format!("zlb-{suffix}");
        let dummy = format!("zld-{suffix}");
        assert!(bridge.len() <= 15);
        assert!(dummy.len() <= 15);

        create_bridge(&bridge).await.expect("create_bridge");
        create_dummy(&dummy).await.expect("create_dummy");

        add_link_to_bridge(&dummy, &bridge)
            .await
            .expect("add_link_to_bridge");

        // `<dummy>/master` points at the controlling device, so its
        // `ifindex` file holds the bridge's index once enslaved.
        let master_raw = std::fs::read_to_string(format!("/sys/class/net/{dummy}/master/ifindex"))
            .expect("read dummy master ifindex");
        let dummy_master_ifindex: u32 = master_raw
            .trim()
            .parse()
            .expect("parse dummy master ifindex");

        let bridge_raw = std::fs::read_to_string(format!("/sys/class/net/{bridge}/ifindex"))
            .expect("read bridge ifindex");
        let bridge_ifindex: u32 = bridge_raw.trim().parse().expect("parse bridge ifindex");

        assert_eq!(
            dummy_master_ifindex, bridge_ifindex,
            "dummy's master ifindex should equal bridge's ifindex"
        );

        // Tear down: member first, then the bridge.
        delete_link_by_name(&dummy).await.expect("delete dummy");
        delete_bridge(&bridge).await.expect("delete bridge");
    }

    /// `set_bridge_stp(false)` must leave `stp_state` reading `0`.
    #[tokio::test]
    #[ignore = "requires CAP_NET_ADMIN; run manually or in privileged CI"]
    async fn bridge_stp_off() {
        let name = format!("zlb-{}", rand_suffix());
        assert!(name.len() <= 15);

        create_bridge(&name).await.expect("create_bridge");

        set_bridge_stp(&name, false).expect("set_bridge_stp off");
        let stp_raw = std::fs::read_to_string(format!("/sys/class/net/{name}/bridge/stp_state"))
            .expect("read stp_state");
        let stp_state = stp_raw.trim().to_string();
        assert_eq!(
            stp_state, "0",
            "stp_state should be 0 after set_bridge_stp(false)"
        );

        // Tear down.
        delete_bridge(&name).await.expect("delete_bridge");
    }
}