Skip to main content

zlayer_agent/
netlink.rs

1//! Rust netlink helpers that replace shell-outs to `ip`/`nsenter`/`sysctl`
2//! for per-container overlay network setup.
3//!
4//! This module is populated incrementally through a phased migration.
5//! Stage 1: `move_link_into_netns_and_rename` replaces the shell pair
6//!          `ip link set <name> netns <pid>` + `nsenter -t <pid> -n ip
7//!          link set <name> name <new>` with a single atomic RTNETLINK
8//!          `SetLink` carrying both `IFLA_NET_NS_FD` and `IFLA_IFNAME`.
9//!          This bypasses the `/proc/<pid>/ns/net` access problem caused
10//!          by libcontainer setting `PR_SET_DUMPABLE(false)` on the
11//!          container init process under `SELinux` enforcing.
12//! Stage 2: `create_veth_pair`, `delete_link_by_name`, and
13//!          `set_link_up_by_name` replace the host-side veth shell
14//!          commands (`ip link add ... type veth peer name ...`,
15//!          `ip link delete ...`, `ip link set ... up`) used by
16//!          `overlay_manager::attach_to_interface` and the orphan
17//!          sweeper. These helpers talk RTNETLINK directly via the
18//!          `rtnetlink` crate (async, tokio-backed).
19//! Stage 3: `with_netns`, `add_address_to_link_by_name`, and
20//!          `add_default_route_via_dev` replace the remaining
21//!          container-netns shell-outs in
22//!          `overlay_manager::attach_to_interface`. `with_netns`
23//!          runs a closure on a dedicated OS thread that has joined
24//!          the target container's network namespace via `setns(2)`,
25//!          while the two new RTNETLINK helpers operate on the
26//!          current netns (so they must be invoked from inside a
27//!          `with_netns` closure). This removes the last three
28//!          `nsenter -t <pid> -n ip ...` shell-outs used to assign
29//!          the container IP, bring `eth0` / `lo` up, and add the
30//!          default route.
31
32#![cfg_attr(
33    not(target_os = "linux"),
34    allow(clippy::missing_errors_doc, clippy::unused_async)
35)]
36
37use thiserror::Error;
38
39/// Errors returned by the netlink helpers in this module.
40#[derive(Debug, Error)]
41pub enum NetlinkError {
42    /// Failed to open or access a file (typically `/proc/<pid>/ns/net`).
43    #[error("io error: {0}")]
44    Io(#[from] std::io::Error),
45
46    /// The requested link was not found in the current network namespace.
47    #[error("link '{0}' not found in current netns")]
48    NotFound(String),
49
50    /// A netlink operation failed.
51    #[error("netlink operation failed: {0}")]
52    Netlink(String),
53}
54
/// Relocate `link_name` from the current network namespace into the
/// namespace behind `ns_fd`, giving it the name `new_name` as part of
/// the same request.
///
/// Fd-based counterpart of [`move_link_into_netns_and_rename`]. Use it
/// when the caller already holds an open `/proc/<pid>/ns/net` handle
/// (for example to pin the namespace across several operations while
/// the container init may be exiting), so the procfs path is never
/// reopened.
///
/// Both `IFLA_NET_NS_FD` and `IFLA_IFNAME` travel in one RTNETLINK
/// `SetLink` message, which makes the move and the rename atomic from
/// the kernel's point of view.
///
/// # Errors
///
/// * [`NetlinkError::NotFound`] when `link_name` is absent from the
///   current netns.
/// * [`NetlinkError::Netlink`] for every other netlink-level failure
///   (permission denied, name collision in the target netns, ...).
#[cfg(all(target_os = "linux", feature = "youki-runtime"))]
pub fn move_link_into_netns_fd_and_rename(
    link_name: &str,
    ns_fd: std::os::fd::BorrowedFd<'_>,
    new_name: &str,
) -> Result<(), NetlinkError> {
    use std::os::fd::AsRawFd;

    use libcontainer::network::link::LinkClient;
    use libcontainer::network::wrapper::create_network_client;

    // The client wraps the real rtnetlink socket. When the socket could
    // not be initialised libcontainer keeps an error state around, and
    // that state is reported through `NetlinkError::Netlink` here.
    let mut links = LinkClient::new(create_network_client())
        .map_err(|e| NetlinkError::Netlink(format!("failed to create LinkClient: {e}")))?;

    // Look up the host-side interface. libcontainer's error type offers
    // no matchable kind, so a missing device is recognised by its text
    // and mapped to `NotFound`, letting callers tell "nothing to move"
    // apart from genuine failures.
    let host_link = match links.get_by_name(link_name) {
        Ok(found) => found,
        Err(err) => {
            let text = err.to_string();
            let missing = text.contains("No such device") || text.contains("not found");
            return Err(if missing {
                NetlinkError::NotFound(link_name.to_string())
            } else {
                NetlinkError::Netlink(format!("get_by_name({link_name}) failed: {text}"))
            });
        }
    };
    let if_index = host_link.header.index;

    // One request moves and renames. `ns_fd` stays owned by the caller;
    // only its raw descriptor number is handed over for this call.
    match links.set_ns_fd(if_index, new_name, ns_fd.as_raw_fd()) {
        Ok(_) => Ok(()),
        Err(err) => Err(NetlinkError::Netlink(format!(
            "set_ns_fd(index={if_index}, new_name={new_name}) failed: {err}"
        ))),
    }
}
125
126/// Stub for non-Linux Unix platforms (macOS/BSD) and for Linux builds without
127/// the `youki-runtime` feature (which provides the libcontainer-backed impl).
128///
129/// Not emitted on Windows: `attach_to_interface` (the sole caller) is itself
130/// gated `#[cfg(target_os = "linux")]` in `overlay_manager.rs`, so there are
131/// no Windows callers, and the `BorrowedFd` parameter type is Unix-only.
132///
133/// # Errors
134///
135/// Always returns [`NetlinkError::Netlink`] — this function is unsupported on
136/// the current target/feature combination.
137#[cfg(any(
138    all(not(target_os = "linux"), unix),
139    all(target_os = "linux", not(feature = "youki-runtime")),
140))]
141pub fn move_link_into_netns_fd_and_rename(
142    _link_name: &str,
143    _ns_fd: std::os::fd::BorrowedFd<'_>,
144    _new_name: &str,
145) -> Result<(), NetlinkError> {
146    Err(NetlinkError::Netlink(
147        "move_link_into_netns_fd_and_rename requires Linux with the 'youki-runtime' feature"
148            .to_string(),
149    ))
150}
151
152/// Move a link from the current network namespace into the target PID's
153/// network namespace, renaming it in the same atomic operation.
154///
155/// Thin wrapper around [`move_link_into_netns_fd_and_rename`] that
156/// opens `/proc/<target_pid>/ns/net` then delegates. Kept for
157/// backward compatibility and for callers that only need a single
158/// operation on the target netns. Callers that need to perform
159/// multiple operations on the same netns (and want to survive a
160/// racing exit of the container init process) should open the fd
161/// themselves and call [`move_link_into_netns_fd_and_rename`]
162/// directly.
163///
164/// # Errors
165///
166/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot be
167/// opened (e.g. the container process is gone or is not dumpable and we
168/// lack `CAP_SYS_PTRACE`). Returns [`NetlinkError::NotFound`] if
169/// `link_name` does not exist in the current netns. Returns
170/// [`NetlinkError::Netlink`] for any other netlink-level failure
171/// (permission denied, name collision in the target netns, etc.).
172#[cfg(target_os = "linux")]
173pub fn move_link_into_netns_and_rename(
174    link_name: &str,
175    target_pid: u32,
176    new_name: &str,
177) -> Result<(), NetlinkError> {
178    use std::os::fd::{AsFd, OwnedFd};
179
180    let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
181    let ns_fd: OwnedFd = OwnedFd::from(ns_file);
182    move_link_into_netns_fd_and_rename(link_name, ns_fd.as_fd(), new_name)
183}
184
/// Non-Linux placeholder. The overlay manager never reaches this on
/// such platforms (libcontainer is a Linux-only dependency); the
/// signature exists so `overlay_manager.rs` can stay platform-agnostic.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub fn move_link_into_netns_and_rename(
    _link_name: &str,
    _target_pid: u32,
    _new_name: &str,
) -> Result<(), NetlinkError> {
    let reason = String::from("move_link_into_netns_and_rename is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
198
199/// Create a veth pair with the two ends named `host_name` and `peer_name`.
200///
201/// Both ends start in the current network namespace. The caller is
202/// responsible for moving the peer end into the container netns (see
203/// [`move_link_into_netns_and_rename`]) and bringing the host end up
204/// (see [`set_link_up_by_name`]).
205///
206/// Replaces the shell-out:
207///   ip link add `<host_name>` type veth peer name `<peer_name>`
208///
209/// # Errors
210///
211/// Returns [`NetlinkError::Netlink`] if RTNETLINK fails for any
212/// reason. `EEXIST` / "File exists" is surfaced verbatim so the caller
213/// can distinguish a leaked endpoint (typically a sign the orphan
214/// sweeper missed something) from a permission or interface-name
215/// problem.
216#[cfg(target_os = "linux")]
217pub async fn create_veth_pair(host_name: &str, peer_name: &str) -> Result<(), NetlinkError> {
218    let (connection, handle, _) = rtnetlink::new_connection()
219        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
220    tokio::spawn(connection);
221
222    handle
223        .link()
224        .add()
225        .veth(host_name.to_string(), peer_name.to_string())
226        .execute()
227        .await
228        .map_err(|e| {
229            let msg = e.to_string();
230            if msg.contains("File exists") || msg.contains("EEXIST") {
231                NetlinkError::Netlink(format!(
232                    "veth pair already exists: host={host_name} peer={peer_name}: {msg}"
233                ))
234            } else {
235                NetlinkError::Netlink(format!(
236                    "veth create failed (host={host_name}, peer={peer_name}): {msg}"
237                ))
238            }
239        })
240}
241
/// Non-Linux placeholder for `create_veth_pair`.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn create_veth_pair(_host_name: &str, _peer_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("create_veth_pair is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
249
250/// Delete the link by name. Idempotent: returns `Ok(())` if the link
251/// does not exist. Any other error surfaces as
252/// [`NetlinkError::Netlink`].
253///
254/// Replaces the shell-out:
255///   ip link delete `<name>`
256///
257/// Used in `overlay_manager::attach_to_interface` pre-cleanup,
258/// cleanup-on-error, and the orphan-veth sweeper.
259///
260/// # Errors
261///
262/// Returns [`NetlinkError::Netlink`] if RTNETLINK reports a failure
263/// other than `ENODEV` / "No such device" (which are treated as
264/// success so this is safe to call unconditionally).
265#[cfg(target_os = "linux")]
266pub async fn delete_link_by_name(name: &str) -> Result<(), NetlinkError> {
267    use futures_util::stream::TryStreamExt;
268
269    let (connection, handle, _) = rtnetlink::new_connection()
270        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
271    tokio::spawn(connection);
272
273    // Look up the link by name. Treat "not found" as success so the
274    // helper is safe to call unconditionally in cleanup paths.
275    let lookup = handle
276        .link()
277        .get()
278        .match_name(name.to_string())
279        .execute()
280        .try_next()
281        .await;
282
283    let link = match lookup {
284        Ok(Some(link)) => link,
285        Ok(None) => return Ok(()),
286        Err(rtnetlink::Error::NetlinkError(err)) => {
287            // libc::ENODEV == 19. netlink-packet-core reports the raw
288            // errno as a negative i32 in `code`, but the exact type has
289            // moved between versions, so match by both numeric code and
290            // the human-readable message for belt-and-suspenders safety.
291            let msg = err.to_string();
292            let is_enodev = err
293                .code
294                .is_some_and(|c| c.get().unsigned_abs() == libc::ENODEV as u32);
295            if is_enodev || msg.contains("No such device") {
296                return Ok(());
297            }
298            return Err(NetlinkError::Netlink(format!(
299                "link lookup failed for {name}: {msg}"
300            )));
301        }
302        Err(e) => {
303            let msg = e.to_string();
304            if msg.contains("No such device") {
305                return Ok(());
306            }
307            return Err(NetlinkError::Netlink(format!(
308                "link lookup failed for {name}: {msg}"
309            )));
310        }
311    };
312
313    let index = link.header.index;
314
315    handle
316        .link()
317        .del(index)
318        .execute()
319        .await
320        .map_err(|e| NetlinkError::Netlink(format!("link delete failed for {name}: {e}")))
321}
322
/// Non-Linux placeholder for `delete_link_by_name`.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn delete_link_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("delete_link_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
330
331/// List all network interfaces in the current netns.
332///
333/// Returns a `Vec` of `(index, name)` tuples for every link the kernel
334/// reports. Used by the orphan veth sweeper to find `veth-<pid>` and
335/// `vc-<pid>` links whose owning PID is dead, so it can clean them up
336/// via [`delete_link_by_name`].
337///
338/// Replaces the shell-out:
339///   ip -br link
340///
341/// Issues a single RTNETLINK `RTM_GETLINK` dump request and iterates
342/// the resulting stream of `LinkMessage`s. Each message contributes
343/// one `(index, name)` tuple; messages without an `IFLA_IFNAME`
344/// attribute (extremely rare in practice — the kernel always emits
345/// one for configured devices) are silently skipped.
346///
347/// # Errors
348///
349/// Returns [`NetlinkError::Netlink`] if the rtnetlink socket cannot
350/// be created or if the dump stream itself reports a failure.
351#[cfg(target_os = "linux")]
352pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
353    use futures_util::stream::TryStreamExt;
354    use netlink_packet_route::link::LinkAttribute;
355
356    let (connection, handle, _) = rtnetlink::new_connection()
357        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
358    tokio::spawn(connection);
359
360    let mut stream = handle.link().get().execute();
361    let mut links = Vec::new();
362
363    while let Some(msg) = stream
364        .try_next()
365        .await
366        .map_err(|e| NetlinkError::Netlink(format!("link dump failed: {e}")))?
367    {
368        // LinkHeader.index is already u32 in netlink-packet-route
369        // 0.19 — no cast needed.
370        let index = msg.header.index;
371        let Some(name) = msg.attributes.iter().find_map(|a| match a {
372            LinkAttribute::IfName(n) => Some(n.clone()),
373            _ => None,
374        }) else {
375            continue;
376        };
377        links.push((index, name));
378    }
379
380    Ok(links)
381}
382
/// Non-Linux placeholder for `list_all_links`.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
    let reason = String::from("list_all_links is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
390
391/// Set the link identified by `name` to the "up" administrative state.
392///
393/// Replaces the shell-out:
394///   ip link set `<name>` up
395///
396/// Unlike [`delete_link_by_name`] this is *not* idempotent for missing
397/// links: if the link does not exist the caller almost certainly has a
398/// bug upstream (we only call this on a veth end we just created), so
399/// we return [`NetlinkError::NotFound`] rather than silently succeeding.
400///
401/// # Errors
402///
403/// Returns [`NetlinkError::NotFound`] if no link with the given name
404/// exists in the current netns. Returns [`NetlinkError::Netlink`] for
405/// any other RTNETLINK failure (permission denied, etc.).
406#[cfg(target_os = "linux")]
407pub async fn set_link_up_by_name(name: &str) -> Result<(), NetlinkError> {
408    use futures_util::stream::TryStreamExt;
409
410    let (connection, handle, _) = rtnetlink::new_connection()
411        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
412    tokio::spawn(connection);
413
414    let link = handle
415        .link()
416        .get()
417        .match_name(name.to_string())
418        .execute()
419        .try_next()
420        .await
421        .map_err(|e| {
422            let msg = e.to_string();
423            if msg.contains("No such device") {
424                NetlinkError::NotFound(name.to_string())
425            } else {
426                NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
427            }
428        })?
429        .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
430
431    let index = link.header.index;
432
433    handle
434        .link()
435        .set(index)
436        .up()
437        .execute()
438        .await
439        .map_err(|e| NetlinkError::Netlink(format!("link set up failed for {name}: {e}")))
440}
441
/// Non-Linux placeholder for `set_link_up_by_name`.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn set_link_up_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = String::from("set_link_up_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
449
450/// Add an IP address to the link identified by `name` in the current
451/// network namespace.
452///
453/// Replaces (in combination with [`with_netns`]):
454///   nsenter -t `<pid>` -n ip \[-6\] addr add `<addr>/<prefix_len>` dev `<name>`
455///
456/// `addr` may be v4 or v6. `prefix_len` is the CIDR prefix length
457/// (24 for a `/24`, 64 for a `/64`, etc.).
458///
459/// This helper operates on the CURRENT network namespace — it looks
460/// up the interface index via a local rtnetlink socket. To target a
461/// container's netns, wrap the call inside [`with_netns`].
462///
463/// # Errors
464///
465/// Returns [`NetlinkError::NotFound`] if the link is missing. Returns
466/// [`NetlinkError::Netlink`] for any other rtnetlink failure
467/// (permission denied, EEXIST on a duplicate address, etc.).
468#[cfg(target_os = "linux")]
469pub async fn add_address_to_link_by_name(
470    name: &str,
471    addr: std::net::IpAddr,
472    prefix_len: u8,
473) -> Result<(), NetlinkError> {
474    use futures_util::stream::TryStreamExt;
475
476    let (connection, handle, _) = rtnetlink::new_connection()
477        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
478    tokio::spawn(connection);
479
480    let link = handle
481        .link()
482        .get()
483        .match_name(name.to_string())
484        .execute()
485        .try_next()
486        .await
487        .map_err(|e| {
488            let msg = e.to_string();
489            if msg.contains("No such device") {
490                NetlinkError::NotFound(name.to_string())
491            } else {
492                NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
493            }
494        })?
495        .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
496
497    let index = link.header.index;
498
499    handle
500        .address()
501        .add(index, addr, prefix_len)
502        .execute()
503        .await
504        .map_err(|e| {
505            NetlinkError::Netlink(format!(
506                "address add failed for {name} ({addr}/{prefix_len}): {e}"
507            ))
508        })
509}
510
/// Non-Linux placeholder for `add_address_to_link_by_name`.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_address_to_link_by_name(
    _name: &str,
    _addr: std::net::IpAddr,
    _prefix_len: u8,
) -> Result<(), NetlinkError> {
    let reason = String::from("add_address_to_link_by_name is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
522
523/// Add a default route via the given device name in the current
524/// network namespace.
525///
526/// Replaces (in combination with [`with_netns`]):
527///   nsenter -t `<pid>` -n ip \[-6\] route add default dev `<dev_name>`
528///
529/// The route is a direct, link-scope route: no gateway, the kernel
530/// ARPs / uses NDISC on the device for destination resolution. This
531/// is the correct form for a point-to-point veth link where the peer
532/// is reachable directly.
533///
534/// For IPv4 the destination prefix is `0.0.0.0/0`. For IPv6 it is
535/// `::/0`. Controlled by `is_v6`.
536///
537/// # Errors
538///
539/// Returns [`NetlinkError::NotFound`] if the device is missing.
540/// Returns [`NetlinkError::Netlink`] for any other rtnetlink failure.
541#[cfg(target_os = "linux")]
542pub async fn add_default_route_via_dev(dev_name: &str, is_v6: bool) -> Result<(), NetlinkError> {
543    use futures_util::stream::TryStreamExt;
544    use netlink_packet_route::route::RouteScope;
545
546    let (connection, handle, _) = rtnetlink::new_connection()
547        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
548    tokio::spawn(connection);
549
550    let link = handle
551        .link()
552        .get()
553        .match_name(dev_name.to_string())
554        .execute()
555        .try_next()
556        .await
557        .map_err(|e| {
558            let msg = e.to_string();
559            if msg.contains("No such device") {
560                NetlinkError::NotFound(dev_name.to_string())
561            } else {
562                NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
563            }
564        })?
565        .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
566
567    let oif_idx = link.header.index;
568
569    if is_v6 {
570        handle
571            .route()
572            .add()
573            .v6()
574            .destination_prefix(std::net::Ipv6Addr::UNSPECIFIED, 0)
575            .output_interface(oif_idx)
576            .scope(RouteScope::Link)
577            .execute()
578            .await
579            .map_err(|e| {
580                NetlinkError::Netlink(format!("default route add v6 via {dev_name} failed: {e}"))
581            })
582    } else {
583        handle
584            .route()
585            .add()
586            .v4()
587            .destination_prefix(std::net::Ipv4Addr::UNSPECIFIED, 0)
588            .output_interface(oif_idx)
589            .scope(RouteScope::Link)
590            .execute()
591            .await
592            .map_err(|e| {
593                NetlinkError::Netlink(format!("default route add v4 via {dev_name} failed: {e}"))
594            })
595    }
596}
597
/// Non-Linux placeholder for `add_default_route_via_dev`.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_default_route_via_dev(_dev_name: &str, _is_v6: bool) -> Result<(), NetlinkError> {
    let reason = String::from("add_default_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
605
606/// Add or replace a route to `dest/prefix_len` that forwards via the
607/// interface named `dev_name`. Optional `src` sets the preferred source
608/// address.
609///
610/// Replaces the shell-outs:
611///   ip route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
612///   ip -6 route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
613///
614/// Uses `NLM_F_REPLACE | NLM_F_CREATE` semantics (via rtnetlink's
615/// `.replace()` on the route add builder) so stale routes left behind
616/// by a previous daemon run don't cause `EEXIST`.
617///
618/// The route is installed with link scope (direct-via-dev, no
619/// gateway) which is the correct form for a per-container `/32` or
620/// `/128` pointing at a host-side veth endpoint.
621///
622/// `dest` and `src` (if provided) must have matching address families
623/// — passing a v4 `dest` with a v6 `src` returns
624/// [`NetlinkError::Netlink`] without touching the kernel.
625///
626/// # Errors
627///
628/// Returns [`NetlinkError::NotFound`] if `dev_name` does not exist in
629/// the current netns. Returns [`NetlinkError::Netlink`] on address
630/// family mismatch or any RTNETLINK failure.
631#[cfg(target_os = "linux")]
632pub async fn replace_route_via_dev(
633    dest: std::net::IpAddr,
634    prefix_len: u8,
635    dev_name: &str,
636    src: Option<std::net::IpAddr>,
637) -> Result<(), NetlinkError> {
638    use std::net::IpAddr;
639
640    use futures_util::stream::TryStreamExt;
641    use netlink_packet_route::route::RouteScope;
642
643    let (connection, handle, _) = rtnetlink::new_connection()
644        .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
645    tokio::spawn(connection);
646
647    let link = handle
648        .link()
649        .get()
650        .match_name(dev_name.to_string())
651        .execute()
652        .try_next()
653        .await
654        .map_err(|e| {
655            let msg = e.to_string();
656            if msg.contains("No such device") {
657                NetlinkError::NotFound(dev_name.to_string())
658            } else {
659                NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
660            }
661        })?
662        .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
663
664    let oif_idx = link.header.index;
665
666    match (dest, src) {
667        (IpAddr::V4(d), Some(IpAddr::V4(s))) => handle
668            .route()
669            .add()
670            .v4()
671            .destination_prefix(d, prefix_len)
672            .output_interface(oif_idx)
673            .scope(RouteScope::Link)
674            .pref_source(s)
675            .replace()
676            .execute()
677            .await
678            .map_err(|e| {
679                NetlinkError::Netlink(format!(
680                    "route replace v4 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
681                ))
682            }),
683        (IpAddr::V4(d), None) => handle
684            .route()
685            .add()
686            .v4()
687            .destination_prefix(d, prefix_len)
688            .output_interface(oif_idx)
689            .scope(RouteScope::Link)
690            .replace()
691            .execute()
692            .await
693            .map_err(|e| {
694                NetlinkError::Netlink(format!(
695                    "route replace v4 {d}/{prefix_len} dev {dev_name} failed: {e}"
696                ))
697            }),
698        (IpAddr::V6(d), Some(IpAddr::V6(s))) => handle
699            .route()
700            .add()
701            .v6()
702            .destination_prefix(d, prefix_len)
703            .output_interface(oif_idx)
704            .scope(RouteScope::Link)
705            .pref_source(s)
706            .replace()
707            .execute()
708            .await
709            .map_err(|e| {
710                NetlinkError::Netlink(format!(
711                    "route replace v6 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
712                ))
713            }),
714        (IpAddr::V6(d), None) => handle
715            .route()
716            .add()
717            .v6()
718            .destination_prefix(d, prefix_len)
719            .output_interface(oif_idx)
720            .scope(RouteScope::Link)
721            .replace()
722            .execute()
723            .await
724            .map_err(|e| {
725                NetlinkError::Netlink(format!(
726                    "route replace v6 {d}/{prefix_len} dev {dev_name} failed: {e}"
727                ))
728            }),
729        (IpAddr::V4(_), Some(IpAddr::V6(_))) | (IpAddr::V6(_), Some(IpAddr::V4(_))) => Err(
730            NetlinkError::Netlink(format!("address family mismatch: dest={dest} src={src:?}")),
731        ),
732    }
733}
734
/// Non-Linux placeholder for `replace_route_via_dev`.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn replace_route_via_dev(
    _dest: std::net::IpAddr,
    _prefix_len: u8,
    _dev_name: &str,
    _src: Option<std::net::IpAddr>,
) -> Result<(), NetlinkError> {
    let reason = String::from("replace_route_via_dev is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
747
748/// Set a sysctl via the `/proc/sys/...` filesystem.
749///
750/// `key` uses dotted form like `net.ipv4.ip_forward`; dots are
751/// translated to path separators so the effective path is
752/// `/proc/sys/net/ipv4/ip_forward`. Writes the string form of
753/// `value` to the file.
754///
755/// Replaces the shell-outs:
756///   sysctl -w `<key>`=`<value>`
757///
758/// Writing to `/proc/sys/...` is the kernel-standard way of setting
759/// sysctls and works under any confinement that still allows write
760/// access to `/proc/sys` (which the overlay manager needs anyway for
761/// its other operations).
762///
763/// # Errors
764///
765/// Returns [`NetlinkError::Io`] if the write fails (e.g. permission
766/// denied, file missing because the sysctl doesn't exist on this
767/// kernel, etc.).
768pub fn set_sysctl(key: &str, value: &str) -> Result<(), NetlinkError> {
769    let path = format!("/proc/sys/{}", key.replace('.', "/"));
770    std::fs::write(&path, value)?;
771    Ok(())
772}
773
774/// Run a synchronous closure inside the network namespace referenced
775/// by the given `OwnedFd`.
776///
777/// This is the fd-based variant of [`with_netns`]. Callers that have
778/// already opened `/proc/<pid>/ns/net` (e.g. to pin the namespace
779/// across multiple operations) should use this form to reuse the
780/// same fd and avoid re-opening the procfs path — the reopen would
781/// fail with `ENOENT` if the container init process has exited in
782/// the meantime, even though the namespace itself is still alive
783/// because our pinned fd holds a reference.
784///
785/// The `OwnedFd` is moved into the dedicated worker thread and
786/// closed when the thread exits. Spawns a fresh OS thread (not a
787/// tokio blocking worker) because `setns` affects the whole thread
788/// and we don't want to contaminate a shared worker.
789///
790/// # Errors
791///
792/// Returns [`NetlinkError::Netlink`] if `setns` fails or the
793/// dedicated thread panics. Any error returned by the closure itself
794/// is propagated verbatim.
795#[cfg(target_os = "linux")]
796pub fn with_netns_fd<F, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
797where
798    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
799    T: Send + 'static,
800{
801    let join_handle = std::thread::spawn(move || -> Result<T, NetlinkError> {
802        nix::sched::setns(&ns_fd, nix::sched::CloneFlags::CLONE_NEWNET)
803            .map_err(|e| NetlinkError::Netlink(format!("setns(ns_fd) failed: {e}")))?;
804        // Keep the fd alive for the duration of the closure even
805        // though setns only needs it for the syscall itself. Dropping
806        // it explicitly after the closure makes the lifetime obvious.
807        let result = f();
808        drop(ns_fd);
809        result
810    });
811
812    join_handle
813        .join()
814        .map_err(|_| NetlinkError::Netlink("with_netns_fd thread panicked".to_string()))?
815}
816
/// Stub for Unix targets other than Linux (macOS/BSD): always fails.
///
/// Windows gets no definition at all — `OwnedFd` is Unix-only, and the
/// sole caller chain (`attach_to_interface` in `overlay_manager.rs`) is
/// `#[cfg(target_os = "linux")]`-gated.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the closure is never run.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd<F, T>(_ns_fd: std::os::fd::OwnedFd, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = String::from("with_netns_fd is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
830
831/// Run a synchronous closure inside the network namespace of the
832/// given PID.
833///
834/// Thin wrapper around [`with_netns_fd`] that opens
835/// `/proc/<target_pid>/ns/net` then delegates. Kept for backward
836/// compatibility and for callers that only need a single operation
837/// on the target netns. Callers that need to pin the namespace
838/// across multiple operations (and survive a racing exit of the
839/// container init) should open the fd themselves and call
840/// [`with_netns_fd`] directly.
841///
842/// Because `setns` is synchronous and `rtnetlink` is async, the
843/// typical usage pattern inside the closure is to build a local
844/// current-thread tokio runtime and `block_on` the netlink calls.
845/// See [`with_netns_async`] for a convenience wrapper that does
846/// exactly this.
847///
848/// # Errors
849///
850/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot
851/// be opened. Returns [`NetlinkError::Netlink`] if `setns` fails or
852/// the dedicated thread panics. Any error returned by the closure
853/// itself is propagated verbatim.
854#[cfg(target_os = "linux")]
855pub fn with_netns<F, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
856where
857    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
858    T: Send + 'static,
859{
860    use std::os::fd::OwnedFd;
861
862    let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
863    let ns_fd: OwnedFd = OwnedFd::from(ns_file);
864    with_netns_fd(ns_fd, f)
865}
866
/// Stub for every target other than Linux: always fails.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; the closure is never run.
#[cfg(not(target_os = "linux"))]
pub fn with_netns<F, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = String::from("with_netns is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
878
879/// Convenience wrapper around [`with_netns_fd`] that builds a local
880/// current-thread tokio runtime inside the dedicated thread and
881/// drives the provided async future to completion.
882///
883/// The future is produced by calling `f()` from inside the thread
884/// that has already joined the target netns, so any rtnetlink
885/// operations awaited inside the future will talk to the target
886/// netns's kernel.
887///
888/// The local runtime is lightweight (single-thread, built per call)
889/// and only drives a handful of netlink messages before being
890/// dropped with the thread.
891///
892/// The `OwnedFd` is moved into the worker thread and closed when
893/// the thread exits.
894///
895/// # Errors
896///
897/// Returns [`NetlinkError::Netlink`] per [`with_netns_fd`], plus
898/// [`NetlinkError::Netlink`] if the local runtime fails to build.
899/// Any error returned by the future is propagated verbatim.
900#[cfg(target_os = "linux")]
901pub fn with_netns_fd_async<F, Fut, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
902where
903    F: FnOnce() -> Fut + Send + 'static,
904    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
905    T: Send + 'static,
906{
907    with_netns_fd(ns_fd, move || {
908        let rt = tokio::runtime::Builder::new_current_thread()
909            .enable_all()
910            .build()
911            .map_err(|e| NetlinkError::Netlink(format!("local runtime build failed: {e}")))?;
912        rt.block_on(f())
913    })
914}
915
/// Stub for Unix targets other than Linux (macOS/BSD): always fails.
///
/// Windows gets no definition at all — `OwnedFd` is Unix-only, and the
/// sole caller chain (`attach_to_interface` in `overlay_manager.rs`) is
/// `#[cfg(target_os = "linux")]`-gated.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; `_f` is never invoked.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd_async<F, Fut, T>(
    _ns_fd: std::os::fd::OwnedFd,
    _f: F,
) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = String::from("with_netns_fd_async is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}
933
934/// Convenience wrapper around [`with_netns`] that builds a local
935/// current-thread tokio runtime inside the dedicated thread and
936/// drives the provided async future to completion.
937///
938/// Thin wrapper around [`with_netns_fd_async`] that opens
939/// `/proc/<target_pid>/ns/net` then delegates.
940///
941/// # Errors
942///
943/// Returns [`NetlinkError::Io`] / [`NetlinkError::Netlink`] per
944/// [`with_netns`], plus [`NetlinkError::Netlink`] if the local
945/// runtime fails to build. Any error returned by the future is
946/// propagated verbatim.
947#[cfg(target_os = "linux")]
948pub fn with_netns_async<F, Fut, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
949where
950    F: FnOnce() -> Fut + Send + 'static,
951    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
952    T: Send + 'static,
953{
954    use std::os::fd::OwnedFd;
955
956    let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
957    let ns_fd: OwnedFd = OwnedFd::from(ns_file);
958    with_netns_fd_async(ns_fd, f)
959}
960
/// Stub for every target other than Linux: always fails.
///
/// # Errors
///
/// Always returns [`NetlinkError::Netlink`]; `_f` is never invoked.
#[cfg(not(target_os = "linux"))]
pub fn with_netns_async<F, Fut, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = String::from("with_netns_async is only supported on Linux");
    Err(NetlinkError::Netlink(reason))
}