zlayer_agent/netlink.rs
1//! Rust netlink helpers that replace shell-outs to `ip`/`nsenter`/`sysctl`
2//! for per-container overlay network setup.
3//!
4//! This module is populated incrementally through a phased migration.
5//! Stage 1: `move_link_into_netns_and_rename` replaces the shell pair
6//! `ip link set <name> netns <pid>` + `nsenter -t <pid> -n ip
7//! link set <name> name <new>` with a single atomic RTNETLINK
8//! `SetLink` carrying both `IFLA_NET_NS_FD` and `IFLA_IFNAME`.
9//! This bypasses the `/proc/<pid>/ns/net` access problem caused
10//! by libcontainer setting `PR_SET_DUMPABLE(false)` on the
11//! container init process under `SELinux` enforcing.
12//! Stage 2: `create_veth_pair`, `delete_link_by_name`, and
13//! `set_link_up_by_name` replace the host-side veth shell
14//! commands (`ip link add ... type veth peer name ...`,
15//! `ip link delete ...`, `ip link set ... up`) used by
16//! `overlay_manager::attach_to_interface` and the orphan
17//! sweeper. These helpers talk RTNETLINK directly via the
18//! `rtnetlink` crate (async, tokio-backed).
19//! Stage 3: `with_netns`, `add_address_to_link_by_name`, and
20//! `add_default_route_via_dev` replace the remaining
21//! container-netns shell-outs in
22//! `overlay_manager::attach_to_interface`. `with_netns`
23//! runs a closure on a dedicated OS thread that has joined
24//! the target container's network namespace via `setns(2)`,
25//! while the two new RTNETLINK helpers operate on the
26//! current netns (so they must be invoked from inside a
27//! `with_netns` closure). This removes the last three
28//! `nsenter -t <pid> -n ip ...` shell-outs used to assign
29//! the container IP, bring `eth0` / `lo` up, and add the
30//! default route.
31
32#![cfg_attr(
33 not(target_os = "linux"),
34 allow(clippy::missing_errors_doc, clippy::unused_async)
35)]
36
37use thiserror::Error;
38
39/// Errors returned by the netlink helpers in this module.
40#[derive(Debug, Error)]
41pub enum NetlinkError {
42 /// Failed to open or access a file (typically `/proc/<pid>/ns/net`).
43 #[error("io error: {0}")]
44 Io(#[from] std::io::Error),
45
46 /// The requested link was not found in the current network namespace.
47 #[error("link '{0}' not found in current netns")]
48 NotFound(String),
49
50 /// A netlink operation failed.
51 #[error("netlink operation failed: {0}")]
52 Netlink(String),
53}
54
/// Atomically move `link_name` out of the current network namespace into
/// the namespace referenced by `ns_fd`, renaming it to `new_name`.
///
/// This is the fd-based counterpart of [`move_link_into_netns_and_rename`].
/// Use it when the caller already holds an open `/proc/<pid>/ns/net`
/// descriptor (for example to pin the namespace across several operations
/// while the container init may be exiting), so the path is not reopened.
///
/// A single RTNETLINK `SetLink` request carries both `IFLA_NET_NS_FD` and
/// `IFLA_IFNAME`, so the kernel applies the move and the rename together.
///
/// # Errors
///
/// * [`NetlinkError::NotFound`] — `link_name` does not exist in the
///   current netns.
/// * [`NetlinkError::Netlink`] — any other failure (client setup,
///   permission denied, name collision in the target netns, ...).
#[cfg(all(target_os = "linux", feature = "youki-runtime"))]
pub fn move_link_into_netns_fd_and_rename(
    link_name: &str,
    ns_fd: std::os::fd::BorrowedFd<'_>,
    new_name: &str,
) -> Result<(), NetlinkError> {
    use std::os::fd::AsRawFd;

    use libcontainer::network::link::LinkClient;
    use libcontainer::network::wrapper::create_network_client;

    // A client whose rtnetlink socket failed to initialise reports the
    // problem from its calls; either way it ends up as `Netlink`.
    let mut links = match LinkClient::new(create_network_client()) {
        Ok(client) => client,
        Err(e) => {
            return Err(NetlinkError::Netlink(format!(
                "failed to create LinkClient: {e}"
            )));
        }
    };

    // libcontainer's error type exposes no matchable kind, so a missing
    // interface is recognised by inspecting the rendered message.
    let found = match links.get_by_name(link_name) {
        Ok(link) => link,
        Err(e) => {
            let text = e.to_string();
            let missing = text.contains("No such device") || text.contains("not found");
            return Err(if missing {
                NetlinkError::NotFound(link_name.to_string())
            } else {
                NetlinkError::Netlink(format!("get_by_name({link_name}) failed: {text}"))
            });
        }
    };
    let if_index = found.header.index;

    // The caller keeps ownership of `ns_fd`; only the raw descriptor
    // number is handed to the request.
    let target_ns = ns_fd.as_raw_fd();
    links
        .set_ns_fd(if_index, new_name, target_ns)
        .map_err(|e| {
            NetlinkError::Netlink(format!(
                "set_ns_fd(index={if_index}, new_name={new_name}) failed: {e}"
            ))
        })?;

    Ok(())
}
125
126/// Stub for non-Linux Unix platforms (macOS/BSD) and for Linux builds without
127/// the `youki-runtime` feature (which provides the libcontainer-backed impl).
128///
129/// Not emitted on Windows: `attach_to_interface` (the sole caller) is itself
130/// gated `#[cfg(target_os = "linux")]` in `overlay_manager.rs`, so there are
131/// no Windows callers, and the `BorrowedFd` parameter type is Unix-only.
132///
133/// # Errors
134///
135/// Always returns [`NetlinkError::Netlink`] — this function is unsupported on
136/// the current target/feature combination.
137#[cfg(any(
138 all(not(target_os = "linux"), unix),
139 all(target_os = "linux", not(feature = "youki-runtime")),
140))]
141pub fn move_link_into_netns_fd_and_rename(
142 _link_name: &str,
143 _ns_fd: std::os::fd::BorrowedFd<'_>,
144 _new_name: &str,
145) -> Result<(), NetlinkError> {
146 Err(NetlinkError::Netlink(
147 "move_link_into_netns_fd_and_rename requires Linux with the 'youki-runtime' feature"
148 .to_string(),
149 ))
150}
151
152/// Move a link from the current network namespace into the target PID's
153/// network namespace, renaming it in the same atomic operation.
154///
155/// Thin wrapper around [`move_link_into_netns_fd_and_rename`] that
156/// opens `/proc/<target_pid>/ns/net` then delegates. Kept for
157/// backward compatibility and for callers that only need a single
158/// operation on the target netns. Callers that need to perform
159/// multiple operations on the same netns (and want to survive a
160/// racing exit of the container init process) should open the fd
161/// themselves and call [`move_link_into_netns_fd_and_rename`]
162/// directly.
163///
164/// # Errors
165///
166/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot be
167/// opened (e.g. the container process is gone or is not dumpable and we
168/// lack `CAP_SYS_PTRACE`). Returns [`NetlinkError::NotFound`] if
169/// `link_name` does not exist in the current netns. Returns
170/// [`NetlinkError::Netlink`] for any other netlink-level failure
171/// (permission denied, name collision in the target netns, etc.).
172#[cfg(target_os = "linux")]
173pub fn move_link_into_netns_and_rename(
174 link_name: &str,
175 target_pid: u32,
176 new_name: &str,
177) -> Result<(), NetlinkError> {
178 use std::os::fd::{AsFd, OwnedFd};
179
180 let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
181 let ns_fd: OwnedFd = OwnedFd::from(ns_file);
182 move_link_into_netns_fd_and_rename(link_name, ns_fd.as_fd(), new_name)
183}
184
/// Non-Linux placeholder. The overlay manager never reaches this off Linux
/// (libcontainer is a Linux-only dependency); the symbol exists so that
/// `overlay_manager.rs` can stay platform-agnostic.
///
/// Always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub fn move_link_into_netns_and_rename(
    _link_name: &str,
    _target_pid: u32,
    _new_name: &str,
) -> Result<(), NetlinkError> {
    let reason = "move_link_into_netns_and_rename is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
198
199/// Create a veth pair with the two ends named `host_name` and `peer_name`.
200///
201/// Both ends start in the current network namespace. The caller is
202/// responsible for moving the peer end into the container netns (see
203/// [`move_link_into_netns_and_rename`]) and bringing the host end up
204/// (see [`set_link_up_by_name`]).
205///
206/// Replaces the shell-out:
207/// ip link add `<host_name>` type veth peer name `<peer_name>`
208///
209/// # Errors
210///
211/// Returns [`NetlinkError::Netlink`] if RTNETLINK fails for any
212/// reason. `EEXIST` / "File exists" is surfaced verbatim so the caller
213/// can distinguish a leaked endpoint (typically a sign the orphan
214/// sweeper missed something) from a permission or interface-name
215/// problem.
216#[cfg(target_os = "linux")]
217pub async fn create_veth_pair(host_name: &str, peer_name: &str) -> Result<(), NetlinkError> {
218 let (connection, handle, _) = rtnetlink::new_connection()
219 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
220 tokio::spawn(connection);
221
222 handle
223 .link()
224 .add()
225 .veth(host_name.to_string(), peer_name.to_string())
226 .execute()
227 .await
228 .map_err(|e| {
229 let msg = e.to_string();
230 if msg.contains("File exists") || msg.contains("EEXIST") {
231 NetlinkError::Netlink(format!(
232 "veth pair already exists: host={host_name} peer={peer_name}: {msg}"
233 ))
234 } else {
235 NetlinkError::Netlink(format!(
236 "veth create failed (host={host_name}, peer={peer_name}): {msg}"
237 ))
238 }
239 })
240}
241
/// Non-Linux placeholder; always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn create_veth_pair(_host_name: &str, _peer_name: &str) -> Result<(), NetlinkError> {
    let reason = "create_veth_pair is only supported on Linux";
    Err(NetlinkError::Netlink(String::from(reason)))
}
249
250/// Delete the link by name. Idempotent: returns `Ok(())` if the link
251/// does not exist. Any other error surfaces as
252/// [`NetlinkError::Netlink`].
253///
254/// Replaces the shell-out:
255/// ip link delete `<name>`
256///
257/// Used in `overlay_manager::attach_to_interface` pre-cleanup,
258/// cleanup-on-error, and the orphan-veth sweeper.
259///
260/// # Errors
261///
262/// Returns [`NetlinkError::Netlink`] if RTNETLINK reports a failure
263/// other than `ENODEV` / "No such device" (which are treated as
264/// success so this is safe to call unconditionally).
265#[cfg(target_os = "linux")]
266pub async fn delete_link_by_name(name: &str) -> Result<(), NetlinkError> {
267 use futures_util::stream::TryStreamExt;
268
269 let (connection, handle, _) = rtnetlink::new_connection()
270 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
271 tokio::spawn(connection);
272
273 // Look up the link by name. Treat "not found" as success so the
274 // helper is safe to call unconditionally in cleanup paths.
275 let lookup = handle
276 .link()
277 .get()
278 .match_name(name.to_string())
279 .execute()
280 .try_next()
281 .await;
282
283 let link = match lookup {
284 Ok(Some(link)) => link,
285 Ok(None) => return Ok(()),
286 Err(rtnetlink::Error::NetlinkError(err)) => {
287 // libc::ENODEV == 19. netlink-packet-core reports the raw
288 // errno as a negative i32 in `code`, but the exact type has
289 // moved between versions, so match by both numeric code and
290 // the human-readable message for belt-and-suspenders safety.
291 let msg = err.to_string();
292 let is_enodev = err
293 .code
294 .is_some_and(|c| c.get().unsigned_abs() == libc::ENODEV as u32);
295 if is_enodev || msg.contains("No such device") {
296 return Ok(());
297 }
298 return Err(NetlinkError::Netlink(format!(
299 "link lookup failed for {name}: {msg}"
300 )));
301 }
302 Err(e) => {
303 let msg = e.to_string();
304 if msg.contains("No such device") {
305 return Ok(());
306 }
307 return Err(NetlinkError::Netlink(format!(
308 "link lookup failed for {name}: {msg}"
309 )));
310 }
311 };
312
313 let index = link.header.index;
314
315 handle
316 .link()
317 .del(index)
318 .execute()
319 .await
320 .map_err(|e| NetlinkError::Netlink(format!("link delete failed for {name}: {e}")))
321}
322
/// Non-Linux placeholder; always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn delete_link_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = "delete_link_by_name is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
330
331/// List all network interfaces in the current netns.
332///
333/// Returns a `Vec` of `(index, name)` tuples for every link the kernel
334/// reports. Used by the orphan veth sweeper to find `veth-<pid>` and
335/// `vc-<pid>` links whose owning PID is dead, so it can clean them up
336/// via [`delete_link_by_name`].
337///
338/// Replaces the shell-out:
339/// ip -br link
340///
341/// Issues a single RTNETLINK `RTM_GETLINK` dump request and iterates
342/// the resulting stream of `LinkMessage`s. Each message contributes
343/// one `(index, name)` tuple; messages without an `IFLA_IFNAME`
344/// attribute (extremely rare in practice — the kernel always emits
345/// one for configured devices) are silently skipped.
346///
347/// # Errors
348///
349/// Returns [`NetlinkError::Netlink`] if the rtnetlink socket cannot
350/// be created or if the dump stream itself reports a failure.
351#[cfg(target_os = "linux")]
352pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
353 use futures_util::stream::TryStreamExt;
354 use netlink_packet_route::link::LinkAttribute;
355
356 let (connection, handle, _) = rtnetlink::new_connection()
357 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
358 tokio::spawn(connection);
359
360 let mut stream = handle.link().get().execute();
361 let mut links = Vec::new();
362
363 while let Some(msg) = stream
364 .try_next()
365 .await
366 .map_err(|e| NetlinkError::Netlink(format!("link dump failed: {e}")))?
367 {
368 // LinkHeader.index is already u32 in netlink-packet-route
369 // 0.19 — no cast needed.
370 let index = msg.header.index;
371 let Some(name) = msg.attributes.iter().find_map(|a| match a {
372 LinkAttribute::IfName(n) => Some(n.clone()),
373 _ => None,
374 }) else {
375 continue;
376 };
377 links.push((index, name));
378 }
379
380 Ok(links)
381}
382
/// Non-Linux placeholder; always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn list_all_links() -> Result<Vec<(u32, String)>, NetlinkError> {
    let reason = "list_all_links is only supported on Linux";
    Err(NetlinkError::Netlink(String::from(reason)))
}
390
391/// Set the link identified by `name` to the "up" administrative state.
392///
393/// Replaces the shell-out:
394/// ip link set `<name>` up
395///
396/// Unlike [`delete_link_by_name`] this is *not* idempotent for missing
397/// links: if the link does not exist the caller almost certainly has a
398/// bug upstream (we only call this on a veth end we just created), so
399/// we return [`NetlinkError::NotFound`] rather than silently succeeding.
400///
401/// # Errors
402///
403/// Returns [`NetlinkError::NotFound`] if no link with the given name
404/// exists in the current netns. Returns [`NetlinkError::Netlink`] for
405/// any other RTNETLINK failure (permission denied, etc.).
406#[cfg(target_os = "linux")]
407pub async fn set_link_up_by_name(name: &str) -> Result<(), NetlinkError> {
408 use futures_util::stream::TryStreamExt;
409
410 let (connection, handle, _) = rtnetlink::new_connection()
411 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
412 tokio::spawn(connection);
413
414 let link = handle
415 .link()
416 .get()
417 .match_name(name.to_string())
418 .execute()
419 .try_next()
420 .await
421 .map_err(|e| {
422 let msg = e.to_string();
423 if msg.contains("No such device") {
424 NetlinkError::NotFound(name.to_string())
425 } else {
426 NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
427 }
428 })?
429 .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
430
431 let index = link.header.index;
432
433 handle
434 .link()
435 .set(index)
436 .up()
437 .execute()
438 .await
439 .map_err(|e| NetlinkError::Netlink(format!("link set up failed for {name}: {e}")))
440}
441
/// Non-Linux placeholder; always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn set_link_up_by_name(_name: &str) -> Result<(), NetlinkError> {
    let reason = "set_link_up_by_name is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
449
450/// Add an IP address to the link identified by `name` in the current
451/// network namespace.
452///
453/// Replaces (in combination with [`with_netns`]):
454/// nsenter -t `<pid>` -n ip \[-6\] addr add `<addr>/<prefix_len>` dev `<name>`
455///
456/// `addr` may be v4 or v6. `prefix_len` is the CIDR prefix length
457/// (24 for a `/24`, 64 for a `/64`, etc.).
458///
459/// This helper operates on the CURRENT network namespace — it looks
460/// up the interface index via a local rtnetlink socket. To target a
461/// container's netns, wrap the call inside [`with_netns`].
462///
463/// # Errors
464///
465/// Returns [`NetlinkError::NotFound`] if the link is missing. Returns
466/// [`NetlinkError::Netlink`] for any other rtnetlink failure
467/// (permission denied, EEXIST on a duplicate address, etc.).
468#[cfg(target_os = "linux")]
469pub async fn add_address_to_link_by_name(
470 name: &str,
471 addr: std::net::IpAddr,
472 prefix_len: u8,
473) -> Result<(), NetlinkError> {
474 use futures_util::stream::TryStreamExt;
475
476 let (connection, handle, _) = rtnetlink::new_connection()
477 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
478 tokio::spawn(connection);
479
480 let link = handle
481 .link()
482 .get()
483 .match_name(name.to_string())
484 .execute()
485 .try_next()
486 .await
487 .map_err(|e| {
488 let msg = e.to_string();
489 if msg.contains("No such device") {
490 NetlinkError::NotFound(name.to_string())
491 } else {
492 NetlinkError::Netlink(format!("link lookup failed for {name}: {msg}"))
493 }
494 })?
495 .ok_or_else(|| NetlinkError::NotFound(name.to_string()))?;
496
497 let index = link.header.index;
498
499 handle
500 .address()
501 .add(index, addr, prefix_len)
502 .execute()
503 .await
504 .map_err(|e| {
505 NetlinkError::Netlink(format!(
506 "address add failed for {name} ({addr}/{prefix_len}): {e}"
507 ))
508 })
509}
510
/// Non-Linux placeholder; always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_address_to_link_by_name(
    _name: &str,
    _addr: std::net::IpAddr,
    _prefix_len: u8,
) -> Result<(), NetlinkError> {
    let reason = "add_address_to_link_by_name is only supported on Linux";
    Err(NetlinkError::Netlink(String::from(reason)))
}
522
523/// Add a default route via the given device name in the current
524/// network namespace.
525///
526/// Replaces (in combination with [`with_netns`]):
527/// nsenter -t `<pid>` -n ip \[-6\] route add default dev `<dev_name>`
528///
529/// The route is a direct, link-scope route: no gateway, the kernel
530/// ARPs / uses NDISC on the device for destination resolution. This
531/// is the correct form for a point-to-point veth link where the peer
532/// is reachable directly.
533///
534/// For IPv4 the destination prefix is `0.0.0.0/0`. For IPv6 it is
535/// `::/0`. Controlled by `is_v6`.
536///
537/// # Errors
538///
539/// Returns [`NetlinkError::NotFound`] if the device is missing.
540/// Returns [`NetlinkError::Netlink`] for any other rtnetlink failure.
541#[cfg(target_os = "linux")]
542pub async fn add_default_route_via_dev(dev_name: &str, is_v6: bool) -> Result<(), NetlinkError> {
543 use futures_util::stream::TryStreamExt;
544 use netlink_packet_route::route::RouteScope;
545
546 let (connection, handle, _) = rtnetlink::new_connection()
547 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
548 tokio::spawn(connection);
549
550 let link = handle
551 .link()
552 .get()
553 .match_name(dev_name.to_string())
554 .execute()
555 .try_next()
556 .await
557 .map_err(|e| {
558 let msg = e.to_string();
559 if msg.contains("No such device") {
560 NetlinkError::NotFound(dev_name.to_string())
561 } else {
562 NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
563 }
564 })?
565 .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
566
567 let oif_idx = link.header.index;
568
569 if is_v6 {
570 handle
571 .route()
572 .add()
573 .v6()
574 .destination_prefix(std::net::Ipv6Addr::UNSPECIFIED, 0)
575 .output_interface(oif_idx)
576 .scope(RouteScope::Link)
577 .execute()
578 .await
579 .map_err(|e| {
580 NetlinkError::Netlink(format!("default route add v6 via {dev_name} failed: {e}"))
581 })
582 } else {
583 handle
584 .route()
585 .add()
586 .v4()
587 .destination_prefix(std::net::Ipv4Addr::UNSPECIFIED, 0)
588 .output_interface(oif_idx)
589 .scope(RouteScope::Link)
590 .execute()
591 .await
592 .map_err(|e| {
593 NetlinkError::Netlink(format!("default route add v4 via {dev_name} failed: {e}"))
594 })
595 }
596}
597
/// Non-Linux placeholder; always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn add_default_route_via_dev(_dev_name: &str, _is_v6: bool) -> Result<(), NetlinkError> {
    let reason = "add_default_route_via_dev is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
605
606/// Add or replace a route to `dest/prefix_len` that forwards via the
607/// interface named `dev_name`. Optional `src` sets the preferred source
608/// address.
609///
610/// Replaces the shell-outs:
611/// ip route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
612/// ip -6 route replace `<dest>/<prefix_len>` dev `<dev_name>` \[src `<src>`\]
613///
614/// Uses `NLM_F_REPLACE | NLM_F_CREATE` semantics (via rtnetlink's
615/// `.replace()` on the route add builder) so stale routes left behind
616/// by a previous daemon run don't cause `EEXIST`.
617///
618/// The route is installed with link scope (direct-via-dev, no
619/// gateway) which is the correct form for a per-container `/32` or
620/// `/128` pointing at a host-side veth endpoint.
621///
622/// `dest` and `src` (if provided) must have matching address families
623/// — passing a v4 `dest` with a v6 `src` returns
624/// [`NetlinkError::Netlink`] without touching the kernel.
625///
626/// # Errors
627///
628/// Returns [`NetlinkError::NotFound`] if `dev_name` does not exist in
629/// the current netns. Returns [`NetlinkError::Netlink`] on address
630/// family mismatch or any RTNETLINK failure.
631#[cfg(target_os = "linux")]
632pub async fn replace_route_via_dev(
633 dest: std::net::IpAddr,
634 prefix_len: u8,
635 dev_name: &str,
636 src: Option<std::net::IpAddr>,
637) -> Result<(), NetlinkError> {
638 use std::net::IpAddr;
639
640 use futures_util::stream::TryStreamExt;
641 use netlink_packet_route::route::RouteScope;
642
643 let (connection, handle, _) = rtnetlink::new_connection()
644 .map_err(|e| NetlinkError::Netlink(format!("new_connection failed: {e}")))?;
645 tokio::spawn(connection);
646
647 let link = handle
648 .link()
649 .get()
650 .match_name(dev_name.to_string())
651 .execute()
652 .try_next()
653 .await
654 .map_err(|e| {
655 let msg = e.to_string();
656 if msg.contains("No such device") {
657 NetlinkError::NotFound(dev_name.to_string())
658 } else {
659 NetlinkError::Netlink(format!("link lookup failed for {dev_name}: {msg}"))
660 }
661 })?
662 .ok_or_else(|| NetlinkError::NotFound(dev_name.to_string()))?;
663
664 let oif_idx = link.header.index;
665
666 match (dest, src) {
667 (IpAddr::V4(d), Some(IpAddr::V4(s))) => handle
668 .route()
669 .add()
670 .v4()
671 .destination_prefix(d, prefix_len)
672 .output_interface(oif_idx)
673 .scope(RouteScope::Link)
674 .pref_source(s)
675 .replace()
676 .execute()
677 .await
678 .map_err(|e| {
679 NetlinkError::Netlink(format!(
680 "route replace v4 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
681 ))
682 }),
683 (IpAddr::V4(d), None) => handle
684 .route()
685 .add()
686 .v4()
687 .destination_prefix(d, prefix_len)
688 .output_interface(oif_idx)
689 .scope(RouteScope::Link)
690 .replace()
691 .execute()
692 .await
693 .map_err(|e| {
694 NetlinkError::Netlink(format!(
695 "route replace v4 {d}/{prefix_len} dev {dev_name} failed: {e}"
696 ))
697 }),
698 (IpAddr::V6(d), Some(IpAddr::V6(s))) => handle
699 .route()
700 .add()
701 .v6()
702 .destination_prefix(d, prefix_len)
703 .output_interface(oif_idx)
704 .scope(RouteScope::Link)
705 .pref_source(s)
706 .replace()
707 .execute()
708 .await
709 .map_err(|e| {
710 NetlinkError::Netlink(format!(
711 "route replace v6 {d}/{prefix_len} dev {dev_name} src {s} failed: {e}"
712 ))
713 }),
714 (IpAddr::V6(d), None) => handle
715 .route()
716 .add()
717 .v6()
718 .destination_prefix(d, prefix_len)
719 .output_interface(oif_idx)
720 .scope(RouteScope::Link)
721 .replace()
722 .execute()
723 .await
724 .map_err(|e| {
725 NetlinkError::Netlink(format!(
726 "route replace v6 {d}/{prefix_len} dev {dev_name} failed: {e}"
727 ))
728 }),
729 (IpAddr::V4(_), Some(IpAddr::V6(_))) | (IpAddr::V6(_), Some(IpAddr::V4(_))) => Err(
730 NetlinkError::Netlink(format!("address family mismatch: dest={dest} src={src:?}")),
731 ),
732 }
733}
734
/// Non-Linux placeholder; always returns [`NetlinkError::Netlink`].
#[cfg(not(target_os = "linux"))]
pub async fn replace_route_via_dev(
    _dest: std::net::IpAddr,
    _prefix_len: u8,
    _dev_name: &str,
    _src: Option<std::net::IpAddr>,
) -> Result<(), NetlinkError> {
    let reason = "replace_route_via_dev is only supported on Linux";
    Err(NetlinkError::Netlink(String::from(reason)))
}
747
748/// Set a sysctl via the `/proc/sys/...` filesystem.
749///
750/// `key` uses dotted form like `net.ipv4.ip_forward`; dots are
751/// translated to path separators so the effective path is
752/// `/proc/sys/net/ipv4/ip_forward`. Writes the string form of
753/// `value` to the file.
754///
755/// Replaces the shell-outs:
756/// sysctl -w `<key>`=`<value>`
757///
758/// Writing to `/proc/sys/...` is the kernel-standard way of setting
759/// sysctls and works under any confinement that still allows write
760/// access to `/proc/sys` (which the overlay manager needs anyway for
761/// its other operations).
762///
763/// # Errors
764///
765/// Returns [`NetlinkError::Io`] if the write fails (e.g. permission
766/// denied, file missing because the sysctl doesn't exist on this
767/// kernel, etc.).
768pub fn set_sysctl(key: &str, value: &str) -> Result<(), NetlinkError> {
769 let path = format!("/proc/sys/{}", key.replace('.', "/"));
770 std::fs::write(&path, value)?;
771 Ok(())
772}
773
774/// Run a synchronous closure inside the network namespace referenced
775/// by the given `OwnedFd`.
776///
777/// This is the fd-based variant of [`with_netns`]. Callers that have
778/// already opened `/proc/<pid>/ns/net` (e.g. to pin the namespace
779/// across multiple operations) should use this form to reuse the
780/// same fd and avoid re-opening the procfs path — the reopen would
781/// fail with `ENOENT` if the container init process has exited in
782/// the meantime, even though the namespace itself is still alive
783/// because our pinned fd holds a reference.
784///
785/// The `OwnedFd` is moved into the dedicated worker thread and
786/// closed when the thread exits. Spawns a fresh OS thread (not a
787/// tokio blocking worker) because `setns` affects the whole thread
788/// and we don't want to contaminate a shared worker.
789///
790/// # Errors
791///
792/// Returns [`NetlinkError::Netlink`] if `setns` fails or the
793/// dedicated thread panics. Any error returned by the closure itself
794/// is propagated verbatim.
795#[cfg(target_os = "linux")]
796pub fn with_netns_fd<F, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
797where
798 F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
799 T: Send + 'static,
800{
801 let join_handle = std::thread::spawn(move || -> Result<T, NetlinkError> {
802 nix::sched::setns(&ns_fd, nix::sched::CloneFlags::CLONE_NEWNET)
803 .map_err(|e| NetlinkError::Netlink(format!("setns(ns_fd) failed: {e}")))?;
804 // Keep the fd alive for the duration of the closure even
805 // though setns only needs it for the syscall itself. Dropping
806 // it explicitly after the closure makes the lifetime obvious.
807 let result = f();
808 drop(ns_fd);
809 result
810 });
811
812 join_handle
813 .join()
814 .map_err(|_| NetlinkError::Netlink("with_netns_fd thread panicked".to_string()))?
815}
816
/// Placeholder for non-Linux Unix targets (macOS/BSD). Not compiled on
/// Windows: `OwnedFd` is Unix-only and the sole caller chain
/// (`attach_to_interface` in `overlay_manager.rs`) is gated on
/// `target_os = "linux"`.
///
/// Always returns [`NetlinkError::Netlink`]; the closure is never run.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd<F, T>(_ns_fd: std::os::fd::OwnedFd, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = "with_netns_fd is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
830
831/// Run a synchronous closure inside the network namespace of the
832/// given PID.
833///
834/// Thin wrapper around [`with_netns_fd`] that opens
835/// `/proc/<target_pid>/ns/net` then delegates. Kept for backward
836/// compatibility and for callers that only need a single operation
837/// on the target netns. Callers that need to pin the namespace
838/// across multiple operations (and survive a racing exit of the
839/// container init) should open the fd themselves and call
840/// [`with_netns_fd`] directly.
841///
842/// Because `setns` is synchronous and `rtnetlink` is async, the
843/// typical usage pattern inside the closure is to build a local
844/// current-thread tokio runtime and `block_on` the netlink calls.
845/// See [`with_netns_async`] for a convenience wrapper that does
846/// exactly this.
847///
848/// # Errors
849///
850/// Returns [`NetlinkError::Io`] if `/proc/<target_pid>/ns/net` cannot
851/// be opened. Returns [`NetlinkError::Netlink`] if `setns` fails or
852/// the dedicated thread panics. Any error returned by the closure
853/// itself is propagated verbatim.
854#[cfg(target_os = "linux")]
855pub fn with_netns<F, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
856where
857 F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
858 T: Send + 'static,
859{
860 use std::os::fd::OwnedFd;
861
862 let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
863 let ns_fd: OwnedFd = OwnedFd::from(ns_file);
864 with_netns_fd(ns_fd, f)
865}
866
/// Non-Linux placeholder; always returns [`NetlinkError::Netlink`] and
/// never runs the closure.
#[cfg(not(target_os = "linux"))]
pub fn with_netns<F, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Result<T, NetlinkError> + Send + 'static,
    T: Send + 'static,
{
    let reason = "with_netns is only supported on Linux";
    Err(NetlinkError::Netlink(String::from(reason)))
}
878
879/// Convenience wrapper around [`with_netns_fd`] that builds a local
880/// current-thread tokio runtime inside the dedicated thread and
881/// drives the provided async future to completion.
882///
883/// The future is produced by calling `f()` from inside the thread
884/// that has already joined the target netns, so any rtnetlink
885/// operations awaited inside the future will talk to the target
886/// netns's kernel.
887///
888/// The local runtime is lightweight (single-thread, built per call)
889/// and only drives a handful of netlink messages before being
890/// dropped with the thread.
891///
892/// The `OwnedFd` is moved into the worker thread and closed when
893/// the thread exits.
894///
895/// # Errors
896///
897/// Returns [`NetlinkError::Netlink`] per [`with_netns_fd`], plus
898/// [`NetlinkError::Netlink`] if the local runtime fails to build.
899/// Any error returned by the future is propagated verbatim.
900#[cfg(target_os = "linux")]
901pub fn with_netns_fd_async<F, Fut, T>(ns_fd: std::os::fd::OwnedFd, f: F) -> Result<T, NetlinkError>
902where
903 F: FnOnce() -> Fut + Send + 'static,
904 Fut: std::future::Future<Output = Result<T, NetlinkError>>,
905 T: Send + 'static,
906{
907 with_netns_fd(ns_fd, move || {
908 let rt = tokio::runtime::Builder::new_current_thread()
909 .enable_all()
910 .build()
911 .map_err(|e| NetlinkError::Netlink(format!("local runtime build failed: {e}")))?;
912 rt.block_on(f())
913 })
914}
915
/// Placeholder for non-Linux Unix targets (macOS/BSD). Not compiled on
/// Windows: `OwnedFd` is Unix-only and the sole caller chain
/// (`attach_to_interface` in `overlay_manager.rs`) is gated on
/// `target_os = "linux"`.
///
/// Always returns [`NetlinkError::Netlink`]; `f` is never called.
#[cfg(all(not(target_os = "linux"), unix))]
pub fn with_netns_fd_async<F, Fut, T>(
    _ns_fd: std::os::fd::OwnedFd,
    _f: F,
) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = "with_netns_fd_async is only supported on Linux";
    Err(NetlinkError::Netlink(reason.to_owned()))
}
933
934/// Convenience wrapper around [`with_netns`] that builds a local
935/// current-thread tokio runtime inside the dedicated thread and
936/// drives the provided async future to completion.
937///
938/// Thin wrapper around [`with_netns_fd_async`] that opens
939/// `/proc/<target_pid>/ns/net` then delegates.
940///
941/// # Errors
942///
943/// Returns [`NetlinkError::Io`] / [`NetlinkError::Netlink`] per
944/// [`with_netns`], plus [`NetlinkError::Netlink`] if the local
945/// runtime fails to build. Any error returned by the future is
946/// propagated verbatim.
947#[cfg(target_os = "linux")]
948pub fn with_netns_async<F, Fut, T>(target_pid: u32, f: F) -> Result<T, NetlinkError>
949where
950 F: FnOnce() -> Fut + Send + 'static,
951 Fut: std::future::Future<Output = Result<T, NetlinkError>>,
952 T: Send + 'static,
953{
954 use std::os::fd::OwnedFd;
955
956 let ns_file = std::fs::File::open(format!("/proc/{target_pid}/ns/net"))?;
957 let ns_fd: OwnedFd = OwnedFd::from(ns_file);
958 with_netns_fd_async(ns_fd, f)
959}
960
/// Non-Linux placeholder; always returns [`NetlinkError::Netlink`] and
/// never calls `f`.
#[cfg(not(target_os = "linux"))]
pub fn with_netns_async<F, Fut, T>(_target_pid: u32, _f: F) -> Result<T, NetlinkError>
where
    F: FnOnce() -> Fut + Send + 'static,
    Fut: std::future::Future<Output = Result<T, NetlinkError>>,
    T: Send + 'static,
{
    let reason = "with_netns_async is only supported on Linux";
    Err(NetlinkError::Netlink(String::from(reason)))
}