ts_runtime/lib.rs
1#![doc = include_str!("../README.md")]
2
3extern crate ts_netstack_smoltcp as netstack;
4
5use core::time::Duration;
6use std::sync::Arc;
7
8use kameo::{
9 actor::{ActorRef, Spawn, WeakActorRef},
10 mailbox::Signal,
11};
12use netstack::netcore::Channel;
13use tokio::sync::watch;
14
15use crate::{
16 control_runner::ControlRunner, dataplane::DataplaneActor, direct::DirectManager,
17 forwarder_actor::ForwarderActor, multiderp::Multiderp, netstack_actor::NetstackActor,
18};
19
/// Pcap stream framer for debug packet capture (`CapturePcap`).
pub mod capture;
/// Control runner.
pub mod control_runner;
mod dataplane;
mod derp_latency;
/// Device connection-state tracking ([`DeviceState`]) and typed registration outcome
/// ([`RegistrationError`]).
pub mod device_state;
mod direct;
mod env;
mod error;
/// Fallback TCP handler registry (`tsnet.Server.RegisterFallbackTCPHandler` parity).
pub mod fallback_tcp;
mod forwarder_actor;
/// Client-side Funnel ingress termination (`tsnet`'s `ListenFunnel` data path).
pub mod funnel;
mod magic_dns;
mod multiderp;
mod netstack_actor;
mod packetfilter;
/// Peer tracker actor ([`PeerTracker`]) and the request messages the runtime sends it (status
/// snapshot, peer listing, WhoIs, netmap watch, state re-publication).
pub mod peer_tracker;
mod peerapi;
mod peerapi_doh;
mod route_updater;
/// Stored Serve config + accept-loop runtime (`tsnet`'s `Get/SetServeConfig` + serving runtime).
pub mod serve;
mod src_filter;
/// Netmap status snapshot, WhoIs, and watcher types.
pub mod status;
/// Taildrop peer-to-peer file transfer store.
pub mod taildrop;
// NOTE(review): `taildrop_send` is public but has no summary here. Presumably the Taildrop send
// path (the counterpart of the `taildrop` receive store) — confirm and add a `///` line.
pub mod taildrop_send;
/// Tailnet-Lock (TKA) chain-sync orchestration: bootstrap + offer/send driver (the runtime layer
/// that bridges the `ts_control` sync RPCs and the `ts_tka` chain logic).
mod tka_sync;
// Only compiled with the `tun` feature; `Runtime::spawn` hard-errors when TUN mode is requested
// without it.
#[cfg(feature = "tun")]
mod tun_actor;
58
59pub use device_state::{DeviceState, RegistrationError};
60pub(crate) use env::Env;
61pub use error::{Error, ErrorKind};
62pub use status::{FileTarget, NetcheckReport, RegionLatency, Status, StatusNode, WhoIs};
63pub use ts_dataplane::{CaptureHook, CapturePath};
64
65use crate::peer_tracker::PeerTracker;
66
/// The runtime for a tailscale device.
///
/// Owns the handles needed to drive and tear down the actor system started by
/// [`Runtime::spawn`]. Call [`Runtime::graceful_shutdown`] to stop it cleanly; dropping the value
/// without doing so still signals the actors to stop (see the `Drop` impl).
pub struct Runtime {
    /// Reference to the control actor.
    pub control: ActorRef<ControlRunner>,
    /// Reference to the dataplane actor. Used to install the debug capture hook and stopped
    /// explicitly on shutdown.
    dataplane: ActorRef<DataplaneActor>,
    /// Reference to the direct (disco/UDP underlay) manager, retained so [`Runtime::rebind`] can
    /// ask it to re-bind the underlay socket on a network/link change.
    direct: ActorRef<DirectManager>,
    /// Reference to the application netstack actor. `None` in TUN transport mode, where there is
    /// no userspace application netstack (the application data path is a real kernel TUN device).
    netstack: Option<WeakActorRef<NetstackActor>>,
    /// Reference to the peer tracker for peer lookups.
    pub peer_tracker: WeakActorRef<PeerTracker>,
    /// Fallback TCP handler registry, bound to the application netstack. `None` in TUN transport
    /// mode (no application netstack exists to attach it to).
    fallback_tcp: Option<fallback_tcp::FallbackTcpManager>,
    /// Shared runtime environment; a clone is handed to every actor at spawn time. The runtime
    /// reads the message bus, Taildrop store, Funnel slots and exit-node selector through it.
    env: Env,
    /// Sender side of the shutdown flag (`false` while running). Set to `true` by
    /// [`Runtime::graceful_shutdown`] and by `Drop`; the receiver side is given to `Env` at spawn.
    shutdown: watch::Sender<bool>,
    /// Sender side of the exit-node selector `watch` cell. Held privately here (not on the cloned
    /// `Env`, which keeps only the read side) so that only `Runtime::set_exit_node` can mutate the
    /// selection; the route updater and source filter re-read it via [`Env::exit_node`].
    exit_node_tx: watch::Sender<Option<ts_control::ExitNodeSelector>>,
    /// Receiver mirroring the *active* (resolved + fail-closed) exit node's stable id, fed by the
    /// route updater. Read by [`Runtime::status`] / [`Runtime::active_exit_node`] to report which
    /// exit node traffic is actually egressing through (vs. the merely-configured selector).
    active_exit_rx: watch::Receiver<Option<ts_control::StableNodeId>>,
    /// Receiver for the device connection-state cell, fed by the control runner. Read by
    /// [`Runtime::watch_state`] and [`Runtime::wait_until_running`].
    state_rx: watch::Receiver<DeviceState>,
}
97
98impl Runtime {
    /// Spawn a new runtime with the given parameters for connecting to a tailnet.
    ///
    /// Actors are started in dependency order: dataplane, multiderp, direct manager, forwarder,
    /// then the route updater / packet filter / source filter / peer tracker, then the application
    /// data path (userspace netstack or TUN, chosen by `config.transport_mode`), and finally the
    /// control runner, which takes ownership of `config` and `auth_key`.
    ///
    /// # Errors
    ///
    /// Returns an [`Error`] if any of the start-up `ask`s to the dataplane, forwarder, or netstack
    /// actor fails, or [`ErrorKind::TunUnavailable`] when `config.transport_mode` requests TUN but
    /// the crate was built without the `tun` feature.
    pub async fn spawn(
        config: ts_control::Config,
        auth_key: Option<String>,
        keys: ts_keys::NodeState,
    ) -> Result<Self, Error> {
        // Shutdown flag: the sender stays on the `Runtime`, the receiver goes into `Env`.
        let (shutdown_tx, shutdown_rx) = watch::channel(false);

        // The exit-node selector is a live `watch` cell so `Device::set_exit_node` can change it at
        // runtime. `new_with_exit_tx` returns the `Sender` (mutation capability) separately so it is
        // retained privately on the `Runtime`, while only the `Receiver` (the readers' contract)
        // lives on the cloned `Env`. The initial value comes from `ForwarderConfig.exit_node`.
        let (env, exit_node_tx) = Env::new_with_exit_tx(
            keys,
            shutdown_rx,
            env::ForwarderConfig::from_control_config(&config),
        );

        // Both userspace netstacks (application + forwarder) share one netstack config. Honor the
        // per-deployment TCP buffer knob when set, otherwise fall back to the netstack default.
        let netstack_config = netstack_config_from(config.tcp_buffer_size);

        let dataplane = DataplaneActor::spawn(env.clone());

        // First overlay transport: the application-side seam (netstack, or TUN in TUN mode).
        let (netstack_id, netstack_up, netstack_down) =
            dataplane.ask(dataplane::NewOverlayTransport).await?;

        // A second overlay transport feeds the dedicated any-IP forwarder netstack. Inbound packets
        // for advertised subnet routes / the exit-node default route are routed here (see
        // `route_updater`), keeping forwarded flows off the application netstack.
        let (forwarder_id, forwarder_up, forwarder_down) =
            dataplane.ask(dataplane::NewOverlayTransport).await?;

        let multiderp = Multiderp::spawn((env.clone(), dataplane.clone()));

        // Spawn the direct (disco) underlay manager before the route updater. Its `on_start`
        // binds the UDP socket and registers its transport synchronously, so by the time the
        // route updater asks it for the direct transport id it is guaranteed to be available.
        let direct = DirectManager::spawn((env.clone(), dataplane.clone(), multiderp.clone()));

        // Spawn the forwarder before the route updater. Its `on_start` builds the forwarder
        // netstack, enables any-IP acceptance, and starts the per-port accept loops synchronously,
        // so by the time the route updater begins delivering advertised prefixes to
        // `forwarder_id` the netstack is already draining its transport.
        let forwarder = ForwarderActor::spawn((
            env.clone(),
            netstack_config.clone(),
            forwarder_up,
            forwarder_down,
        ));
        // Force `on_start` to finish (any-IP enabled, accept loops live) before the route updater
        // can route the first inbound flow to `forwarder_id`: an `ask` blocks until the actor has
        // started.
        //
        // The forwarder netstack's overlay `Channel` is reused by the TUN application path for
        // recursive / exit-node-DoH MagicDNS forwarding (TUN mode has no application netstack of its
        // own, but the forwarder netstack runs in both modes and egresses over the overlay — the
        // anti-leak property `forward_query`/`forward_doh` require). Only the `tun` Tun arm consumes
        // it, so it is unused when the `tun` feature is off — allow that without warn-as-error.
        #[cfg_attr(not(feature = "tun"), allow(unused_variables))]
        let (forwarder_channel,) = forwarder.ask(forwarder_actor::GetChannel).await?;

        // The route updater is the single authoritative resolver of the active (resolved,
        // fail-closed) exit node; it publishes the resolved stable id into this watch cell so
        // `Runtime::status` can report which exit is actually engaged (not just configured).
        let (active_exit_tx, active_exit_rx) = watch::channel(None);
        route_updater::RouteUpdater::spawn((
            multiderp.clone(),
            direct.clone(),
            env.clone(),
            netstack_id,
            forwarder_id,
            active_exit_tx,
        ));
        packetfilter::PacketfilterUpdater::spawn(env.clone());
        src_filter::SourceFilterUpdater::spawn(env.clone());
        // Only a weak reference to the peer tracker is kept on the `Runtime`.
        let peer_tracker = PeerTracker::spawn(env.clone()).downgrade();

        // Select the application data path from the transport mode. The forwarder/egress path
        // above is UNCHANGED in both modes — TUN mode only swaps the application data path, never
        // the forwarder. `config` is moved into `ControlRunner::spawn` below, so branch on a
        // borrow and clone the small `TunConfig` where needed before the move.
        //
        // - Netstack (the default, and the only reachable arm when the `tun` feature is off):
        //   spawn the application netstack + MagicDNS responder + fallback-TCP registry, all on
        //   the `netstack_up`/`netstack_down` overlay seam.
        // - Tun: spawn `TunActor` on that same overlay seam instead; no application netstack and
        //   no MagicDNS responder exist, and `netstack`/`fallback_tcp` are `None`.
        // - Tun requested but built without the `tun` feature: hard-error (a config/build
        //   mismatch knowable at spawn time). NEVER silently fall back to netstack.
        let (netstack, fallback_tcp) = match &config.transport_mode {
            ts_control::TransportMode::Netstack => {
                let netstack = NetstackActor::spawn((
                    env.clone(),
                    netstack_config,
                    netstack_up,
                    netstack_down,
                ));

                // Fetch the netstack channel while we still hold the strong ActorRef, then spawn
                // the MagicDNS responder on it. Fire-and-forget: like src_filter/route_updater,
                // it's owned by the message bus and isn't stored on `Runtime`.
                let (channel,) = netstack.ask(netstack_actor::GetChannel).await?;
                // The fallback-TCP registry attaches to the application netstack — the same one
                // that carries the embedder's explicit `Device::tcp_listen` sockets — so a
                // fallback handler sees exactly the inbound flows no explicit listener matched.
                let fallback_tcp = fallback_tcp::FallbackTcpManager::new(channel.clone());
                magic_dns::MagicDnsActor::spawn((env.clone(), channel));

                (Some(netstack.downgrade()), Some(fallback_tcp))
            }

            #[cfg(feature = "tun")]
            ts_control::TransportMode::Tun(tun_cfg) => {
                // Reuse the same `netstack_up`/`netstack_down` overlay-transport pair that would
                // have fed the netstack — it is just the application-side overlay seam (the name
                // is historical). No NetstackActor / MagicDnsActor is spawned.
                tun_actor::TunActor::spawn((
                    env.clone(),
                    tun_cfg.clone(),
                    netstack_up,
                    netstack_down,
                    // Host-route gating inputs derived from `Env`: subnet routes are only steered
                    // into the TUN when `--accept-routes` is set, and the host `/0` only when the
                    // embedder configured an exit node. See `tun_actor::host_routes_from_node`.
                    tun_actor::HostRouteGating {
                        accept_routes: env.accept_routes,
                        exit_node_configured: env.exit_node().is_some(),
                    },
                    // Reuse the forwarder netstack's overlay `Channel` for recursive / exit-node-DoH
                    // MagicDNS forwarding in the TUN datapath (TUN mode has no application netstack
                    // Channel of its own). Egresses over the overlay — anti-leak preserved.
                    forwarder_channel.clone(),
                ));

                (None, None)
            }

            #[cfg(not(feature = "tun"))]
            ts_control::TransportMode::Tun(_) => {
                return Err(Error {
                    kind: ErrorKind::TunUnavailable,
                    target_actor: None,
                    message_ty: None,
                });
            }
        };

        // Device connection-state cell. Created here (not inside the actor) so the control runner's
        // `on_start` can publish `Failed`/`NeedsLogin` and still return `Err` without the sender
        // being tied to a `Self` that never gets constructed on a hard registration failure.
        let (state_tx, state_rx) = watch::channel(DeviceState::Connecting);

        // `config` and `auth_key` are moved into the control runner here; nothing below uses them.
        let control = ControlRunner::spawn(control_runner::Params {
            config,
            auth_key,
            env: env.clone(),
            state_tx,
        });

        Ok(Self {
            control,
            dataplane,
            direct,
            peer_tracker,
            fallback_tcp,
            netstack,
            env,
            shutdown: shutdown_tx,
            exit_node_tx,
            active_exit_rx,
            state_rx,
        })
    }
273
274 /// Register a fallback TCP handler consulted for every inbound TCP flow that matches no
275 /// explicit listener (`tsnet.Server.RegisterFallbackTCPHandler` parity).
276 ///
277 /// The returned [`fallback_tcp::FallbackTcpHandle`] deregisters the handler when dropped. See
278 /// [`fallback_tcp`] for the dispatch contract and anti-leak guarantees.
279 ///
280 /// Returns [`ErrorKind::UnsupportedInTunMode`] in TUN transport mode, where there is no
281 /// application netstack to attach a fallback handler to.
282 pub fn register_fallback_tcp_handler(
283 &self,
284 cb: Arc<
285 dyn Fn(core::net::SocketAddr, core::net::SocketAddr) -> fallback_tcp::FallbackDecision
286 + Send
287 + Sync,
288 >,
289 ) -> Result<fallback_tcp::FallbackTcpHandle, Error> {
290 Ok(self
291 .fallback_tcp
292 .as_ref()
293 .ok_or(Error {
294 kind: ErrorKind::UnsupportedInTunMode,
295 target_actor: None,
296 message_ty: None,
297 })?
298 .register(cb))
299 }
300
301 /// Get a channel to send commands to the netstack.
302 ///
303 /// Returns [`ErrorKind::UnsupportedInTunMode`] in TUN transport mode, where there is no
304 /// application netstack.
305 pub async fn channel(&self) -> Result<Channel, Error> {
306 let (channel,) = self
307 .netstack
308 .as_ref()
309 .ok_or(Error {
310 kind: ErrorKind::UnsupportedInTunMode,
311 target_actor: None,
312 message_ty: None,
313 })?
314 .upgrade()
315 .ok_or(Error {
316 kind: ErrorKind::ActorGone,
317 target_actor: None,
318 message_ty: None,
319 })?
320 .ask(netstack_actor::GetChannel)
321 .await?;
322
323 Ok(channel)
324 }
325
326 /// The Taildrop file store, if Taildrop is enabled (`taildrop_dir` configured and the store
327 /// initialized). `None` when disabled — fail-closed. Shared with the peerAPI Taildrop server so
328 /// the embedder's read APIs and the receive path see the same on-disk store.
329 pub fn taildrop_store(&self) -> Option<Arc<crate::taildrop::TaildropStore>> {
330 self.env.taildrop_store.clone()
331 }
332
333 /// The shared Funnel ingress slot the peerAPI `/v0/ingress` route reads per connection.
334 ///
335 /// `Device::listen_funnel` installs a [`FunnelManager`](crate::funnel::FunnelManager)'s sink here
336 /// to make the route live (the peerAPI server is already running from startup). Returns a clone of
337 /// the runtime-lifetime `Arc` so the device can write the slot without restarting the server. See
338 /// [`crate::funnel`] for the ingress data path.
339 pub fn funnel_ingress_slot(&self) -> crate::funnel::FunnelIngressSlot {
340 self.env.funnel_ingress.clone()
341 }
342
343 /// The shared "Funnel ingress listener active" flag (the same `Arc` the control session reads to
344 /// set `HostInfo.IngressEnabled`). `Device::listen_funnel` flips it `true` while a funnel listener
345 /// is up so control routes Funnel traffic to this node; clearing it advertises no live endpoint.
346 pub fn ingress_active_flag(&self) -> std::sync::Arc<std::sync::atomic::AtomicBool> {
347 self.env.ingress_active.clone()
348 }
349
350 /// Install (`Some`) or clear (`None`) the debug packet-capture hook on the running dataplane.
351 /// `Some(hook)` tees every plaintext packet crossing the datapath to `hook` until it is cleared;
352 /// `None` stops capture. Mirrors Go `tstun.Wrapper.InstallCaptureHook` / `ClearCaptureSink`.
353 pub async fn install_capture(
354 &self,
355 hook: Option<ts_dataplane::CaptureHook>,
356 ) -> Result<(), Error> {
357 self.dataplane
358 .ask(dataplane::InstallCapture { hook })
359 .await
360 .map_err(Into::into)
361 }
362
363 /// Re-bind the underlay UDP socket after a network/link change (Wi-Fi switch, sleep/wake). The
364 /// embedder's own link monitor calls this (the engine owns the socket re-bind; the embedder owns
365 /// OS netmon). Re-binds the socket (same-port-preferred, IPv4-only invariant preserved) and
366 /// resets the now-stale local NAT mapping — clearing learned reflexive addresses and every
367 /// confirmed direct path while keeping candidate endpoints, so peers re-probe over the new socket
368 /// and relay over DERP (never a direct host dial) until a path re-confirms. Peers, control, the
369 /// netmap, disco state, and DERP are untouched. A no-op when the underlay is inert (bind failed
370 /// at startup, DERP-only). Mirrors Go magicsock `Conn.Rebind` + `resetEndpointStates`.
371 pub async fn rebind(&self) -> Result<(), Error> {
372 self.direct.ask(direct::Rebind).await.map_err(Error::from)
373 }
374
375 /// A snapshot of the local netmap: this node plus every known peer.
376 ///
377 /// Combines the self node held by the control runner with the peer set held by the peer
378 /// tracker. Mirrors tsnet's `LocalClient::Status`.
379 ///
380 /// `self_node` is `None` until the first netmap update has been received from control. Peer
381 /// entries carry no online/user/capability data (see the [`status`] module docs for that gap).
382 pub async fn status(&self) -> Result<Status, Error> {
383 let self_node_domain = self.control.ask(control_runner::SelfNode).await?;
384 // The MagicDNS suffix is the self node's FQDN minus its host label — already split into
385 // `Node.tailnet` at decode time (Go derives it the same way in `NetworkMap.MagicDNSSuffix`).
386 // Capture it before the domain `Node` is mapped away into a `StatusNode`.
387 let magic_dns_suffix = self_node_domain.as_ref().and_then(|n| n.tailnet.clone());
388 let self_node = self_node_domain.as_ref().map(StatusNode::from_node);
389
390 let peers = self
391 .peer_tracker
392 .upgrade()
393 .ok_or(Error {
394 kind: ErrorKind::ActorGone,
395 target_actor: None,
396 message_ty: None,
397 })?
398 .ask(peer_tracker::GetStatus)
399 .await?;
400
401 Ok(Status {
402 self_node,
403 peers,
404 active_exit_node: self.active_exit_node(),
405 magic_dns_suffix,
406 })
407 }
408
409 /// List the tailnet peers this node can Taildrop a file *to* (Go LocalAPI `FileTargets`).
410 ///
411 /// Mirrors the upstream send-path filter (`feature/taildrop` `Extension::FileTargets`): a peer
412 /// qualifies when it advertises a reachable peerAPI **and** is either owned by the same user as
413 /// this node **or** explicitly granted the file-sharing-target capability. The whole list is
414 /// gated on this node holding the file-sharing capability (control sets it when the admin enables
415 /// Taildrop) — absent that, an empty list (fail-closed, not an error, matching how the receive
416 /// store returns empty when disabled). Results are sorted by the peer's MagicDNS name.
417 ///
418 /// Targets are listed regardless of current online state (upstream's `FileTargets` does not gate
419 /// on online either; an offline target's send will simply time out). The self node is never
420 /// included. Returns empty before the first netmap.
421 ///
422 /// Divergence from Go: the upstream filter also excludes `tvOS` peers, which this fork cannot
423 /// reproduce (the domain node carries no OS string); the impact is negligible — the actual send
424 /// fail-closes if such a peer refused the transfer.
425 pub async fn file_targets(&self) -> Result<Vec<FileTarget>, Error> {
426 // Node-level gate: this node must hold the file-sharing capability (Taildrop enabled by the
427 // admin). Read it off the self node's cap map, like Go's `hasCapFileSharing()`.
428 let self_node = self.control.ask(control_runner::SelfNode).await?;
429 let Some(self_node) = self_node else {
430 return Ok(Vec::new()); // no netmap yet
431 };
432 if !self_node.can_share_files() {
433 return Ok(Vec::new()); // Taildrop not enabled for the tailnet — fail-closed
434 }
435 let self_user_id = self_node.user_id;
436
437 let peers = self
438 .peer_tracker
439 .upgrade()
440 .ok_or(Error {
441 kind: ErrorKind::ActorGone,
442 target_actor: None,
443 message_ty: None,
444 })?
445 .ask(peer_tracker::AllPeers)
446 .await?;
447
448 // Eligibility + ordering live in `build_file_targets` (pure, unit-tested in `status`).
449 Ok(status::build_file_targets(peers, self_user_id))
450 }
451
452 /// The stable id of the exit node traffic is currently egressing through, or `None` if none is
453 /// engaged. This is the route updater's resolved + fail-closed answer (see
454 /// [`Status::active_exit_node`](crate::status::Status::active_exit_node)): it differs from the
455 /// configured [`exit_node`](Self::exit_node) selector, which may name a peer that is absent or
456 /// no longer advertising a default route (in which case egress is dropped and this returns
457 /// `None`).
458 pub fn active_exit_node(&self) -> Option<ts_control::StableNodeId> {
459 self.active_exit_rx.borrow().clone()
460 }
461
462 /// Request an OIDC ID token from control scoped to `audience` (workload-identity federation).
463 ///
464 /// Returns the signed JWT, or the token RPC's own [`ts_control::IdTokenError`]. The kameo
465 /// delegated-reply send error is flattened: a handler error carries the real `IdTokenError`,
466 /// any other send failure (actor shutdown / mailbox closed) is surfaced as
467 /// [`ts_control::IdTokenError::NetworkError`].
468 pub async fn fetch_id_token(
469 &self,
470 audience: String,
471 ) -> Result<String, ts_control::IdTokenError> {
472 self.control
473 .ask(control_runner::FetchIdToken { audience })
474 .await
475 .map_err(flatten_send_err)
476 }
477
478 /// Log this node out of the tailnet: deregister it by expiring its current node key.
479 ///
480 /// Forwards to the control runner, which re-POSTs `/machine/register` with a past expiry over a
481 /// fresh Noise channel. This is a control-plane state change only — it does NOT shut the runtime
482 /// down (the caller follows with [`graceful_shutdown`](Self::graceful_shutdown)) and does not
483 /// touch the on-disk node key. The kameo delegated-reply send error is flattened the same way as
484 /// [`fetch_id_token`](Self::fetch_id_token): a handler error carries the real
485 /// [`ts_control::LogoutError`]; any other send failure (actor shutdown / mailbox closed) is
486 /// surfaced as [`ts_control::LogoutError::NetworkError`].
487 pub async fn logout(&self) -> Result<(), ts_control::LogoutError> {
488 self.control
489 .ask(control_runner::Logout)
490 .await
491 .map_err(flatten_logout_send_err)
492 }
493
494 /// Publish a `TXT` DNS record for this node via control's `/machine/set-dns` (Go
495 /// `LocalClient.SetDNS`).
496 ///
497 /// Forwards to the control runner, which POSTs the record over a fresh Noise channel. The kameo
498 /// delegated-reply send error is flattened the same way as [`fetch_id_token`](Self::fetch_id_token):
499 /// a handler error carries the real [`ts_control::SetDnsError`]; any other send failure (actor
500 /// shutdown / mailbox closed) is surfaced as [`ts_control::SetDnsError::NetworkError`].
501 pub async fn set_dns(
502 &self,
503 name: String,
504 value: String,
505 ) -> Result<(), ts_control::SetDnsError> {
506 self.control
507 .ask(control_runner::SetDns { name, value })
508 .await
509 .map_err(flatten_set_dns_send_err)
510 }
511
512 /// Issue a real Let's Encrypt certificate for this node's MagicDNS `name` (`acme` feature).
513 ///
514 /// Mirrors [`fetch_id_token`](Self::fetch_id_token): forwards to the control runner, which runs
515 /// the client-side ACME DNS-01 flow on a spawned task and publishes the challenge TXT via the
516 /// node's set-dns RPC. The kameo delegated-reply send error is flattened — a handler error
517 /// carries the real [`ts_control::CertError`]; any other send failure (actor shutdown / mailbox
518 /// closed) is surfaced as a [`ts_control::CertError::Io`]. SaaS-only: a self-hosted control
519 /// plane 501s on set-dns.
520 #[cfg(feature = "acme")]
521 pub async fn get_certificate(
522 &self,
523 name: String,
524 ) -> Result<ts_control::tls::CertifiedKey, ts_control::CertError> {
525 self.control
526 .ask(control_runner::GetCertificate { name })
527 .await
528 .map_err(flatten_cert_send_err)
529 }
530
531 /// Resolve which node owns a tailnet source address.
532 ///
533 /// Maps the source IP of `addr` to its owning node. Mirrors tsnet's `LocalClient::WhoIs`.
534 /// Returns `None` if no peer holds that tailnet IP. The returned [`WhoIs`] carries no
535 /// user/login or capability data in this fork (see the [`status`] module docs).
536 pub async fn whois(&self, addr: core::net::SocketAddr) -> Result<Option<WhoIs>, Error> {
537 self.peer_tracker
538 .upgrade()
539 .ok_or(Error {
540 kind: ErrorKind::ActorGone,
541 target_actor: None,
542 message_ty: None,
543 })?
544 .ask(peer_tracker::Whois { addr })
545 .await
546 .map_err(Into::into)
547 }
548
549 /// Change the selected exit node at runtime (the equivalent of Go `tsnet`'s
550 /// `LocalClient.EditPrefs(ExitNodeID/ExitNodeIP)`), without recreating the device.
551 ///
552 /// Updates the live exit-node selector, then asks the peer tracker to re-broadcast the current
553 /// peer set so the route updater and source filter re-resolve the new selector immediately.
554 /// `None` clears the exit node (internet-bound traffic is then dropped, fail-closed, unless this
555 /// node egresses directly). The selection is re-resolved against the live peer set, so passing a
556 /// selector for a peer not yet in the netmap simply takes effect once that peer appears.
557 pub async fn set_exit_node(
558 &self,
559 selector: Option<ts_control::ExitNodeSelector>,
560 ) -> Result<(), Error> {
561 // Update the live cell every reader borrows from. `send_replace` keeps the value current
562 // even with no active receivers (none can have dropped while the runtime is up, but it is
563 // the right non-failing primitive here).
564 self.exit_node_tx.send_replace(selector);
565
566 // Trigger an immediate re-resolution: the route updater (outbound routes + DoH delegation)
567 // and the source filter (inbound validation) both recompute on an `Arc<PeerState>`, so a
568 // re-broadcast applies the new exit without waiting for the next netmap update.
569 self.peer_tracker
570 .upgrade()
571 .ok_or(Error {
572 kind: ErrorKind::ActorGone,
573 target_actor: None,
574 message_ty: None,
575 })?
576 .ask(peer_tracker::RepublishState)
577 .await
578 .map_err(Into::into)
579 }
580
581 /// The currently-selected exit node, or `None` if none is selected.
582 pub fn exit_node(&self) -> Option<ts_control::ExitNodeSelector> {
583 self.env.exit_node()
584 }
585
586 /// Subscribe to netmap peer-change events.
587 ///
588 /// Returns a [`watch::Receiver`] whose value is the current set of peer [`StatusNode`]s,
589 /// updated on every netmap state update from control. Mirrors tsnet's `WatchIPNBus`. Await
590 /// [`watch::Receiver::changed`](tokio::sync::watch::Receiver::changed) to react to peers
591 /// joining, leaving, or changing.
592 pub async fn watch_netmap(&self) -> Result<watch::Receiver<Vec<StatusNode>>, Error> {
593 self.peer_tracker
594 .upgrade()
595 .ok_or(Error {
596 kind: ErrorKind::ActorGone,
597 target_actor: None,
598 message_ty: None,
599 })?
600 .ask(peer_tracker::WatchNetmap)
601 .await
602 .map_err(Into::into)
603 }
604
605 /// The current device connection-[`DeviceState`].
606 pub fn device_state(&self) -> DeviceState {
607 self.state_rx.borrow().clone()
608 }
609
610 /// Watch the device connection-[`DeviceState`] (`Connecting` → `Running` / `NeedsLogin` /
611 /// `Expired` / `Failed`).
612 ///
613 /// Returns a [`watch::Receiver`]; await
614 /// [`changed`](tokio::sync::watch::Receiver::changed) to react push-style to control connection
615 /// transitions instead of polling [`status`](Self::status). The initial value is the current
616 /// state. Note: a transient per-reconnect dip back to `Connecting` is **not** currently
617 /// emitted (control transparently reconnects below this layer); the state reflects registration
618 /// outcome and node-key expiry.
619 pub fn watch_state(&self) -> watch::Receiver<DeviceState> {
620 self.state_rx.clone()
621 }
622
623 /// Wait until the device finishes registering, returning a typed outcome.
624 ///
625 /// Resolves `Ok(())` once the device reaches [`DeviceState::Running`]. Returns a typed
626 /// [`RegistrationError`] otherwise — the actionable distinction between "retry", "re-pair", and
627 /// "drive interactive login" that replaces polling [`ipv4_addr`](Self::ipv4_addr) in a loop:
628 /// - `AuthRejected` — bad/expired/unknown auth key. **Permanent** (re-pair).
629 /// - `NeedsLogin(url)` — interactive authorization required (no usable auth key). **Not
630 /// permanent**: the runtime keeps retrying and will reach `Running` once the user authorizes
631 /// the URL. An **auth-key** caller should treat this as a failure; an **interactive** caller
632 /// should ignore this return and instead drive the flow via [`watch_state`](Self::watch_state)
633 /// (this method returns the URL eagerly rather than blocking for the whole login).
634 /// - `NetworkUnreachable` — control unreachable. **Transient** (retry).
635 /// - `Timeout` — no settled state within `timeout`.
636 ///
637 /// `KeyExpired` is not produced by this initial wait (a node key expires only *after* it has
638 /// come up); observe post-registration expiry via [`watch_state`](Self::watch_state).
639 /// `timeout` of `None` waits indefinitely for a settled state.
640 pub async fn wait_until_running(
641 &self,
642 timeout: Option<Duration>,
643 ) -> Result<(), RegistrationError> {
644 device_state::wait_for_running(self.state_rx.clone(), timeout).await
645 }
646
647 /// Attempt to shut down the runtime gracefully.
648 ///
649 /// Returns false if the shutdown timed out. It is still shut down if it timed out, just
650 /// more violently and with possible resource leaks.
651 pub async fn graceful_shutdown(self, timeout: Option<Duration>) -> bool {
652 self.shutdown.send_replace(true);
653
654 async fn _shutdown_all(runtime: Runtime) {
655 // See the note in `Drop` for why we only need to stop these actors to bring down the
656 // whole runtime.
657
658 let _ignore = runtime.control.stop_gracefully().await;
659 let _ignore = runtime.dataplane.stop_gracefully().await;
660 let _ignore = runtime.env.bus.stop_gracefully().await;
661
662 tokio::join![
663 runtime.control.wait_for_shutdown(),
664 runtime.dataplane.wait_for_shutdown(),
665 runtime.env.bus.wait_for_shutdown(),
666 ];
667 }
668
669 let fut = _shutdown_all(self);
670
671 match timeout {
672 Some(timeout) => tokio::time::timeout(timeout, fut).await.is_ok(),
673 None => {
674 fut.await;
675 true
676 }
677 }
678 }
679}
680
681impl Drop for Runtime {
682 fn drop(&mut self) {
683 // We must have already run `graceful_shutdown`: on the happy path, this does nothing, but
684 // if it timed out, we need to make sure the actors are dead so we don't leak them and their
685 // dependents.
686 if *self.shutdown.borrow() {
687 self.control.kill();
688 self.dataplane.kill();
689 self.env.bus.kill();
690 return;
691 }
692
693 self.shutdown.send_replace(true);
694
695 // Actors shut down when the last ActorRef to them is dropped (as nothing can send them
696 // messages anymore). If we don't hold an ActorRef in Runtime, in general the only thing
697 // that has one is the MessageBus, which each actor subscribes to for a subset of messages.
698 // Hence, if we shut down the bus, most actors die as well.
699
700 // First shut down the actors we have an ActorRef to:
701 try_shutdown(&self.control);
702 try_shutdown(&self.dataplane);
703
704 // Then shutdown the message bus, stopping the rest of the actors:
705 try_shutdown(&self.env.bus);
706 }
707}
708
709fn try_shutdown(a: &ActorRef<impl kameo::Actor>) {
710 if let Err(e) = a.mailbox_sender().try_send(Signal::Stop) {
711 tracing::error!(error = %e, "graceful shutdown failed, killing actor");
712 a.kill();
713 }
714}
715
716/// Build the netstack config shared by both userspace netstacks (application + forwarder) from the
717/// per-deployment `tcp_buffer_size` knob.
718///
719/// `None` keeps the netstack default (256 KiB/direction); `Some(n)` overrides it (e.g. a smaller
720/// window on a memory-constrained exit node forwarding many concurrent flows — see
721/// [`netstack::netcore::Config::tcp_buffer_size`]). Factored out of [`Runtime::spawn`] so the
722/// None-default / Some-override mapping is unit-testable without standing up the actor system.
723fn netstack_config_from(tcp_buffer_size: Option<usize>) -> netstack::netcore::Config {
724 let mut c = netstack::netcore::Config::default();
725 if let Some(tcp_buffer_size) = tcp_buffer_size {
726 c.tcp_buffer_size = tcp_buffer_size;
727 }
728 c
729}
730
731/// Flatten a kameo delegated-reply [`SendError`] for the id-token RPC into the RPC's own
732/// [`ts_control::IdTokenError`].
733///
734/// A [`SendError::HandlerError`](kameo::error::SendError::HandlerError) carries the real
735/// `IdTokenError` produced by the handler and is surfaced verbatim. Any other send failure (actor
736/// not running / stopped, mailbox full, send timeout) is a delivery problem rather than an RPC
737/// result, so it collapses to a transient [`ts_control::IdTokenError::NetworkError`]. Factored out
738/// of [`Runtime::fetch_id_token`] so this mapping is unit-testable without standing up an actor.
739fn flatten_send_err<M>(
740 e: kameo::error::SendError<M, ts_control::IdTokenError>,
741) -> ts_control::IdTokenError {
742 match e {
743 kameo::error::SendError::HandlerError(err) => err,
744 _ => ts_control::IdTokenError::NetworkError,
745 }
746}
747
748/// Flatten a kameo `SendError` from the `Logout` ask into a [`ts_control::LogoutError`].
749///
750/// A `HandlerError` carries the real `LogoutError` from the control RPC and is surfaced verbatim;
751/// any other send failure (actor not running / stopped, mailbox full, send timeout) — a delivery
752/// problem, not a logout result — collapses to the transient [`ts_control::LogoutError::NetworkError`]
753/// (logout is idempotent, so a retry after a delivery failure is safe). Factored out of
754/// [`Runtime::logout`] so the mapping is unit-testable without standing up an actor.
755fn flatten_logout_send_err<M>(
756 e: kameo::error::SendError<M, ts_control::LogoutError>,
757) -> ts_control::LogoutError {
758 match e {
759 kameo::error::SendError::HandlerError(err) => err,
760 _ => ts_control::LogoutError::NetworkError,
761 }
762}
763
764/// Flatten a kameo `SendError` from the `SetDns` ask into a [`ts_control::SetDnsError`].
765///
766/// A `HandlerError` carries the real `SetDnsError` from the set-dns RPC and is surfaced verbatim;
767/// any other send failure (actor not running / stopped, mailbox full, send timeout) — a delivery
768/// problem, not a publish result — collapses to the transient
769/// [`ts_control::SetDnsError::NetworkError`]. Factored out of [`Runtime::set_dns`] so the mapping is
770/// unit-testable without standing up an actor.
771fn flatten_set_dns_send_err<M>(
772 e: kameo::error::SendError<M, ts_control::SetDnsError>,
773) -> ts_control::SetDnsError {
774 match e {
775 kameo::error::SendError::HandlerError(err) => err,
776 _ => ts_control::SetDnsError::NetworkError,
777 }
778}
779
/// Collapse a kameo `SendError` from the `GetCertificate` ask into a [`ts_control::CertError`].
///
/// - `HandlerError` holds the `CertError` produced by the ACME issuance; it is returned
///   unchanged.
/// - Every other variant (actor not running / stopped, mailbox full, send timeout) describes a
///   delivery problem rather than an issuance result. `CertError` has no transient-network
///   variant, so these become a [`ts_control::CertError::Io`] wrapping a descriptive
///   [`std::io::Error`].
///
/// Kept separate from [`Runtime::get_certificate`] so the mapping can be unit-tested without
/// standing up an actor.
#[cfg(feature = "acme")]
fn flatten_cert_send_err<M>(
    e: kameo::error::SendError<M, ts_control::CertError>,
) -> ts_control::CertError {
    let kameo::error::SendError::HandlerError(issuance_err) = e else {
        // Delivery failure: report it as an I/O error since no network variant exists.
        let unavailable =
            std::io::Error::other("control runner unavailable for certificate issuance");
        return ts_control::CertError::Io(unavailable);
    };
    issuance_err
}
798
#[cfg(test)]
mod tests {
    use kameo::error::SendError;

    use super::*;

    /// Send-error shape handed to [`flatten_send_err`] by the `FetchIdToken` ask.
    type IdTokenSendErr = SendError<control_runner::FetchIdToken, ts_control::IdTokenError>;
    /// Send-error shape handed to [`flatten_logout_send_err`] by the `Logout` ask.
    type LogoutSendErr = SendError<control_runner::Logout, ts_control::LogoutError>;
    /// Send-error shape handed to [`flatten_set_dns_send_err`] by the `SetDns` ask.
    type SetDnsSendErr = SendError<control_runner::SetDns, ts_control::SetDnsError>;

    /// With `None`, the built config must carry exactly the netstack's own default TCP window
    /// (the 256 KiB throughput default) rather than some other silently substituted value.
    #[test]
    fn netstack_config_none_uses_netstack_default() {
        let netstack_default = netstack::netcore::Config::default();
        let from_none = netstack_config_from(None);

        assert_eq!(
            from_none.tcp_buffer_size, netstack_default.tcp_buffer_size,
            "None must inherit the netstack default TCP buffer size"
        );
    }

    /// With `Some(n)`, the TCP window (the memory-vs-throughput knob exit-node operators reach
    /// for) must be `n` in the config that both netstacks are built from.
    #[test]
    fn netstack_config_some_overrides_buffer() {
        let from_override = netstack_config_from(Some(64 * 1024));

        assert_eq!(
            from_override.tcp_buffer_size,
            64 * 1024,
            "Some(n) must override the TCP buffer size that both netstacks use"
        );
    }

    /// The `IdTokenError` inside a `HandlerError` comes from the RPC handler and must come back
    /// unchanged. The payload is an `Internal(_)` value, which differs from the `NetworkError`
    /// fallback, so a flatten that always returned `NetworkError` would fail here.
    #[test]
    fn flatten_send_err_handler_error_passes_through() {
        // The `Internal(_)` payload is obtained through the public `From<Utf8Error>` conversion,
        // which needs no extra dependencies.
        // The invalid bytes live in a runtime Vec because the `invalid_from_utf8` lint only
        // fires on compile-time-known literals; this bad input is intentional.
        let invalid_utf8 = vec![0xffu8, 0xfe];
        let decode_err = core::str::from_utf8(&invalid_utf8).unwrap_err();
        let handler_err = ts_control::IdTokenError::from(decode_err);
        assert!(matches!(handler_err, ts_control::IdTokenError::Internal(_)));

        let send_err: IdTokenSendErr = SendError::HandlerError(handler_err.clone());
        assert_eq!(flatten_send_err(send_err), handler_err);
    }

    /// An actor-stopped failure is a delivery problem, not an RPC result, so it must become the
    /// transient `NetworkError`.
    #[test]
    fn flatten_send_err_actor_stopped_is_network_error() {
        let send_err: IdTokenSendErr = SendError::ActorStopped;

        assert_eq!(
            flatten_send_err(send_err),
            ts_control::IdTokenError::NetworkError
        );
    }

    /// `ActorNotRunning` hands the undelivered message back; it is a delivery failure as well
    /// and must become the transient `NetworkError`.
    #[test]
    fn flatten_send_err_actor_not_running_is_network_error() {
        let bounced = control_runner::FetchIdToken {
            audience: String::from("sts.amazonaws.com"),
        };
        let send_err: IdTokenSendErr = SendError::ActorNotRunning(bounced);

        assert_eq!(
            flatten_send_err(send_err),
            ts_control::IdTokenError::NetworkError
        );
    }

    /// The `LogoutError` inside a `HandlerError` comes from the logout RPC and must come back
    /// unchanged. The `Internal(_)` payload differs from the `NetworkError` fallback, which makes
    /// the passthrough observable.
    #[test]
    fn flatten_logout_send_err_handler_error_passes_through() {
        let handler_err =
            ts_control::LogoutError::Internal(ts_control::LogoutInternalErrorKind::Http);
        assert!(matches!(handler_err, ts_control::LogoutError::Internal(_)));

        let send_err: LogoutSendErr = SendError::HandlerError(handler_err.clone());
        assert_eq!(flatten_logout_send_err(send_err), handler_err);
    }

    /// An actor-stopped failure is a delivery problem, not a logout result, so it becomes the
    /// transient `NetworkError` (logout is idempotent, so a retry is safe).
    #[test]
    fn flatten_logout_send_err_actor_stopped_is_network_error() {
        let send_err: LogoutSendErr = SendError::ActorStopped;

        assert_eq!(
            flatten_logout_send_err(send_err),
            ts_control::LogoutError::NetworkError
        );
    }

    /// The `SetDnsError` inside a `HandlerError` comes from the set-dns RPC and must come back
    /// unchanged. The `Internal(_)` payload differs from the `NetworkError` fallback, which makes
    /// the passthrough observable.
    #[test]
    fn flatten_set_dns_send_err_handler_error_passes_through() {
        let handler_err =
            ts_control::SetDnsError::Internal(ts_control::SetDnsInternalErrorKind::Http);
        assert!(matches!(handler_err, ts_control::SetDnsError::Internal(_)));

        let send_err: SetDnsSendErr = SendError::HandlerError(handler_err.clone());
        assert_eq!(flatten_set_dns_send_err(send_err), handler_err);
    }

    /// An actor-stopped failure is a delivery problem, not a publish result, so it becomes the
    /// transient `NetworkError`.
    #[test]
    fn flatten_set_dns_send_err_actor_stopped_is_network_error() {
        let send_err: SetDnsSendErr = SendError::ActorStopped;

        assert_eq!(
            flatten_set_dns_send_err(send_err),
            ts_control::SetDnsError::NetworkError
        );
    }
}