ts_runtime/device_state.rs
1//! Device connection-state tracking: a push-style view of where a [`Runtime`](crate::Runtime) is in
2//! its control-plane lifecycle, plus a typed registration outcome.
3//!
4//! Mirrors the part of Go `tsnet`/`ipn`'s state machine an embedder actually reacts to: is the node
5//! still coming up, running, waiting for interactive login, expired, or did registration hard-fail?
6//! The [`ControlRunner`](crate::control_runner::ControlRunner) publishes transitions into a
7//! `watch` cell so an embedder can `await` them ([`Runtime::watch_state`](crate::Runtime::watch_state))
8//! instead of polling [`status`](crate::Runtime::status), and
9//! [`Runtime::wait_until_running`](crate::Runtime::wait_until_running) is a one-shot convenience
10//! built on the same cell.
11
12/// The control-plane lifecycle state of a device.
13///
14/// Published by the control runner as it brings the node up and maintains the netmap stream. A
15/// consumer watches this to drive UI ("connecting…", "needs login", "expired") and to distinguish a
16/// permanent failure from a transient one without inspecting logs.
17///
18/// `#[non_exhaustive]`: more lifecycle states may be added (as `Reauthenticating` was), so an
19/// embedder matching on this must include a wildcard arm and treat an unknown state conservatively
20/// (e.g. as "still coming up") rather than failing to compile on a fork upgrade. (Same-crate matches
21/// are unaffected — `#[non_exhaustive]` only forces the wildcard on out-of-crate consumers.)
22#[derive(Debug, Clone, PartialEq, Eq)]
23#[non_exhaustive]
24pub enum DeviceState {
25 /// The runtime has spawned and is registering / establishing the control session. The initial
26 /// state of every device.
27 Connecting,
28 /// Registered and the netmap stream is live — the node is up.
29 Running,
30 /// Control requires interactive authentication (no usable auth key): the node is waiting for a
31 /// human to authorize it at the carried URL. Transient — registration retries until authorized.
32 NeedsLogin(url::Url),
33 /// The node is registered with a valid key but awaiting **admin approval** on an approval-gated
34 /// tailnet, and control offered **no** interactive auth URL (so, unlike
35 /// [`NeedsLogin`](Self::NeedsLogin), there is nothing for a human to open — an admin must approve
36 /// the node out of band). **Transient** — treated like [`Connecting`](Self::Connecting) by the
37 /// waiters ([`wait_until_running`](crate::Runtime::wait_until_running) keeps waiting, never
38 /// settling on it); the runtime polls registration and, once an admin approves, auto-transitions
39 /// to [`Running`](Self::Running) with no re-registration (Go's `ipn.State::NeedsMachineAuth` →
40 /// `Starting`). No `browse_to_url` is derived from it (there is no URL).
41 NeedsMachineAuth,
42 /// The node key expired and an automatic, non-interactive re-authentication is in progress: the
43 /// runtime is rotating the node key and re-registering with the stored auth key (Go `doLogin`).
44 /// **Transient** — treated like [`Connecting`](Self::Connecting) by the waiters
45 /// ([`wait_until_running`](crate::Runtime::wait_until_running) keeps waiting, never settling on
46 /// it), and the next good self-node flips the state back to [`Running`](Self::Running). No
47 /// `browse_to_url` is derived from it (the recovery is non-interactive, unlike
48 /// [`NeedsLogin`](Self::NeedsLogin)). Entered only when an auth key is retained, auto-reauth is
49 /// enabled, and Tailnet Lock enforcement is NOT active; otherwise the runtime falls through to
50 /// [`Expired`](Self::Expired). See the runtime's `expiry_action` for the decision matrix.
51 Reauthenticating,
52 /// The node key has expired (control reported the self-node's key expiry is in the past). The
53 /// node must re-authenticate to continue. Surfaced from the netmap self-node, not registration.
54 Expired,
55 /// Registration hard-failed with a permanent reason (e.g. a bad/expired/unknown auth key). The
56 /// control runner stops; this carries the typed reason. Not retried.
57 Failed(RegistrationError),
58}
59
60/// A typed registration outcome, distinguishing a **permanent** failure (don't retry — tell the
61/// user) from a **transient** one (worth retrying).
62///
63/// This is the error surfaced by [`Runtime::wait_until_running`](crate::Runtime::wait_until_running),
64/// replacing the previous "poll `ipv4_addr` until a deadline and report a generic timeout" workaround
65/// with an actionable reason.
66#[derive(Debug, thiserror::Error, Clone, PartialEq, Eq)]
67pub enum RegistrationError {
68 /// Control rejected registration with a permanent reason — typically a bad, expired, or unknown
69 /// auth key. The string is control's verbatim reason. **Permanent**: re-pairing (a new auth
70 /// key) is required; retrying with the same key will not succeed.
71 #[error("authentication rejected by control: {0}")]
72 AuthRejected(String),
73
74 /// The node key has expired. **Permanent** until re-authentication.
75 #[error("node key expired; re-authentication required")]
76 KeyExpired,
77
78 /// Interactive authorization is required: control offered an auth URL (no usable auth key).
79 /// **Actionable but not permanent** — direct the user to the URL; the runtime keeps retrying
80 /// registration and will reach `Running` once the user authorizes (so this is *not*
81 /// [`is_permanent`](Self::is_permanent)). A caller using an auth key should not hit this; a
82 /// caller doing interactive auth should drive it via
83 /// [`watch_state`](crate::Runtime::watch_state) rather than treating this as a hard failure.
84 #[error("interactive login required at {0}")]
85 NeedsLogin(url::Url),
86
87 /// The control plane was unreachable (network/transport error). **Transient**: retrying later
88 /// may succeed.
89 #[error("control plane unreachable")]
90 NetworkUnreachable,
91
92 /// No settled state was reached before the caller's timeout elapsed. **Indeterminate**:
93 /// registration may still be in flight (e.g. slow control plane); the caller may retry the wait.
94 #[error("timed out waiting for the device to finish registering")]
95 Timeout,
96}
97
98impl RegistrationError {
99 /// Whether this outcome is **permanent** — re-pairing / new credentials are required and
100 /// retrying as-is will not succeed (`AuthRejected`, `KeyExpired`). Everything else is not
101 /// permanent: `NetworkUnreachable`/`Timeout` are transient (retry may succeed), and `NeedsLogin`
102 /// is actionable-but-recoverable (the runtime keeps retrying and reaches `Running` once the user
103 /// authorizes the offered URL — so it is *not* permanent).
104 pub fn is_permanent(&self) -> bool {
105 matches!(
106 self,
107 RegistrationError::AuthRejected(_) | RegistrationError::KeyExpired
108 )
109 }
110}
111
112/// Map a control-layer [`ts_control::Error`] from the registration path into a typed
113/// [`RegistrationError`]. Used by the control runner when its `check_auth` loop hard-fails.
114impl From<&ts_control::Error> for RegistrationError {
115 fn from(e: &ts_control::Error) -> Self {
116 match e {
117 ts_control::Error::MachineNotAuthorized(u) => RegistrationError::NeedsLogin(u.clone()),
118 ts_control::Error::Registration(reason) => {
119 RegistrationError::AuthRejected(reason.clone())
120 }
121 ts_control::Error::NetworkError(_) => RegistrationError::NetworkUnreachable,
122 // A 429 rate-limit is **transient** — control is asking us to wait, not rejecting us —
123 // so it must NOT become a permanent `AuthRejected`. The control runner's `check_auth`
124 // loop already intercepts `RateLimited` and sleeps the server delay before this mapping
125 // is reached; classifying it as `NetworkUnreachable` here keeps any other caller of this
126 // conversion on the correct (non-permanent, retry-may-succeed) branch.
127 ts_control::Error::RateLimited(_) => RegistrationError::NetworkUnreachable,
128 // "Awaiting admin approval, no URL" is **transient** — the node holds a valid key and the
129 // runtime polls until an admin approves, then comes up (Go `NeedsMachineAuth → Starting`),
130 // so it must NOT become a permanent `AuthRejected`. The control runner's `check_auth` and
131 // `connect` loops already intercept `NeedsMachineAuth` and poll before this mapping is
132 // reached; classifying it as `NetworkUnreachable` here keeps any other caller of this
133 // conversion on the correct (non-permanent, retry-may-succeed) branch.
134 ts_control::Error::NeedsMachineAuth => RegistrationError::NetworkUnreachable,
135 // InvalidUrl / Internal: not a transient network condition and not an auth decision —
136 // treat as a (permanent-ish) auth rejection carrying the display reason so the caller
137 // sees something actionable rather than an opaque "timeout".
138 other => RegistrationError::AuthRejected(other.to_string()),
139 }
140 }
141}
142
143/// Wait on a [`DeviceState`] `watch` channel until it settles, mapping the settled state to the
144/// typed [`wait_until_running`](crate::Runtime::wait_until_running) result.
145///
146/// Factored out of [`Runtime::wait_until_running`](crate::Runtime) so the (non-trivial) loop — the
147/// see-then-await ordering, the per-state mapping, sender-drop handling, and the timeout — is
148/// unit-testable against a plain `watch::channel` without standing up a runtime.
149pub(crate) async fn wait_for_running(
150 mut rx: tokio::sync::watch::Receiver<DeviceState>,
151 timeout: Option<core::time::Duration>,
152) -> Result<(), RegistrationError> {
153 let wait = async {
154 loop {
155 // Evaluate the current value, then await a change. `borrow_and_update` marks the current
156 // value seen so a transition isn't missed between this check and `changed()`.
157 let settled = match &*rx.borrow_and_update() {
158 DeviceState::Running => Some(Ok(())),
159 DeviceState::Failed(e) => Some(Err(e.clone())),
160 DeviceState::Expired => Some(Err(RegistrationError::KeyExpired)),
161 DeviceState::NeedsLogin(u) => Some(Err(RegistrationError::NeedsLogin(u.clone()))),
162 // Transient, like `Connecting`: keep waiting rather than settling. `Reauthenticating`
163 // — an auto-reauth is in flight and the next good self-node flips back to `Running`.
164 // `NeedsMachineAuth` — awaiting admin approval (no URL); the runtime polls and
165 // auto-transitions to `Running` once approved (Go `NeedsMachineAuth → Starting`).
166 DeviceState::Connecting
167 | DeviceState::Reauthenticating
168 | DeviceState::NeedsMachineAuth => None,
169 };
170 if let Some(result) = settled {
171 return result;
172 }
173 // Not settled yet — wait for the next transition. If the sender is dropped (runtime
174 // tearing down), treat it as unreachable rather than hanging forever.
175 if rx.changed().await.is_err() {
176 return Err(RegistrationError::NetworkUnreachable);
177 }
178 }
179 };
180
181 match timeout {
182 Some(timeout) => tokio::time::timeout(timeout, wait)
183 .await
184 .unwrap_or(Err(RegistrationError::Timeout)),
185 None => wait.await,
186 }
187}
188
#[cfg(test)]
mod tests {
    use core::time::Duration;

    use tokio::sync::watch;

    use super::*;

    /// Budget for waits that are expected to resolve well before it elapses.
    const RESOLVE_BUDGET: Duration = Duration::from_secs(1);
    /// Budget for waits that are expected to run into the timeout.
    const STALL_BUDGET: Duration = Duration::from_millis(30);
    /// How long the background task waits before publishing the follow-up state.
    const PUBLISH_DELAY: Duration = Duration::from_millis(20);

    /// The interactive-login URL shared by the `NeedsLogin` cases.
    fn login_url() -> url::Url {
        "https://login.example/x".parse().unwrap()
    }

    /// Waits on a cell that is created in `state` and never changes (the sender stays alive for
    /// the whole wait), bounded by `budget`.
    async fn wait_on_fixed(state: DeviceState, budget: Duration) -> Result<(), RegistrationError> {
        let (_tx, rx) = watch::channel(state);
        wait_for_running(rx, Some(budget)).await
    }

    /// Waits on a cell that starts in `initial` and is switched to `Running` from a spawned task
    /// after [`PUBLISH_DELAY`].
    async fn wait_through_recovery(initial: DeviceState) -> Result<(), RegistrationError> {
        let (tx, rx) = watch::channel(initial);
        tokio::spawn(async move {
            tokio::time::sleep(PUBLISH_DELAY).await;
            tx.send_replace(DeviceState::Running);
        });
        wait_for_running(rx, Some(RESOLVE_BUDGET)).await
    }

    #[test]
    fn permanence_classification() {
        // Permanent: re-pairing / new credentials required.
        let permanent = [
            RegistrationError::AuthRejected("bad key".into()),
            RegistrationError::KeyExpired,
        ];
        for err in &permanent {
            assert!(err.is_permanent());
        }

        // Not permanent: NeedsLogin recovers once the user authorizes (the runtime keeps
        // retrying); network/timeout are transient.
        let recoverable = [
            RegistrationError::NeedsLogin(login_url()),
            RegistrationError::NetworkUnreachable,
            RegistrationError::Timeout,
        ];
        for err in &recoverable {
            assert!(!err.is_permanent());
        }
    }

    #[test]
    fn maps_control_error_variants() {
        let url: url::Url = "https://login.example/a".parse().unwrap();

        let not_authorized = ts_control::Error::MachineNotAuthorized(url.clone());
        assert_eq!(
            RegistrationError::from(&not_authorized),
            RegistrationError::NeedsLogin(url)
        );

        let rejected = ts_control::Error::Registration("bad auth key".into());
        assert_eq!(
            RegistrationError::from(&rejected),
            RegistrationError::AuthRejected("bad auth key".into())
        );

        let unreachable = ts_control::Error::NetworkError(ts_control::Operation::Registration);
        assert_eq!(
            RegistrationError::from(&unreachable),
            RegistrationError::NetworkUnreachable
        );

        // A 429 rate-limit is TRANSIENT and has to map to a non-permanent state, never to the
        // `AuthRejected` catch-all (which would wrongly stop the runtime). This pins the explicit
        // arm: should a refactor drop it and let `RateLimited` fall into `other => AuthRejected`,
        // the assertion below fails.
        let rate_limited = ts_control::Error::RateLimited(Duration::from_secs(30));
        let rl = RegistrationError::from(&rate_limited);
        assert_eq!(rl, RegistrationError::NetworkUnreachable);
        assert!(
            !rl.is_permanent(),
            "a rate-limit must be a transient (non-permanent) failure"
        );

        // "Awaiting admin approval, no URL" (tsr-dvu) is TRANSIENT and has to map to a
        // non-permanent state, never to the `AuthRejected` catch-all (which would wrongly mark an
        // approval-gated node as a hard failure). Pins the explicit arm: should a refactor drop it
        // and let `NeedsMachineAuth` fall into `other => AuthRejected`, this fails.
        let awaiting_approval = ts_control::Error::NeedsMachineAuth;
        let nma = RegistrationError::from(&awaiting_approval);
        assert_eq!(nma, RegistrationError::NetworkUnreachable);
        assert!(
            !nma.is_permanent(),
            "awaiting admin approval must be a transient (non-permanent) failure"
        );
    }

    // --- wait_for_running loop ---

    /// A cell that is already `Running` yields `Ok(())` right away: the first
    /// `borrow_and_update` sees it, no transition is needed.
    #[tokio::test]
    async fn wait_resolves_when_already_running() {
        let outcome = wait_on_fixed(DeviceState::Running, RESOLVE_BUDGET).await;
        assert_eq!(outcome, Ok(()));
    }

    /// `Connecting → Running`, published from another task, is picked up (no missed wakeup) and
    /// yields `Ok(())`.
    #[tokio::test]
    async fn wait_resolves_on_transition_to_running() {
        let outcome = wait_through_recovery(DeviceState::Connecting).await;
        assert_eq!(outcome, Ok(()));
    }

    /// Every settled state other than `Running` is reported as its typed error.
    #[tokio::test]
    async fn wait_maps_each_settled_failure() {
        let cases = [
            (
                DeviceState::Failed(RegistrationError::AuthRejected("bad".into())),
                RegistrationError::AuthRejected("bad".into()),
            ),
            (DeviceState::Expired, RegistrationError::KeyExpired),
            (
                DeviceState::NeedsLogin(login_url()),
                RegistrationError::NeedsLogin(login_url()),
            ),
        ];
        for (state, expected) in cases {
            let outcome = wait_on_fixed(state, RESOLVE_BUDGET).await;
            assert_eq!(outcome, Err(expected));
        }
    }

    /// A cell that stays at `Connecting` beyond the timeout produces `Timeout`.
    #[tokio::test]
    async fn wait_times_out_while_connecting() {
        let outcome = wait_on_fixed(DeviceState::Connecting, STALL_BUDGET).await;
        assert_eq!(outcome, Err(RegistrationError::Timeout));
    }

    /// `Reauthenticating` is transient, so a waiter must NOT resolve on it. Like `Connecting`,
    /// the wait runs into the timeout instead of ending with a terminal error, because the next
    /// good self-node flips the state back to `Running`. This is the behavioral guard that an
    /// in-flight auto-reauth never shows up as a permanent failure.
    #[tokio::test]
    async fn wait_does_not_settle_on_reauthenticating() {
        let outcome = wait_on_fixed(DeviceState::Reauthenticating, STALL_BUDGET).await;
        assert_eq!(
            outcome,
            Err(RegistrationError::Timeout),
            "Reauthenticating is transient — a waiter keeps waiting, it does not settle"
        );
    }

    /// The complete auto-reauth recovery from a waiter's point of view: `Reauthenticating`
    /// (in flight) → `Running` (the next good self-node) yields `Ok(())`. Shows the transient
    /// state being observed and then recovered, never reported as a failure.
    #[tokio::test]
    async fn wait_resolves_on_reauthenticating_then_running() {
        let outcome = wait_through_recovery(DeviceState::Reauthenticating).await;
        assert_eq!(outcome, Ok(()));
    }

    /// `NeedsMachineAuth` is transient, so a waiter never settles on it. Like `Connecting`, the
    /// wait runs into the timeout instead of ending with a terminal error, because the runtime
    /// polls registration and the next good self-node flips the state back to `Running` once an
    /// admin approves. This is the behavioral guard for tsr-dvu: an approval-gated node awaiting
    /// admin approval never shows up as a permanent failure (the bug was it dying terminally
    /// instead of polling).
    #[tokio::test]
    async fn wait_does_not_settle_on_needs_machine_auth() {
        let outcome = wait_on_fixed(DeviceState::NeedsMachineAuth, STALL_BUDGET).await;
        assert_eq!(
            outcome,
            Err(RegistrationError::Timeout),
            "NeedsMachineAuth is transient — a waiter keeps waiting, it does not settle"
        );
    }

    /// The complete await-approval recovery from a waiter's point of view: `NeedsMachineAuth`
    /// (awaiting admin approval) → `Running` (the admin approved; `check_auth` returned `Ok`, the
    /// netmap self-node arrived) yields `Ok(())`. Shows the transient state being observed and
    /// then recovered without re-registration — Go's `NeedsMachineAuth → Starting → Running`
    /// auto-transition.
    #[tokio::test]
    async fn wait_resolves_on_needs_machine_auth_then_running() {
        let outcome = wait_through_recovery(DeviceState::NeedsMachineAuth).await;
        assert_eq!(outcome, Ok(()));
    }

    /// When the sender goes away while the cell is still `Connecting`, the wait finishes with
    /// `NetworkUnreachable` instead of hanging forever.
    #[tokio::test]
    async fn wait_sender_dropped_is_network_unreachable() {
        let (tx, rx) = watch::channel(DeviceState::Connecting);
        drop(tx);
        let outcome = wait_for_running(rx, Some(RESOLVE_BUDGET)).await;
        assert_eq!(outcome, Err(RegistrationError::NetworkUnreachable));
    }
}