Skip to main content

ts_runtime/
device_state.rs

1//! Device connection-state tracking: a push-style view of where a [`Runtime`](crate::Runtime) is in
2//! its control-plane lifecycle, plus a typed registration outcome.
3//!
4//! Mirrors the part of Go `tsnet`/`ipn`'s state machine an embedder actually reacts to: is the node
5//! still coming up, running, waiting for interactive login, expired, or did registration hard-fail?
6//! The [`ControlRunner`](crate::control_runner::ControlRunner) publishes transitions into a
7//! `watch` cell so an embedder can `await` them ([`Runtime::watch_state`](crate::Runtime::watch_state))
8//! instead of polling [`status`](crate::Runtime::status), and
9//! [`Runtime::wait_until_running`](crate::Runtime::wait_until_running) is a one-shot convenience
10//! built on the same cell.
11
12/// The control-plane lifecycle state of a device.
13///
14/// Published by the control runner as it brings the node up and maintains the netmap stream. A
15/// consumer watches this to drive UI ("connecting…", "needs login", "expired") and to distinguish a
16/// permanent failure from a transient one without inspecting logs.
17///
18/// `#[non_exhaustive]`: more lifecycle states may be added (as `Reauthenticating` was), so an
19/// embedder matching on this must include a wildcard arm and treat an unknown state conservatively
20/// (e.g. as "still coming up") rather than failing to compile on a fork upgrade. (Same-crate matches
21/// are unaffected — `#[non_exhaustive]` only forces the wildcard on out-of-crate consumers.)
22#[derive(Debug, Clone, PartialEq, Eq)]
23#[non_exhaustive]
24pub enum DeviceState {
25    /// The runtime has spawned and is registering / establishing the control session. The initial
26    /// state of every device.
27    Connecting,
28    /// Registered and the netmap stream is live — the node is up.
29    Running,
30    /// Control requires interactive authentication (no usable auth key): the node is waiting for a
31    /// human to authorize it at the carried URL. Transient — registration retries until authorized.
32    NeedsLogin(url::Url),
33    /// The node is registered with a valid key but awaiting **admin approval** on an approval-gated
34    /// tailnet, and control offered **no** interactive auth URL (so, unlike
35    /// [`NeedsLogin`](Self::NeedsLogin), there is nothing for a human to open — an admin must approve
36    /// the node out of band). **Transient** — treated like [`Connecting`](Self::Connecting) by the
37    /// waiters ([`wait_until_running`](crate::Runtime::wait_until_running) keeps waiting, never
38    /// settling on it); the runtime polls registration and, once an admin approves, auto-transitions
39    /// to [`Running`](Self::Running) with no re-registration (Go's `ipn.State::NeedsMachineAuth` →
40    /// `Starting`). No `browse_to_url` is derived from it (there is no URL).
41    NeedsMachineAuth,
42    /// The node key expired and an automatic, non-interactive re-authentication is in progress: the
43    /// runtime is rotating the node key and re-registering with the stored auth key (Go `doLogin`).
44    /// **Transient** — treated like [`Connecting`](Self::Connecting) by the waiters
45    /// ([`wait_until_running`](crate::Runtime::wait_until_running) keeps waiting, never settling on
46    /// it), and the next good self-node flips the state back to [`Running`](Self::Running). No
47    /// `browse_to_url` is derived from it (the recovery is non-interactive, unlike
48    /// [`NeedsLogin`](Self::NeedsLogin)). Entered only when an auth key is retained, auto-reauth is
49    /// enabled, and Tailnet Lock enforcement is NOT active; otherwise the runtime falls through to
50    /// [`Expired`](Self::Expired). See the runtime's `expiry_action` for the decision matrix.
51    Reauthenticating,
52    /// The node key has expired (control reported the self-node's key expiry is in the past). The
53    /// node must re-authenticate to continue. Surfaced from the netmap self-node, not registration.
54    Expired,
55    /// Registration hard-failed with a permanent reason (e.g. a bad/expired/unknown auth key). The
56    /// control runner stops; this carries the typed reason. Not retried.
57    Failed(RegistrationError),
58}
59
60/// A typed registration outcome, distinguishing a **permanent** failure (don't retry — tell the
61/// user) from a **transient** one (worth retrying).
62///
63/// This is the error surfaced by [`Runtime::wait_until_running`](crate::Runtime::wait_until_running),
64/// replacing the previous "poll `ipv4_addr` until a deadline and report a generic timeout" workaround
65/// with an actionable reason.
66#[derive(Debug, thiserror::Error, Clone, PartialEq, Eq)]
67pub enum RegistrationError {
68    /// Control rejected registration with a permanent reason — typically a bad, expired, or unknown
69    /// auth key. The string is control's verbatim reason. **Permanent**: re-pairing (a new auth
70    /// key) is required; retrying with the same key will not succeed.
71    #[error("authentication rejected by control: {0}")]
72    AuthRejected(String),
73
74    /// The node key has expired. **Permanent** until re-authentication.
75    #[error("node key expired; re-authentication required")]
76    KeyExpired,
77
78    /// Interactive authorization is required: control offered an auth URL (no usable auth key).
79    /// **Actionable but not permanent** — direct the user to the URL; the runtime keeps retrying
80    /// registration and will reach `Running` once the user authorizes (so this is *not*
81    /// [`is_permanent`](Self::is_permanent)). A caller using an auth key should not hit this; a
82    /// caller doing interactive auth should drive it via
83    /// [`watch_state`](crate::Runtime::watch_state) rather than treating this as a hard failure.
84    #[error("interactive login required at {0}")]
85    NeedsLogin(url::Url),
86
87    /// The control plane was unreachable (network/transport error). **Transient**: retrying later
88    /// may succeed.
89    #[error("control plane unreachable")]
90    NetworkUnreachable,
91
92    /// No settled state was reached before the caller's timeout elapsed. **Indeterminate**:
93    /// registration may still be in flight (e.g. slow control plane); the caller may retry the wait.
94    #[error("timed out waiting for the device to finish registering")]
95    Timeout,
96}
97
98impl RegistrationError {
99    /// Whether this outcome is **permanent** — re-pairing / new credentials are required and
100    /// retrying as-is will not succeed (`AuthRejected`, `KeyExpired`). Everything else is not
101    /// permanent: `NetworkUnreachable`/`Timeout` are transient (retry may succeed), and `NeedsLogin`
102    /// is actionable-but-recoverable (the runtime keeps retrying and reaches `Running` once the user
103    /// authorizes the offered URL — so it is *not* permanent).
104    pub fn is_permanent(&self) -> bool {
105        matches!(
106            self,
107            RegistrationError::AuthRejected(_) | RegistrationError::KeyExpired
108        )
109    }
110}
111
112/// Map a control-layer [`ts_control::Error`] from the registration path into a typed
113/// [`RegistrationError`]. Used by the control runner when its `check_auth` loop hard-fails.
114impl From<&ts_control::Error> for RegistrationError {
115    fn from(e: &ts_control::Error) -> Self {
116        match e {
117            ts_control::Error::MachineNotAuthorized(u) => RegistrationError::NeedsLogin(u.clone()),
118            ts_control::Error::Registration(reason) => {
119                RegistrationError::AuthRejected(reason.clone())
120            }
121            ts_control::Error::NetworkError(_) => RegistrationError::NetworkUnreachable,
122            // A 429 rate-limit is **transient** — control is asking us to wait, not rejecting us —
123            // so it must NOT become a permanent `AuthRejected`. The control runner's `check_auth`
124            // loop already intercepts `RateLimited` and sleeps the server delay before this mapping
125            // is reached; classifying it as `NetworkUnreachable` here keeps any other caller of this
126            // conversion on the correct (non-permanent, retry-may-succeed) branch.
127            ts_control::Error::RateLimited(_) => RegistrationError::NetworkUnreachable,
128            // "Awaiting admin approval, no URL" is **transient** — the node holds a valid key and the
129            // runtime polls until an admin approves, then comes up (Go `NeedsMachineAuth → Starting`),
130            // so it must NOT become a permanent `AuthRejected`. The control runner's `check_auth` and
131            // `connect` loops already intercept `NeedsMachineAuth` and poll before this mapping is
132            // reached; classifying it as `NetworkUnreachable` here keeps any other caller of this
133            // conversion on the correct (non-permanent, retry-may-succeed) branch.
134            ts_control::Error::NeedsMachineAuth => RegistrationError::NetworkUnreachable,
135            // InvalidUrl / Internal: not a transient network condition and not an auth decision —
136            // treat as a (permanent-ish) auth rejection carrying the display reason so the caller
137            // sees something actionable rather than an opaque "timeout".
138            other => RegistrationError::AuthRejected(other.to_string()),
139        }
140    }
141}
142
143/// Wait on a [`DeviceState`] `watch` channel until it settles, mapping the settled state to the
144/// typed [`wait_until_running`](crate::Runtime::wait_until_running) result.
145///
146/// Factored out of [`Runtime::wait_until_running`](crate::Runtime) so the (non-trivial) loop — the
147/// see-then-await ordering, the per-state mapping, sender-drop handling, and the timeout — is
148/// unit-testable against a plain `watch::channel` without standing up a runtime.
149pub(crate) async fn wait_for_running(
150    mut rx: tokio::sync::watch::Receiver<DeviceState>,
151    timeout: Option<core::time::Duration>,
152) -> Result<(), RegistrationError> {
153    let wait = async {
154        loop {
155            // Evaluate the current value, then await a change. `borrow_and_update` marks the current
156            // value seen so a transition isn't missed between this check and `changed()`.
157            let settled = match &*rx.borrow_and_update() {
158                DeviceState::Running => Some(Ok(())),
159                DeviceState::Failed(e) => Some(Err(e.clone())),
160                DeviceState::Expired => Some(Err(RegistrationError::KeyExpired)),
161                DeviceState::NeedsLogin(u) => Some(Err(RegistrationError::NeedsLogin(u.clone()))),
162                // Transient, like `Connecting`: keep waiting rather than settling. `Reauthenticating`
163                // — an auto-reauth is in flight and the next good self-node flips back to `Running`.
164                // `NeedsMachineAuth` — awaiting admin approval (no URL); the runtime polls and
165                // auto-transitions to `Running` once approved (Go `NeedsMachineAuth → Starting`).
166                DeviceState::Connecting
167                | DeviceState::Reauthenticating
168                | DeviceState::NeedsMachineAuth => None,
169            };
170            if let Some(result) = settled {
171                return result;
172            }
173            // Not settled yet — wait for the next transition. If the sender is dropped (runtime
174            // tearing down), treat it as unreachable rather than hanging forever.
175            if rx.changed().await.is_err() {
176                return Err(RegistrationError::NetworkUnreachable);
177            }
178        }
179    };
180
181    match timeout {
182        Some(timeout) => tokio::time::timeout(timeout, wait)
183            .await
184            .unwrap_or(Err(RegistrationError::Timeout)),
185        None => wait.await,
186    }
187}
188
#[cfg(test)]
mod tests {
    use core::time::Duration;

    use tokio::sync::watch;

    use super::*;

    /// Generous budget for waits that are expected to settle.
    const SETTLE_BUDGET: Duration = Duration::from_secs(1);
    /// Short budget for waits that are expected to run into the timeout.
    const STALL_BUDGET: Duration = Duration::from_millis(30);
    /// How long the background publisher sleeps before sending `Running`.
    const PUBLISH_DELAY: Duration = Duration::from_millis(20);

    /// Interactive-login URL fixture.
    fn login_url() -> url::Url {
        "https://login.example/x".parse().unwrap()
    }

    /// Runs `wait_for_running` against a cell that starts at `initial` and is never updated. The
    /// sender is kept alive for the whole wait.
    async fn wait_on_fixed(
        initial: DeviceState,
        budget: Duration,
    ) -> Result<(), RegistrationError> {
        let (_tx, rx) = watch::channel(initial);
        wait_for_running(rx, Some(budget)).await
    }

    /// Runs `wait_for_running` against a cell that starts at `initial` while another task
    /// publishes `Running` after `PUBLISH_DELAY`.
    async fn wait_with_delayed_running(initial: DeviceState) -> Result<(), RegistrationError> {
        let (tx, rx) = watch::channel(initial);
        tokio::spawn(async move {
            tokio::time::sleep(PUBLISH_DELAY).await;
            tx.send_replace(DeviceState::Running);
        });
        wait_for_running(rx, Some(SETTLE_BUDGET)).await
    }

    #[test]
    fn permanence_classification() {
        // Permanent: re-pairing / new credentials required.
        for permanent in [
            RegistrationError::AuthRejected("bad key".into()),
            RegistrationError::KeyExpired,
        ] {
            assert!(permanent.is_permanent());
        }
        // Not permanent: NeedsLogin recovers once the user authorizes (runtime keeps retrying);
        // network/timeout are transient.
        for recoverable in [
            RegistrationError::NeedsLogin(login_url()),
            RegistrationError::NetworkUnreachable,
            RegistrationError::Timeout,
        ] {
            assert!(!recoverable.is_permanent());
        }
    }

    #[test]
    fn maps_control_error_variants() {
        let convert = |err: ts_control::Error| RegistrationError::from(&err);

        let url: url::Url = "https://login.example/a".parse().unwrap();
        assert_eq!(
            convert(ts_control::Error::MachineNotAuthorized(url.clone())),
            RegistrationError::NeedsLogin(url)
        );
        assert_eq!(
            convert(ts_control::Error::Registration("bad auth key".into())),
            RegistrationError::AuthRejected("bad auth key".into())
        );
        assert_eq!(
            convert(ts_control::Error::NetworkError(
                ts_control::Operation::Registration
            )),
            RegistrationError::NetworkUnreachable
        );

        // A 429 rate-limit is TRANSIENT and must map to a non-permanent state, never the
        // `AuthRejected` catch-all (which would wrongly stop the runtime). This pins the explicit
        // arm: if a refactor drops it and lets `RateLimited` fall into the catch-all, this
        // assertion fails.
        let rate_limited = convert(ts_control::Error::RateLimited(Duration::from_secs(30)));
        assert_eq!(rate_limited, RegistrationError::NetworkUnreachable);
        assert!(
            !rate_limited.is_permanent(),
            "a rate-limit must be a transient (non-permanent) failure"
        );

        // "Awaiting admin approval, no URL" (tsr-dvu) is TRANSIENT and must map to a non-permanent
        // state, never the `AuthRejected` catch-all (which would wrongly mark an approval-gated
        // node as a hard failure). Pins the explicit arm: if a refactor drops it and lets
        // `NeedsMachineAuth` fall into the catch-all, this fails.
        let awaiting_approval = convert(ts_control::Error::NeedsMachineAuth);
        assert_eq!(awaiting_approval, RegistrationError::NetworkUnreachable);
        assert!(
            !awaiting_approval.is_permanent(),
            "awaiting admin approval must be a transient (non-permanent) failure"
        );
    }

    // --- wait_for_running loop ---

    /// A cell that is already `Running` resolves `Ok(())` straight away: the initial
    /// `borrow_and_update` sees it without waiting for a transition.
    #[tokio::test]
    async fn wait_resolves_when_already_running() {
        let outcome = wait_on_fixed(DeviceState::Running, SETTLE_BUDGET).await;
        assert_eq!(outcome, Ok(()));
    }

    /// A `Connecting → Running` transition published from another task is observed (no missed
    /// wakeup) and resolves `Ok(())`.
    #[tokio::test]
    async fn wait_resolves_on_transition_to_running() {
        let outcome = wait_with_delayed_running(DeviceState::Connecting).await;
        assert_eq!(outcome, Ok(()));
    }

    /// Every settled non-running state maps to its typed error.
    #[tokio::test]
    async fn wait_maps_each_settled_failure() {
        let cases = [
            (
                DeviceState::Failed(RegistrationError::AuthRejected("bad".into())),
                RegistrationError::AuthRejected("bad".into()),
            ),
            (DeviceState::Expired, RegistrationError::KeyExpired),
            (
                DeviceState::NeedsLogin(login_url()),
                RegistrationError::NeedsLogin(login_url()),
            ),
        ];
        for (state, expected) in cases {
            assert_eq!(wait_on_fixed(state, SETTLE_BUDGET).await, Err(expected));
        }
    }

    /// A cell that stays at `Connecting` past the timeout yields `Timeout`.
    #[tokio::test]
    async fn wait_times_out_while_connecting() {
        let outcome = wait_on_fixed(DeviceState::Connecting, STALL_BUDGET).await;
        assert_eq!(outcome, Err(RegistrationError::Timeout));
    }

    /// `Reauthenticating` is transient, so a waiter must NOT resolve on it. Like `Connecting`, the
    /// wait times out instead of resolving to a terminal error, because the next good self-node
    /// flips the state back to `Running`. This is the behavioral guard that an in-flight
    /// auto-reauth never surfaces as a permanent failure.
    #[tokio::test]
    async fn wait_does_not_settle_on_reauthenticating() {
        let outcome = wait_on_fixed(DeviceState::Reauthenticating, STALL_BUDGET).await;
        assert_eq!(
            outcome,
            Err(RegistrationError::Timeout),
            "Reauthenticating is transient — a waiter keeps waiting, it does not settle"
        );
    }

    /// The full auto-reauth recovery as a waiter sees it: `Reauthenticating` (in flight) →
    /// `Running` (the next good self-node) resolves `Ok(())`. Proves the transient state is
    /// observed and then recovered, never surfaced as a failure.
    #[tokio::test]
    async fn wait_resolves_on_reauthenticating_then_running() {
        let outcome = wait_with_delayed_running(DeviceState::Reauthenticating).await;
        assert_eq!(outcome, Ok(()));
    }

    /// `NeedsMachineAuth` is transient, so a waiter never settles on it. Like `Connecting`, the
    /// wait times out instead of resolving to a terminal error, because the runtime polls
    /// registration and the next good self-node flips the state back to `Running` once an admin
    /// approves. This is the behavioral guard for tsr-dvu: an approval-gated node awaiting admin
    /// approval never surfaces as a permanent failure (the bug was it dying terminally instead of
    /// polling).
    #[tokio::test]
    async fn wait_does_not_settle_on_needs_machine_auth() {
        let outcome = wait_on_fixed(DeviceState::NeedsMachineAuth, STALL_BUDGET).await;
        assert_eq!(
            outcome,
            Err(RegistrationError::Timeout),
            "NeedsMachineAuth is transient — a waiter keeps waiting, it does not settle"
        );
    }

    /// The full await-approval recovery as a waiter sees it: `NeedsMachineAuth` (awaiting admin
    /// approval) → `Running` (the admin approved; `check_auth` returned `Ok`, the netmap self-node
    /// arrived) resolves `Ok(())`. Proves the transient state is observed and then recovered with
    /// no re-registration — Go's `NeedsMachineAuth → Starting → Running` auto-transition.
    #[tokio::test]
    async fn wait_resolves_on_needs_machine_auth_then_running() {
        let outcome = wait_with_delayed_running(DeviceState::NeedsMachineAuth).await;
        assert_eq!(outcome, Ok(()));
    }

    /// When the sender is dropped while the cell is still `Connecting`, the wait ends with
    /// `NetworkUnreachable` instead of hanging forever.
    #[tokio::test]
    async fn wait_sender_dropped_is_network_unreachable() {
        let (sender, receiver) = watch::channel(DeviceState::Connecting);
        drop(sender);
        let outcome = wait_for_running(receiver, Some(SETTLE_BUDGET)).await;
        assert_eq!(outcome, Err(RegistrationError::NetworkUnreachable));
    }
}