ts_runtime/device_state.rs
1//! Device connection-state tracking: a push-style view of where a [`Runtime`](crate::Runtime) is in
2//! its control-plane lifecycle, plus a typed registration outcome.
3//!
4//! Mirrors the part of Go `tsnet`/`ipn`'s state machine an embedder actually reacts to: is the node
5//! still coming up, running, waiting for interactive login, expired, or did registration hard-fail?
6//! The [`ControlRunner`](crate::control_runner::ControlRunner) publishes transitions into a
7//! `watch` cell so an embedder can `await` them ([`Runtime::watch_state`](crate::Runtime::watch_state))
8//! instead of polling [`status`](crate::Runtime::status), and
9//! [`Runtime::wait_until_running`](crate::Runtime::wait_until_running) is a one-shot convenience
10//! built on the same cell.
11
12/// The control-plane lifecycle state of a device.
13///
14/// Published by the control runner as it brings the node up and maintains the netmap stream. A
15/// consumer watches this to drive UI ("connecting…", "needs login", "expired") and to distinguish a
16/// permanent failure from a transient one without inspecting logs.
17#[derive(Debug, Clone, PartialEq, Eq)]
18pub enum DeviceState {
19 /// The runtime has spawned and is registering / establishing the control session. The initial
20 /// state of every device.
21 Connecting,
22 /// Registered and the netmap stream is live — the node is up.
23 Running,
24 /// Control requires interactive authentication (no usable auth key): the node is waiting for a
25 /// human to authorize it at the carried URL. Transient — registration retries until authorized.
26 NeedsLogin(url::Url),
27 /// The node key has expired (control reported the self-node's key expiry is in the past). The
28 /// node must re-authenticate to continue. Surfaced from the netmap self-node, not registration.
29 Expired,
30 /// Registration hard-failed with a permanent reason (e.g. a bad/expired/unknown auth key). The
31 /// control runner stops; this carries the typed reason. Not retried.
32 Failed(RegistrationError),
33}
34
35/// A typed registration outcome, distinguishing a **permanent** failure (don't retry — tell the
36/// user) from a **transient** one (worth retrying).
37///
38/// This is the error surfaced by [`Runtime::wait_until_running`](crate::Runtime::wait_until_running),
39/// replacing the previous "poll `ipv4_addr` until a deadline and report a generic timeout" workaround
40/// with an actionable reason.
41#[derive(Debug, thiserror::Error, Clone, PartialEq, Eq)]
42pub enum RegistrationError {
43 /// Control rejected registration with a permanent reason — typically a bad, expired, or unknown
44 /// auth key. The string is control's verbatim reason. **Permanent**: re-pairing (a new auth
45 /// key) is required; retrying with the same key will not succeed.
46 #[error("authentication rejected by control: {0}")]
47 AuthRejected(String),
48
49 /// The node key has expired. **Permanent** until re-authentication.
50 #[error("node key expired; re-authentication required")]
51 KeyExpired,
52
53 /// Interactive authorization is required: control offered an auth URL (no usable auth key).
54 /// **Actionable but not permanent** — direct the user to the URL; the runtime keeps retrying
55 /// registration and will reach `Running` once the user authorizes (so this is *not*
56 /// [`is_permanent`](Self::is_permanent)). A caller using an auth key should not hit this; a
57 /// caller doing interactive auth should drive it via
58 /// [`watch_state`](crate::Runtime::watch_state) rather than treating this as a hard failure.
59 #[error("interactive login required at {0}")]
60 NeedsLogin(url::Url),
61
62 /// The control plane was unreachable (network/transport error). **Transient**: retrying later
63 /// may succeed.
64 #[error("control plane unreachable")]
65 NetworkUnreachable,
66
67 /// No settled state was reached before the caller's timeout elapsed. **Indeterminate**:
68 /// registration may still be in flight (e.g. slow control plane); the caller may retry the wait.
69 #[error("timed out waiting for the device to finish registering")]
70 Timeout,
71}
72
73impl RegistrationError {
74 /// Whether this outcome is **permanent** — re-pairing / new credentials are required and
75 /// retrying as-is will not succeed (`AuthRejected`, `KeyExpired`). Everything else is not
76 /// permanent: `NetworkUnreachable`/`Timeout` are transient (retry may succeed), and `NeedsLogin`
77 /// is actionable-but-recoverable (the runtime keeps retrying and reaches `Running` once the user
78 /// authorizes the offered URL — so it is *not* permanent).
79 pub fn is_permanent(&self) -> bool {
80 matches!(
81 self,
82 RegistrationError::AuthRejected(_) | RegistrationError::KeyExpired
83 )
84 }
85}
86
87/// Map a control-layer [`ts_control::Error`] from the registration path into a typed
88/// [`RegistrationError`]. Used by the control runner when its `check_auth` loop hard-fails.
89impl From<&ts_control::Error> for RegistrationError {
90 fn from(e: &ts_control::Error) -> Self {
91 match e {
92 ts_control::Error::MachineNotAuthorized(u) => RegistrationError::NeedsLogin(u.clone()),
93 ts_control::Error::Registration(reason) => {
94 RegistrationError::AuthRejected(reason.clone())
95 }
96 ts_control::Error::NetworkError(_) => RegistrationError::NetworkUnreachable,
97 // A 429 rate-limit is **transient** — control is asking us to wait, not rejecting us —
98 // so it must NOT become a permanent `AuthRejected`. The control runner's `check_auth`
99 // loop already intercepts `RateLimited` and sleeps the server delay before this mapping
100 // is reached; classifying it as `NetworkUnreachable` here keeps any other caller of this
101 // conversion on the correct (non-permanent, retry-may-succeed) branch.
102 ts_control::Error::RateLimited(_) => RegistrationError::NetworkUnreachable,
103 // InvalidUrl / Internal: not a transient network condition and not an auth decision —
104 // treat as a (permanent-ish) auth rejection carrying the display reason so the caller
105 // sees something actionable rather than an opaque "timeout".
106 other => RegistrationError::AuthRejected(other.to_string()),
107 }
108 }
109}
110
111/// Wait on a [`DeviceState`] `watch` channel until it settles, mapping the settled state to the
112/// typed [`wait_until_running`](crate::Runtime::wait_until_running) result.
113///
114/// Factored out of [`Runtime::wait_until_running`](crate::Runtime) so the (non-trivial) loop — the
115/// see-then-await ordering, the per-state mapping, sender-drop handling, and the timeout — is
116/// unit-testable against a plain `watch::channel` without standing up a runtime.
117pub(crate) async fn wait_for_running(
118 mut rx: tokio::sync::watch::Receiver<DeviceState>,
119 timeout: Option<core::time::Duration>,
120) -> Result<(), RegistrationError> {
121 let wait = async {
122 loop {
123 // Evaluate the current value, then await a change. `borrow_and_update` marks the current
124 // value seen so a transition isn't missed between this check and `changed()`.
125 let settled = match &*rx.borrow_and_update() {
126 DeviceState::Running => Some(Ok(())),
127 DeviceState::Failed(e) => Some(Err(e.clone())),
128 DeviceState::Expired => Some(Err(RegistrationError::KeyExpired)),
129 DeviceState::NeedsLogin(u) => Some(Err(RegistrationError::NeedsLogin(u.clone()))),
130 DeviceState::Connecting => None,
131 };
132 if let Some(result) = settled {
133 return result;
134 }
135 // Not settled yet — wait for the next transition. If the sender is dropped (runtime
136 // tearing down), treat it as unreachable rather than hanging forever.
137 if rx.changed().await.is_err() {
138 return Err(RegistrationError::NetworkUnreachable);
139 }
140 }
141 };
142
143 match timeout {
144 Some(timeout) => tokio::time::timeout(timeout, wait)
145 .await
146 .unwrap_or(Err(RegistrationError::Timeout)),
147 None => wait.await,
148 }
149}
150
#[cfg(test)]
mod tests {
    //! Unit tests for the permanence classification, the control-error mapping, and the
    //! `wait_for_running` loop (driven through a bare `watch::channel`).

    use core::time::Duration;

    use tokio::sync::watch;

    use super::*;

    /// `is_permanent` is true only for the outcomes that need new credentials.
    #[test]
    fn permanence_classification() {
        // Permanent: re-pairing / new credentials required.
        assert!(RegistrationError::AuthRejected("bad key".into()).is_permanent());
        assert!(RegistrationError::KeyExpired.is_permanent());
        // Not permanent: NeedsLogin recovers once the user authorizes (runtime keeps retrying);
        // network/timeout are transient.
        assert!(
            !RegistrationError::NeedsLogin("https://login.example/x".parse().unwrap())
                .is_permanent()
        );
        assert!(!RegistrationError::NetworkUnreachable.is_permanent());
        assert!(!RegistrationError::Timeout.is_permanent());
    }

    /// Each explicitly handled `ts_control::Error` variant maps to its typed counterpart.
    #[test]
    fn maps_control_error_variants() {
        let url: url::Url = "https://login.example/a".parse().unwrap();
        assert_eq!(
            RegistrationError::from(&ts_control::Error::MachineNotAuthorized(url.clone())),
            RegistrationError::NeedsLogin(url)
        );
        assert_eq!(
            RegistrationError::from(&ts_control::Error::Registration("bad auth key".into())),
            RegistrationError::AuthRejected("bad auth key".into())
        );
        assert_eq!(
            RegistrationError::from(&ts_control::Error::NetworkError(
                ts_control::Operation::Registration
            )),
            RegistrationError::NetworkUnreachable
        );
        // A 429 rate-limit is TRANSIENT and must map to a non-permanent state, never the
        // `AuthRejected` catch-all (which would wrongly stop the runtime). This pins the explicit
        // arm: if a refactor drops it and lets `RateLimited` fall into the catch-all, this
        // assertion fails.
        let rl = RegistrationError::from(&ts_control::Error::RateLimited(Duration::from_secs(30)));
        assert_eq!(rl, RegistrationError::NetworkUnreachable);
        assert!(
            !rl.is_permanent(),
            "a rate-limit must be a transient (non-permanent) failure"
        );
    }

    // --- wait_for_running loop ---

    /// An already-`Running` cell resolves `Ok(())` immediately (the initial `borrow_and_update`
    /// sees it without waiting for a transition).
    #[tokio::test]
    async fn wait_resolves_when_already_running() {
        let (_tx, rx) = watch::channel(DeviceState::Running);
        assert_eq!(
            wait_for_running(rx, Some(Duration::from_secs(1))).await,
            Ok(())
        );
    }

    /// A transition `Connecting → Running` published from another task is observed (no missed
    /// wakeup) and resolves `Ok(())`.
    #[tokio::test]
    async fn wait_resolves_on_transition_to_running() {
        let (tx, rx) = watch::channel(DeviceState::Connecting);
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(20)).await;
            tx.send_replace(DeviceState::Running);
        });
        assert_eq!(
            wait_for_running(rx, Some(Duration::from_secs(1))).await,
            Ok(())
        );
    }

    /// With no timeout (`None`) the wait still resolves once `Running` is published. The outer
    /// `tokio::time::timeout` is only a guard so a regression fails the test instead of hanging
    /// the suite.
    #[tokio::test]
    async fn wait_without_timeout_resolves_on_transition() {
        let (tx, rx) = watch::channel(DeviceState::Connecting);
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(20)).await;
            tx.send_replace(DeviceState::Running);
        });
        let outcome = tokio::time::timeout(Duration::from_secs(1), wait_for_running(rx, None))
            .await
            .expect("wait_for_running(None) should resolve once Running is published");
        assert_eq!(outcome, Ok(()));
    }

    /// Each settled non-running state maps to its typed error.
    #[tokio::test]
    async fn wait_maps_each_settled_failure() {
        for (state, expected) in [
            (
                DeviceState::Failed(RegistrationError::AuthRejected("bad".into())),
                RegistrationError::AuthRejected("bad".into()),
            ),
            (DeviceState::Expired, RegistrationError::KeyExpired),
            (
                DeviceState::NeedsLogin("https://login.example/x".parse().unwrap()),
                RegistrationError::NeedsLogin("https://login.example/x".parse().unwrap()),
            ),
        ] {
            let (_tx, rx) = watch::channel(state);
            assert_eq!(
                wait_for_running(rx, Some(Duration::from_secs(1))).await,
                Err(expected)
            );
        }
    }

    /// A failure published after the wait has started (`Connecting → Failed`) is surfaced with
    /// its carried reason, exercising the `changed()` wake-up path for a non-`Running` outcome.
    #[tokio::test]
    async fn wait_surfaces_failure_published_after_start() {
        let (tx, rx) = watch::channel(DeviceState::Connecting);
        let reason = RegistrationError::AuthRejected("bad key".into());
        let published = reason.clone();
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(20)).await;
            tx.send_replace(DeviceState::Failed(published));
        });
        assert_eq!(
            wait_for_running(rx, Some(Duration::from_secs(1))).await,
            Err(reason)
        );
    }

    /// A cell stuck at `Connecting` past the timeout yields `Timeout`.
    #[tokio::test]
    async fn wait_times_out_while_connecting() {
        let (_tx, rx) = watch::channel(DeviceState::Connecting);
        assert_eq!(
            wait_for_running(rx, Some(Duration::from_millis(30))).await,
            Err(RegistrationError::Timeout)
        );
    }

    /// If the sender is dropped while still `Connecting`, the wait ends as `NetworkUnreachable`
    /// rather than hanging forever.
    #[tokio::test]
    async fn wait_sender_dropped_is_network_unreachable() {
        let (tx, rx) = watch::channel(DeviceState::Connecting);
        drop(tx);
        assert_eq!(
            wait_for_running(rx, Some(Duration::from_secs(1))).await,
            Err(RegistrationError::NetworkUnreachable)
        );
    }
}
274}