Skip to main content

studio_worker/
auto_register.rs

1//! Auto-register state machine — the only registration path.
2//!
3//! On first launch the worker POSTs `/workers/register-request`
4//! to the studio with a self-generated install id + a registration
5//! secret (only its SHA-256 hash leaves the box), then polls
6//! `/workers/register-requests/<id>` every 30s for the operator's
7//! decision.  On Approved we persist `worker_id` + `auth_token` to
8//! `config.toml` and fall through to the normal heartbeat / claim
9//! loops.  On Rejected we surface the reason; the user clears state
10//! with `studio-worker register --reset` to retry.
11//!
12//! `tick()` does at most one HTTP round-trip per call so the outer
13//! orchestrator can sleep between polls.  All persistence goes
14//! through `config::save` so a crash mid-flight leaves consistent
15//! on-disk state.
16
17use std::path::Path;
18use std::sync::Arc;
19
20use anyhow::Result;
21use chrono::{DateTime, Utc};
22use parking_lot::Mutex;
23use sha2::{Digest, Sha256};
24
25use crate::{
26    config::{self, SharedConfig},
27    engine,
28    http::ApiClient,
29    runtime::build_capabilities,
30    types::{AutoRegisterRequest, RegisterStatus},
31    AGENT_VERSION,
32};
33
/// Tracing target attached to every log event emitted by the
/// registration state machine in this file.  Kept stable so operators
/// can filter the worker's most-asked-about flow ("why is my worker
/// stuck unregistered?") with
/// `RUST_LOG=studio_worker::auto_register=debug`.
const TRACE_TARGET: &str = "studio_worker::auto_register";
39
/// The registration state as seen by callers: it is what `tick()`
/// returns and what the UI Status tab reads.  Distinct from the
/// persisted config fields, which carry the raw building blocks
/// (`install_id`, `registration_request_id`, …) this value is
/// derived from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RegistrationState {
    /// First-launch default; no request in flight, no worker_id.
    /// Also reported after a failed create attempt and after the
    /// studio stops recognising a previously issued request id.
    Pristine,
    /// Studio has a Pending row for us; we're polling for the
    /// operator's decision.
    Pending {
        /// Identifier the studio returned for our register request.
        request_id: String,
        /// Intended to be the first time we saw this request in the
        /// Pending state.
        /// NOTE(review): confirm every writer preserves this across
        /// polls rather than stamping a fresh timestamp.
        since: DateTime<Utc>,
    },
    /// `worker_id` + `auth_token` are in config; ready for the
    /// normal heartbeat / claim loops.
    Approved,
    /// Operator rejected the request; `reason` is the text the
    /// studio supplied.  The worker is expected to stop trying until
    /// `studio-worker register --reset` clears the state.
    /// NOTE(review): the "stop trying" part lives in the caller of
    /// `tick()`, outside this file — confirm there.
    Rejected { reason: String },
}
61
/// Shared, lock-protected in-memory mirror of `RegistrationState` for
/// the UI to read.  The state machine in this file writes to it; the
/// persisted source of truth remains `Config`.
pub type SharedRegistration = Arc<Mutex<RegistrationState>>;
65
66pub fn shared_initial() -> SharedRegistration {
67    Arc::new(Mutex::new(RegistrationState::Pristine))
68}
69
70/// One iteration of the state machine.
71///
72/// Reads the current `Config` snapshot, decides what to do, performs
73/// at most one HTTP call, persists changes via `config::save`,
74/// mirrors the new state into `observers`, and returns it.
75///
76/// Idempotent: re-running with the same on-disk state and a
77/// pending-returning studio is a no-op on disk.
78pub async fn tick(
79    cfg: &SharedConfig,
80    config_path: &Path,
81    observers: &SharedRegistration,
82) -> RegistrationState {
83    // Fast path: already registered.
84    {
85        let snap = cfg.lock();
86        if snap.worker_id.is_some() && snap.auth_token.is_some() {
87            *observers.lock() = RegistrationState::Approved;
88            return RegistrationState::Approved;
89        }
90    }
91
92    // Ensure install_id + secret are present before doing any HTTP.
93    ensure_install_state(cfg, config_path);
94
95    // Branch on whether we already have a request id.
96    let (api_base_url, request_id, secret, install_id) = {
97        let snap = cfg.lock();
98        (
99            snap.api_base_url.clone(),
100            snap.registration_request_id.clone(),
101            snap.registration_secret.clone(),
102            snap.install_id.clone(),
103        )
104    };
105
106    match (request_id, secret) {
107        (Some(rid), Some(sec)) => {
108            poll_existing(cfg, config_path, observers, api_base_url, rid, sec).await
109        }
110        _ => {
111            create_request(
112                cfg,
113                config_path,
114                observers,
115                api_base_url,
116                install_id.expect("ensure_install_state seeds install_id"),
117            )
118            .await
119        }
120    }
121}
122
123fn ensure_install_state(cfg: &SharedConfig, config_path: &Path) {
124    let mut snap = cfg.lock();
125    let mut dirty = false;
126    if snap.install_id.is_none() {
127        snap.install_id = Some(new_uuid());
128        dirty = true;
129    }
130    // Pre-allocate the secret only if we also have no request id.
131    // Otherwise the existing pair is still valid.
132    if snap.registration_request_id.is_none() && snap.registration_secret.is_none() {
133        snap.registration_secret = Some(new_secret_hex());
134        dirty = true;
135    }
136    if dirty {
137        let snapshot = snap.clone();
138        drop(snap);
139        if let Err(e) = config::save(&snapshot, config_path) {
140            tracing::warn!(
141                target: TRACE_TARGET,
142                op = "ensure-install",
143                config_path = %config_path.display(),
144                error = %e,
145                "failed to persist install state"
146            );
147        }
148    }
149}
150
151async fn create_request(
152    cfg: &SharedConfig,
153    config_path: &Path,
154    observers: &SharedRegistration,
155    api_base_url: String,
156    install_id: String,
157) -> RegistrationState {
158    // Bind the cloned value in its own statement so the `cfg.lock()`
159    // guard releases at the `;`.  Holding it across the `match` would
160    // deadlock the non-reentrant mutex the moment the `None` arm below
161    // re-locks to store a freshly generated secret.
162    let existing_secret = cfg.lock().registration_secret.clone();
163    let secret = match existing_secret {
164        Some(s) => s,
165        None => {
166            // Should never happen post-ensure_install_state, but be safe.
167            let s = new_secret_hex();
168            cfg.lock().registration_secret = Some(s.clone());
169            s
170        }
171    };
172    let secret_hash = sha256_hex(&secret);
173
174    // Build the capabilities snapshot the operator will see.
175    let payload = match build_payload(cfg, install_id.clone(), secret_hash) {
176        Ok(p) => p,
177        Err(e) => {
178            tracing::warn!(
179                target: TRACE_TARGET,
180                op = "register-request",
181                error = %e,
182                "engine build failed during register-request"
183            );
184            return RegistrationState::Pristine;
185        }
186    };
187
188    let api_base_url_for_task = api_base_url.clone();
189    let payload_for_task = payload.clone();
190    let result = tokio::task::spawn_blocking(move || -> Result<_> {
191        let api = ApiClient::new(api_base_url_for_task)?;
192        api.register_request(&payload_for_task)
193    })
194    .await;
195
196    let response = match result {
197        Ok(Ok(r)) => r,
198        Ok(Err(e)) => {
199            tracing::warn!(
200                target: TRACE_TARGET,
201                op = "register-request",
202                error = %e,
203                "register-request HTTP failed; will retry next tick"
204            );
205            return RegistrationState::Pristine;
206        }
207        Err(e) => {
208            tracing::warn!(
209                target: TRACE_TARGET,
210                op = "register-request",
211                error = %e,
212                "register-request task panic; will retry next tick"
213            );
214            return RegistrationState::Pristine;
215        }
216    };
217
218    // Persist requestId.
219    let now = Utc::now();
220    {
221        let mut snap = cfg.lock();
222        snap.registration_request_id = Some(response.request_id.clone());
223        let snapshot = snap.clone();
224        drop(snap);
225        if let Err(e) = config::save(&snapshot, config_path) {
226            tracing::warn!(
227                target: TRACE_TARGET,
228                op = "register-request",
229                config_path = %config_path.display(),
230                error = %e,
231                "failed to persist request_id"
232            );
233        }
234    }
235    let state = RegistrationState::Pending {
236        request_id: response.request_id,
237        since: now,
238    };
239    *observers.lock() = state.clone();
240    state
241}
242
243async fn poll_existing(
244    cfg: &SharedConfig,
245    config_path: &Path,
246    observers: &SharedRegistration,
247    api_base_url: String,
248    request_id: String,
249    secret: String,
250) -> RegistrationState {
251    let api_base_url_for_task = api_base_url.clone();
252    let request_id_for_task = request_id.clone();
253    let secret_for_task = secret.clone();
254    let result = tokio::task::spawn_blocking(move || -> Result<_> {
255        let api = ApiClient::new(api_base_url_for_task)?;
256        api.poll_register_status(&request_id_for_task, &secret_for_task)
257    })
258    .await;
259
260    let outcome = match result {
261        Ok(Ok(o)) => o,
262        Ok(Err(e)) => {
263            tracing::warn!(
264                target: TRACE_TARGET,
265                op = "poll",
266                error = %e,
267                "poll failed; will retry next tick"
268            );
269            let state = RegistrationState::Pending {
270                request_id,
271                since: Utc::now(),
272            };
273            *observers.lock() = state.clone();
274            return state;
275        }
276        Err(e) => {
277            tracing::warn!(
278                target: TRACE_TARGET,
279                op = "poll",
280                error = %e,
281                "poll task panic; will retry next tick"
282            );
283            let state = RegistrationState::Pending {
284                request_id,
285                since: Utc::now(),
286            };
287            *observers.lock() = state.clone();
288            return state;
289        }
290    };
291
292    match outcome {
293        None => {
294            // 404: studio doesn't know this request id anymore.  Drop
295            // the stale id + secret so the next tick creates fresh.
296            {
297                let mut snap = cfg.lock();
298                snap.registration_request_id = None;
299                snap.registration_secret = None;
300                let snapshot = snap.clone();
301                drop(snap);
302                if let Err(e) = config::save(&snapshot, config_path) {
303                    tracing::warn!(
304                        target: TRACE_TARGET,
305                        op = "poll",
306                        config_path = %config_path.display(),
307                        error = %e,
308                        "failed to persist cleared request state after stale 404; the stale request id stays on disk until the next successful save"
309                    );
310                }
311            }
312            *observers.lock() = RegistrationState::Pristine;
313            RegistrationState::Pristine
314        }
315        Some(RegisterStatus::Pending) => {
316            let state = RegistrationState::Pending {
317                request_id,
318                since: Utc::now(),
319            };
320            *observers.lock() = state.clone();
321            state
322        }
323        Some(RegisterStatus::Approved {
324            worker_id,
325            auth_token,
326        }) => {
327            {
328                let mut snap = cfg.lock();
329                snap.worker_id = Some(worker_id);
330                snap.auth_token = Some(auth_token);
331                snap.registration_request_id = None;
332                snap.registration_secret = None;
333                let snapshot = snap.clone();
334                drop(snap);
335                if let Err(e) = config::save(&snapshot, config_path) {
336                    tracing::error!(
337                        target: TRACE_TARGET,
338                        op = "poll",
339                        config_path = %config_path.display(),
340                        error = %e,
341                        "failed to persist approved credentials; this session is registered in memory but the worker will re-register from scratch on the next restart"
342                    );
343                }
344            }
345            *observers.lock() = RegistrationState::Approved;
346            RegistrationState::Approved
347        }
348        Some(RegisterStatus::Rejected { reason }) => {
349            {
350                let mut snap = cfg.lock();
351                snap.registration_request_id = None;
352                snap.registration_secret = None;
353                let snapshot = snap.clone();
354                drop(snap);
355                if let Err(e) = config::save(&snapshot, config_path) {
356                    tracing::warn!(
357                        target: TRACE_TARGET,
358                        op = "poll",
359                        config_path = %config_path.display(),
360                        error = %e,
361                        "failed to persist cleared request state after rejection; the stale request id stays on disk until the next successful save"
362                    );
363                }
364            }
365            let state = RegistrationState::Rejected { reason };
366            *observers.lock() = state.clone();
367            state
368        }
369    }
370}
371
372fn build_payload(
373    cfg: &SharedConfig,
374    install_id: String,
375    registration_secret_hash: String,
376) -> Result<AutoRegisterRequest> {
377    let snap = cfg.lock().clone();
378    let engine_handle = engine::build(&snap)?;
379    let capabilities = build_capabilities(&snap, &*engine_handle);
380    Ok(AutoRegisterRequest {
381        install_id,
382        registration_secret_hash,
383        capabilities,
384        user_agent: format!("studio-worker/{AGENT_VERSION}"),
385    })
386}
387
388fn new_uuid() -> String {
389    // UUIDv4-ish without pulling in the `uuid` crate: 16 random bytes
390    // formatted as 8-4-4-4-12.
391    let bytes: [u8; 16] = rand_bytes::<16>();
392    let hex: String = bytes.iter().map(|b| format!("{b:02x}")).collect();
393    format!(
394        "{}-{}-{}-{}-{}",
395        &hex[0..8],
396        &hex[8..12],
397        &hex[12..16],
398        &hex[16..20],
399        &hex[20..32]
400    )
401}
402
403fn new_secret_hex() -> String {
404    // 32 bytes of randomness = 64 hex chars (256 bits of entropy).
405    let bytes: [u8; 32] = rand_bytes::<32>();
406    bytes.iter().map(|b| format!("{b:02x}")).collect()
407}
408
409fn sha256_hex(input: &str) -> String {
410    let mut hasher = Sha256::new();
411    hasher.update(input.as_bytes());
412    let digest = hasher.finalize();
413    digest.iter().map(|b| format!("{b:02x}")).collect()
414}
415
416/// Fill `N` bytes from the OS cryptographically-secure RNG via the
417/// `getrandom` crate (`getrandom(2)` / `/dev/urandom` on Linux,
418/// `getentropy` on macOS, `BCryptGenRandom` on Windows).  Used for
419/// the install id and the registration secret, both of which must be
420/// unpredictable: an attacker who could guess the secret could claim
421/// another box's pending registration.
422///
423/// Panics if the OS entropy source is unavailable.  That only happens
424/// on a fundamentally broken platform, and failing loudly (the panic
425/// is captured by Sentry) is the right call — minting a guessable
426/// secret from a timestamp would be worse than a clean crash.
427///
428/// This used to hand-roll a `/dev/urandom` read on unix and silently
429/// fall back to a SHA-256-mixed timestamp on Windows (and on any I/O
430/// error), which left the Windows secret predictable.  `getrandom` is
431/// a tiny syscall wrapper already in the dep tree, so the whole
432/// per-OS dance collapses into one secure, testable path.
433fn rand_bytes<const N: usize>() -> [u8; N] {
434    let mut buf = [0u8; N];
435    getrandom::fill(&mut buf).expect("OS entropy source (getrandom) unavailable");
436    buf
437}
438
#[cfg(test)]
mod tests {
    use super::*;

    /// The install id must look like a UUID: five dash-separated hex
    /// groups of 8, 4, 4, 4 and 12 characters.
    #[test]
    fn new_uuid_has_expected_shape() {
        let id = new_uuid();
        let group_lens: Vec<usize> = id.split('-').map(str::len).collect();
        assert_eq!(group_lens, [8, 4, 4, 4, 12]);
        assert!(id.chars().all(|c| c == '-' || c.is_ascii_hexdigit()));
    }

    /// Two consecutive ids must differ.
    #[test]
    fn new_uuid_is_unique() {
        let (first, second) = (new_uuid(), new_uuid());
        assert_ne!(first, second);
    }

    /// 32 random bytes render as exactly 64 hex characters.
    #[test]
    fn new_secret_hex_is_64_chars() {
        let secret = new_secret_hex();
        assert_eq!(secret.len(), 64);
        assert!(secret.chars().all(|c| c.is_ascii_hexdigit()));
    }

    /// Same input, same digest; different input, different digest;
    /// the digest is always 64 hex characters.
    #[test]
    fn sha256_hex_is_deterministic() {
        let abc = sha256_hex("abc");
        assert_eq!(abc, sha256_hex("abc"));
        assert_ne!(abc, sha256_hex("abd"));
        assert_eq!(sha256_hex("").len(), 64);
    }

    // ---------------------------------------------------------------
    // Entropy primitive.  `rand_bytes` feeds both the install id and
    // the registration secret on every platform, so these checks also
    // cover the Windows path that used to go through a predictable
    // timestamp fallback and had no tests.
    // ---------------------------------------------------------------

    /// 2 000 draws of 32 bytes must all be distinct.
    #[test]
    fn rand_bytes_are_distinct_across_many_calls() {
        use std::collections::HashSet;
        const DRAWS: usize = 2_000;
        let samples: HashSet<[u8; 32]> = (0..DRAWS).map(|_| rand_bytes::<32>()).collect();
        assert_eq!(
            samples.len(),
            DRAWS,
            "rand_bytes produced a duplicate 32-byte value"
        );
    }

    /// Accumulate a bitwise OR and AND over many samples.  A stuck or
    /// constant source would leave some bit never set (OR stays 0) or
    /// never cleared (AND stays 1); a working OS CSPRNG flips all 256
    /// bit positions within a handful of draws.
    #[test]
    fn rand_bytes_cover_every_bit_position() {
        let mut ever_set = [0u8; 32];
        let mut ever_clear = [0xffu8; 32];
        for _ in 0..256 {
            let sample = rand_bytes::<32>();
            let accumulators = ever_set.iter_mut().zip(ever_clear.iter_mut());
            for ((set, clear), byte) in accumulators.zip(sample.iter()) {
                *set |= *byte;
                *clear &= *byte;
            }
        }
        assert_eq!(ever_set, [0xffu8; 32], "a bit position was never set");
        assert_eq!(ever_clear, [0u8; 32], "a bit position was never cleared");
    }
}
510        }
511        assert_eq!(ever_set, [0xffu8; 32], "a bit position was never set");
512        assert_eq!(ever_clear, [0u8; 32], "a bit position was never cleared");
513    }
514}