Skip to main content

dscode_extension_host/
manager.rs

1use std::io::{BufRead, BufReader};
2use std::process::{Child, Command, Stdio};
3use std::thread;
4use std::time::{Duration, Instant};
5
6use tracing::{debug, error, info, instrument, warn};
7
8use crate::sandbox::{apply_sandbox, complete_sandbox_setup, SandboxConfig};
9
/// Lifecycle state of the Node.js extension host child process.
///
/// # State diagram
///
/// ```text
///   Stopped ──start()──► Starting ──spawned──► Running ──stop()──► Stopping ──► Stopped
///                           │                     │
///                    (start failed)       (unexpected exit)
///                           ▼                     ▼
///                        Crashed ◄─────────── Unhealthy
///                           │  (budget exhausted) │ (within budget)
///                  (explicit start())             ▼
///                           ▼                Restarting
///                        Starting ◄───────────────┘
///                                 (backoff elapsed)
/// ```
///
/// # Transitions
///
/// | From         | To           | Trigger                                                     |
/// |--------------|--------------|-------------------------------------------------------------|
/// | `Stopped`    | `Starting`   | `start()` called                                            |
/// | `Starting`   | `Running`    | process spawned, stdout/stderr reader threads launched      |
/// | `Starting`   | `Crashed`    | a step of `start()` failed (e.g. `Command::spawn()`)        |
/// | `Running`    | `Unhealthy`  | process exited unexpectedly, detected via `try_wait`        |
/// | `Running`    | `Stopping`   | `stop()` called                                             |
/// | `Unhealthy`  | `Restarting` | `restart_if_needed()`, within `MAX_RESTART_ATTEMPTS`        |
/// | `Unhealthy`  | `Crashed`    | `restart_if_needed()`, attempts exhausted                   |
/// | `Restarting` | `Starting`   | backoff elapsed, re-entering the start flow                 |
/// | `Stopping`   | `Stopped`    | `stop()` finished dealing with the child process            |
/// | `Crashed`    | `Starting`   | explicit `start()` after a crash (intended to begin with a  |
/// |              |              | fresh restart budget)                                       |
///
/// `stop()` is deliberately permissive: it forces `Stopping` from any state
/// other than `Stopped`, bypassing the table above.
///
/// # Concurrency invariant
///
/// All access to `ExtensionHostManager` is expected to be serialized by the
/// `Arc<tokio::sync::Mutex<ExtensionHostManager>>` held in `SessionManager`.
/// No method on the manager should be called without holding that mutex.
/// State transitions are therefore atomic (no TOCTOU) as long as callers keep
/// the mutex held across check-then-act sequences.
///
/// # What each state means, and how to recover from interruptions
///
/// * **`Stopped`**: safe resting state. No child process, no resources.
///   Call `start()` to launch a new host.
/// * **`Starting`**: the child process is being spawned or was just spawned.
///   - If Tauri crashes: the child process is orphaned and the OS reaps it.
///   - If a start step fails: the state moves to `Crashed` immediately.
///   - If the child exits right after spawning: the next
///     `check_and_update_state()` call made while `Running` moves to
///     `Unhealthy`.
///   - The stdout/stderr reader threads may already be running; they end on
///     their own once the child's pipes close.
/// * **`Running`**: normal operation, process alive and healthy.
///   - An unexpected exit is only detected on the next
///     `check_and_update_state()` call, which moves to `Unhealthy`.
///   - All pending IPC requests are orphaned (oneshot senders dropped,
///     receivers get a receive error or time out).
///   - The frontend is NOT notified; its state is stale until the next command.
///   - Backend `SessionState` still shows the extensions as active.
///   - TODO: add proactive health monitoring (periodic check).
/// * **`Unhealthy`**: the process has exited; nothing is running.
///   `restart_if_needed()` should be called to attempt recovery.
///   - Within budget (3 attempts): `Restarting`, with exponential backoff
///     (1s, 2s, 4s).
///   - Budget exhausted: `Crashed` (terminal).
/// * **`Restarting`**: waiting for the backoff timer before re-spawning.
///   - If Tauri crashes during the wait: no cleanup needed, because no child
///     process exists yet.
///   - After the backoff: `Starting`, then `Running` (or `Crashed`).
/// * **`Stopping`**: shutdown in progress. `stop()` kills the child with
///   `Child::kill` and waits for it to exit, then moves to `Stopped`.
///   - If Tauri crashes during stop: the child may linger (no SIGCHLD
///     handler); the OS eventually reaps it.
/// * **`Crashed`**: terminal failure state; restart attempts are exhausted or
///   a start attempt failed.
///   - All extensions in this host are effectively dead.
///   - Backend state still shows the extensions as active (stale), and the
///     frontend shows stale extension state.
///   - Recovery: an explicit `start()` call re-enters `Starting`;
///     alternatively, restart the application.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ExtensionHostState {
    /// No child process exists; resting state.
    Stopped,
    /// A start attempt is in progress.
    Starting,
    /// The child process was spawned and is believed to be alive.
    Running,
    /// The child process was found to have exited unexpectedly.
    Unhealthy,
    /// Waiting out the backoff delay before the next start attempt.
    Restarting,
    /// `stop()` is tearing the child process down.
    Stopping,
    /// Start failed or restart attempts were exhausted; needs an explicit `start()`.
    Crashed,
}
105
/// Number of automatic restarts `restart_if_needed()` will attempt before it
/// gives up and marks the host as `Crashed`.
const MAX_RESTART_ATTEMPTS: u32 = 3;

/// Delay before the first automatic restart, in milliseconds. The delay is
/// doubled for every restart already attempted.
const RESTART_BACKOFF_BASE_MS: u64 = 1_000;
108
109pub struct ExtensionHostManager {
110    child: Option<Child>,
111    #[allow(dead_code)]
112    pub(crate) extension_id: String,
113    restart_count: u32,
114    last_restart_time: Option<Instant>,
115    started_at: Option<Instant>,
116    state: ExtensionHostState,
117}
118
119impl ExtensionHostManager {
120    pub fn new(extension_id: String) -> Self {
121        Self {
122            child: None,
123            extension_id,
124            restart_count: 0,
125            last_restart_time: None,
126            started_at: None,
127            state: ExtensionHostState::Stopped,
128        }
129    }
130
131    /// Validates and performs a state transition.
132    /// Returns Ok(()) on valid transition, Err with details on invalid.
133    /// Invalid transitions are logged but do NOT panic — graceful degradation.
134    fn transition(&mut self, to: ExtensionHostState) -> Result<(), String> {
135        let valid = match self.state {
136            ExtensionHostState::Stopped => matches!(to, ExtensionHostState::Starting),
137            ExtensionHostState::Starting => {
138                matches!(to, ExtensionHostState::Running | ExtensionHostState::Crashed)
139            }
140            ExtensionHostState::Running => {
141                matches!(to, ExtensionHostState::Unhealthy | ExtensionHostState::Stopping)
142            }
143            ExtensionHostState::Unhealthy => {
144                matches!(to, ExtensionHostState::Restarting | ExtensionHostState::Crashed)
145            }
146            ExtensionHostState::Restarting => matches!(to, ExtensionHostState::Starting),
147            ExtensionHostState::Stopping => matches!(to, ExtensionHostState::Stopped),
148            ExtensionHostState::Crashed => matches!(to, ExtensionHostState::Starting),
149        };
150
151        if valid {
152            info!(state_from = ?self.state, state_to = ?to, "State transition");
153            self.state = to;
154            Ok(())
155        } else {
156            let msg = format!(
157                "Invalid state transition: {:?} -> {:?}",
158                self.state, to
159            );
160            error!(state_from = ?self.state, state_to = ?to, "Invalid state transition attempted");
161            Err(msg)
162        }
163    }
164
165    /// Returns the current state of the extension host process.
166    /// Callers should use this instead of the removed is_running()/is_healthy() methods.
167    pub fn state(&self) -> ExtensionHostState {
168        self.state
169    }
170
171    /// Checks if the child process is still alive and updates state accordingly.
172    /// Call this before making decisions based on host state.
173    /// This is the ONLY place where Running -> Unhealthy transition happens passively.
174    pub fn check_and_update_state(&mut self) {
175        if self.state != ExtensionHostState::Running {
176            return;
177        }
178        if let Some(child) = &mut self.child {
179            match child.try_wait() {
180                Ok(Some(status)) => {
181                    info!(status = %status, "Process exited");
182                    let _ = self.transition(ExtensionHostState::Unhealthy);
183                }
184                Ok(None) => {} // Still running
185                Err(e) => {
186                    error!(error = %e, "Failed to check process status");
187                    let _ = self.transition(ExtensionHostState::Unhealthy);
188                }
189            }
190        }
191    }
192
193    #[instrument(skip(self))]
194    pub fn start(
195        &mut self, extension_host_path: &str, outgoing_socket: &str, incoming_socket: &str,
196    ) -> Result<(), String> {
197        self.transition(ExtensionHostState::Starting)?;
198
199        info!(path = extension_host_path, "Starting extension host");
200
201        let node_binary = match self.resolve_node_binary() {
202            Ok(bin) => bin,
203            Err(e) => {
204                let _ = self.transition(ExtensionHostState::Crashed);
205                return Err(e);
206            }
207        };
208
209        // Verify the Node.js binary hasn't been tampered with
210        if let Err(e) = crate::binary_verifier::verify_binary(std::path::Path::new(&node_binary)) {
211            let _ = self.transition(ExtensionHostState::Crashed);
212            return Err(e);
213        }
214
215        #[cfg(unix)]
216        {
217            use std::os::unix::fs::PermissionsExt;
218            let _ = std::fs::set_permissions(&node_binary, std::fs::Permissions::from_mode(0o755));
219        }
220
221        let mut cmd = Command::new(&node_binary);
222        cmd.arg(extension_host_path);
223
224        cmd.env("DSCODE_IPC_URL", outgoing_socket);
225        info!(url = outgoing_socket, "Set DSCODE_IPC_URL");
226
227        cmd.env("DSCODE_INCOMING_IPC_URL", incoming_socket);
228        info!(url = incoming_socket, "Set DSCODE_INCOMING_IPC_URL");
229
230        if let Ok(resource_dir) = std::env::var("DSCODE_RESOURCE_PATH") {
231            cmd.env("DSCODE_RESOURCE_PATH", &resource_dir);
232        } else if let Ok(exe) = std::env::current_exe() {
233            if let Some(exe_dir) = exe.parent() {
234                let resource_path = exe_dir.join("resources");
235                if resource_path.exists() {
236                    cmd.env("DSCODE_RESOURCE_PATH", resource_path.to_string_lossy().to_string());
237                }
238            }
239        }
240
241        let sandbox_config = SandboxConfig::default();
242        if let Err(e) = apply_sandbox(&mut cmd, &sandbox_config) {
243            let _ = self.transition(ExtensionHostState::Crashed);
244            return Err(e);
245        }
246
247        cmd.stdout(Stdio::piped()).stderr(Stdio::piped());
248
249        let mut child = match cmd.spawn() {
250            Ok(child) => child,
251            Err(e) => {
252                let _ = self.transition(ExtensionHostState::Crashed);
253                return Err(format!("Failed to spawn extension host: {}", e));
254            }
255        };
256
257        // Complete sandbox setup — on Windows this assigns the child process
258        // to the Job Object created by apply_sandbox. On other platforms this
259        // is a no-op.
260        if let Err(e) = complete_sandbox_setup(&child) {
261            warn!("Failed to complete sandbox setup: {}", e);
262        }
263
264        let stdout = child.stdout.take().ok_or("Failed to capture stdout")?;
265        let stderr = child.stderr.take().ok_or("Failed to capture stderr")?;
266
267        thread::spawn(move || {
268            let reader = BufReader::new(stdout);
269            for line in reader.lines().map_while(Result::ok) {
270                info!(source = "stdout", "{}", line);
271            }
272        });
273
274        thread::spawn(move || {
275            let reader = BufReader::new(stderr);
276            for line in reader.lines().map_while(Result::ok) {
277                warn!(source = "stderr", "{}", line);
278            }
279        });
280
281        self.started_at = Some(Instant::now());
282        self.child = Some(child);
283        let _ = self.transition(ExtensionHostState::Running);
284        info!(
285            pid = self.child.as_ref().map(|c| c.id()),
286            "Process started successfully"
287        );
288
289        Ok(())
290    }
291
292    fn resolve_node_binary(&self) -> Result<String, String> {
293        #[cfg(target_os = "windows")]
294        let node_binary_name = "node.exe";
295        #[cfg(not(target_os = "windows"))]
296        let node_binary_name = "node";
297
298        let target_triple = std::env::consts::ARCH.to_string()
299            + "-"
300            + match std::env::consts::OS {
301                "macos" => "apple-darwin",
302                "linux" => "unknown-linux-gnu",
303                "windows" => "pc-windows-msvc",
304                other => other,
305            };
306
307        #[cfg(target_os = "windows")]
308        let ext_bin_name = format!("node-{}.exe", target_triple);
309        #[cfg(not(target_os = "windows"))]
310        let ext_bin_name = format!("node-{}", target_triple);
311
312        if let Ok(exe) = std::env::current_exe() {
313            if let Some(exe_dir) = exe.parent() {
314                let bundled = exe_dir.join(&ext_bin_name);
315                if bundled.exists() {
316                    debug!(path = ?bundled, "Using bundled Node.js (externalBin)");
317                    return Ok(bundled.to_string_lossy().to_string());
318                }
319
320                let bundled = exe_dir.join(node_binary_name);
321                if bundled.exists() {
322                    debug!(path = ?bundled, "Using bundled Node.js");
323                    return Ok(bundled.to_string_lossy().to_string());
324                }
325            }
326        }
327
328        if let Ok(resource_dir) = std::env::var("DSCODE_RESOURCE_PATH") {
329            let bundled = std::path::Path::new(&resource_dir).join(node_binary_name);
330            if bundled.exists() {
331                debug!(path = ?bundled, "Using bundled Node.js from resource path");
332                return Ok(bundled.to_string_lossy().to_string());
333            }
334        }
335
336        match which::which("node") {
337            Ok(path) => {
338                debug!(path = ?path, "Using system Node.js");
339                Ok(path.to_string_lossy().to_string())
340            }
341            Err(_) => {
342                Err("Node.js not found. Install Node.js or bundle it with the application."
343                    .to_string())
344            }
345        }
346    }
347
348    #[instrument(skip(self))]
349    pub async fn restart_if_needed(
350        &mut self, extension_host_path: &str, outgoing_socket: &str, incoming_socket: &str,
351    ) -> Result<(), String> {
352        // If we think we're running, check whether the process actually exited.
353        if self.state == ExtensionHostState::Running {
354            self.check_and_update_state();
355        }
356
357        if self.state != ExtensionHostState::Unhealthy {
358            return Err(format!(
359                "Cannot restart from state {:?} — must be Unhealthy",
360                self.state
361            ));
362        }
363
364        if self.restart_count >= MAX_RESTART_ATTEMPTS {
365            error!(max_attempts = MAX_RESTART_ATTEMPTS, "Max restart attempts exceeded");
366            let _ = self.transition(ExtensionHostState::Crashed);
367            return Err(format!(
368                "Extension host crashed and max restart attempts ({}) exceeded",
369                MAX_RESTART_ATTEMPTS
370            ));
371        }
372
373        let _ = self.transition(ExtensionHostState::Restarting);
374
375        let backoff_ms = RESTART_BACKOFF_BASE_MS * 2u64.pow(self.restart_count);
376        info!(
377            attempt = self.restart_count + 1,
378            max_attempts = MAX_RESTART_ATTEMPTS,
379            backoff_ms = backoff_ms,
380            "Restarting after crash"
381        );
382
383        tokio::time::sleep(Duration::from_millis(backoff_ms)).await;
384
385        self.restart_count += 1;
386        self.last_restart_time = Some(Instant::now());
387
388        // Transition Restarting -> Starting happens inside start()
389        self.start(extension_host_path, outgoing_socket, incoming_socket)?;
390
391        info!(attempt = self.restart_count, "Restart successful");
392        Ok(())
393    }
394
395    #[allow(dead_code)]
396    pub(crate) fn reset_restart_count(&mut self) {
397        if self.restart_count > 0 && self.state == ExtensionHostState::Running {
398            if let Some(started) = self.started_at {
399                if started.elapsed() > Duration::from_millis(5000) {
400                    self.restart_count = 0;
401                }
402            }
403        }
404    }
405
406    #[instrument(skip(self))]
407    pub fn stop(&mut self) -> Result<(), String> {
408        // Allow stop() from any state by force-setting Stopping.
409        // This is intentionally permissive — stop() is a cleanup operation.
410        if self.state != ExtensionHostState::Stopped {
411            // Force state to Stopping regardless of current state.
412            info!(state_from = ?self.state, "Forcing state to Stopping");
413            self.state = ExtensionHostState::Stopping;
414        }
415
416        if let Some(mut child) = self.child.take() {
417            info!(pid = child.id(), "Stopping process");
418
419            match child.try_wait() {
420                Ok(None) => {
421                    child.kill().map_err(|e| format!("Failed to kill extension host: {}", e))?;
422
423                    match child.wait() {
424                        Ok(status) => {
425                            info!(status = %status, "Process exited")
426                        }
427                        Err(e) => error!(error = %e, "Error waiting for process"),
428                    }
429                }
430                Ok(Some(status)) => {
431                    info!(status = %status, "Process already exited");
432                }
433                Err(e) => {
434                    warn!(error = %e, "Error checking process status");
435                }
436            }
437
438            info!("Process stopped");
439        }
440
441        if self.state == ExtensionHostState::Stopping {
442            let _ = self.transition(ExtensionHostState::Stopped);
443        }
444
445        Ok(())
446    }
447
448    pub fn shutdown(&mut self) {
449        let _ = self.stop();
450    }
451}
452
453impl Drop for ExtensionHostManager {
454    fn drop(&mut self) {
455        let _ = self.stop();
456    }
457}