Skip to main content

stakpak_server/
sandbox.rs

1//! Sandboxed MCP server management.
2//!
3//! When sandbox mode is enabled for a session, a `stakpak mcp start` server
4//! is spawned inside a warden container. The host-side proxy connects to it
5//! via HTTPS/mTLS, and tool calls from the agent loop are routed through the
6//! containerized server — executing `run_command`, file I/O, etc. inside the
7//! sandbox.
8//!
9//! ## Sandbox Modes
10//!
11//! - **Persistent** (default): A single sandbox container is spawned at process startup and
12//!   reused across all sessions. Near-zero per-session overhead, slightly less
13//!   isolation (sessions share the same container filesystem).
14//!
15//! - **Ephemeral**: A new sandbox container is spawned for each session
16//!   and destroyed when the session ends. Maximum isolation, ~5-10s startup overhead.
17//!
18//! ## mTLS key exchange
19//!
20//! Each side generates its own identity independently:
21//!
22//! 1. Host generates a client identity (CA + leaf cert + key, all in memory)
23//! 2. Host passes the client **CA cert** (public only) to the container via env var
24//! 3. Container generates a server identity (CA + leaf cert + key, all in memory)
25//! 4. Container outputs the server **CA cert** (public only) to stdout
26//! 5. Host parses the server CA cert and builds a client TLS config
27//!
28//! Private keys never leave their respective processes.
29
30use serde::{Deserialize, Serialize};
31use stakpak_mcp_client::McpClient;
32use stakpak_mcp_proxy::client::{ClientPoolConfig, ServerConfig};
33use stakpak_mcp_proxy::server::start_proxy_server;
34use stakpak_shared::cert_utils::{CertificateChain, MtlsIdentity};
35use std::collections::HashMap;
36use std::path::Path;
37use std::process::ExitStatus;
38use std::sync::Arc;
39use tokio::io::{AsyncBufReadExt, AsyncReadExt};
40use tokio::net::TcpListener;
41use tokio::process::Child;
42use tokio::sync::{broadcast, watch};
43
/// Environment variable used to pass the client CA cert PEM to the container.
///
/// Only the *public* CA certificate travels through this env var; the client's
/// private key never leaves the host process (see module docs).
const TRUSTED_CLIENT_CA_ENV: &str = "STAKPAK_MCP_CLIENT_CA";

/// Path to the entrypoint script inside the agent container image.
///
/// The entrypoint handles UID/GID remapping when the container is started as
/// root (via `--user 0:0`) with `STAKPAK_TARGET_UID`/`STAKPAK_TARGET_GID` env
/// vars.  It patches `/etc/passwd`, chowns `$HOME`, and drops privileges via
/// gosu before exec-ing the real command.
///
/// This path is part of the image contract — it must match the `COPY` in the
/// Dockerfile.  If the image is stale and missing this script, the container
/// will fail to start with a clear "No such file" error.
const CONTAINER_ENTRYPOINT: &str = "/home/agent/.local/bin/entrypoint.sh";
58
59// ── Sandbox mode ────────────────────────────────────────────────────────────
60
/// Controls how sandbox containers are managed across sessions.
///
/// Serialized as lowercase strings (`"ephemeral"` / `"persistent"`); the
/// `Display` impl below prints the same strings and must stay in sync with
/// the serde representation.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum SandboxMode {
    /// Spawn a new sandbox container for each session and destroy it when the
    /// session ends. Maximum isolation, ~5-10s startup overhead per session.
    Ephemeral,
    /// Spawn a single sandbox container at process startup and reuse it for all
    /// sessions. Near-zero per-session overhead, shared container filesystem.
    #[default]
    Persistent,
}
73
74impl std::fmt::Display for SandboxMode {
75    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
76        match self {
77            SandboxMode::Ephemeral => write!(f, "ephemeral"),
78            SandboxMode::Persistent => write!(f, "persistent"),
79        }
80    }
81}
82
83// ── Sandbox config ──────────────────────────────────────────────────────────
84
/// User identity mapping applied to the sandbox container.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub enum SandboxUserMapping {
    /// Run as whatever user the container image declares (no remapping).
    #[default]
    ImageDefault,
    /// Run as the given host UID/GID. The container is started as root and the
    /// entrypoint script (see `CONTAINER_ENTRYPOINT`) drops privileges to this
    /// identity after patching `/etc/passwd`.
    HostUser {
        // Target user id inside the container.
        uid: u32,
        // Target group id inside the container.
        gid: u32,
    },
}
94
/// Configuration for spawning sandboxed MCP servers.
#[derive(Clone, Debug)]
pub struct SandboxConfig {
    /// Path to the warden binary on the host.
    pub warden_path: String,
    /// Container image for the sandbox (e.g., `ghcr.io/stakpak/agent:v1.2.3`).
    /// Must ship the entrypoint script at `CONTAINER_ENTRYPOINT`.
    pub image: String,
    /// Volume mounts for the container (e.g., `["./:/agent:ro"]`).
    /// Mounts whose host path does not exist (and are not named volumes) are
    /// silently skipped when building the warden argv.
    pub volumes: Vec<String>,
    /// How sandbox containers are managed across sessions.
    pub mode: SandboxMode,
    /// User identity mapping for the container runtime.
    pub user_mapping: SandboxUserMapping,
}
109
110// ── Sandbox health ──────────────────────────────────────────────────────────
111
/// Health status of a persistent sandbox, updated by the background monitor.
///
/// Published via a `watch` channel; `Default` is optimistic (`healthy: true`)
/// so consumers do not observe a failure before the first probe has run.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SandboxHealth {
    /// Whether the last health check succeeded.
    pub healthy: bool,
    /// Number of consecutive successful health checks.
    pub consecutive_ok: u64,
    /// Number of consecutive failed health checks.
    pub consecutive_failures: u64,
    /// ISO 8601 timestamp of the last successful health check.
    pub last_ok: Option<String>,
    /// Error message from the last failed health check, if any.
    pub last_error: Option<String>,
    /// Total number of respawn attempts since the sandbox was started.
    /// This counter is never reset, even after a successful respawn.
    pub total_respawn_attempts: u64,
}
128
129impl Default for SandboxHealth {
130    fn default() -> Self {
131        Self {
132            healthy: true,
133            consecutive_ok: 0,
134            consecutive_failures: 0,
135            last_ok: None,
136            last_error: None,
137            total_respawn_attempts: 0,
138        }
139    }
140}
141
/// Interval between health checks for persistent sandboxes.
const HEALTH_CHECK_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);

/// Number of consecutive failures before attempting a respawn.
const RESPAWN_THRESHOLD: u64 = 3;

/// Maximum number of total respawn attempts before giving up and shutting down.
///
/// This is a lifetime budget — `total_respawn_attempts` is never reset in the
/// monitor loop, even after a successful respawn.
const MAX_RESPAWN_ATTEMPTS: u64 = 5;
150
151// ── Persistent sandbox ──────────────────────────────────────────────────────
152
/// A long-lived sandbox that persists across sessions.
///
/// Spawned once at process startup (when `SandboxMode::Persistent` is configured)
/// and shared by all sessions. Includes a background health monitor that
/// periodically pings the sandbox and attempts to respawn it on failure.
pub struct PersistentSandbox {
    /// The running sandbox, shared with the health monitor task (which may
    /// replace it in-place on respawn).
    inner: Arc<tokio::sync::RwLock<SandboxedMcpServer>>,
    /// Spawn-time configuration; exposed via `mode()`.
    config: SandboxConfig,
    /// Receiver side of the health snapshot published by the monitor.
    health_rx: watch::Receiver<SandboxHealth>,
    /// Handle to the background health monitor task.
    monitor_handle: tokio::task::JoinHandle<()>,
}
165
166impl PersistentSandbox {
167    /// Spawn a persistent sandbox with a background health monitor.
168    pub async fn spawn(config: &SandboxConfig) -> Result<Self, String> {
169        tracing::info!(image = %config.image, "Spawning persistent sandbox container");
170        let inner = SandboxedMcpServer::spawn(config).await?;
171        tracing::info!("Persistent sandbox ready");
172
173        let initial_health = SandboxHealth {
174            healthy: true,
175            consecutive_ok: 1,
176            consecutive_failures: 0,
177            last_ok: Some(chrono::Utc::now().to_rfc3339()),
178            last_error: None,
179            total_respawn_attempts: 0,
180        };
181        let (health_tx, health_rx) = watch::channel(initial_health);
182
183        let inner = Arc::new(tokio::sync::RwLock::new(inner));
184        let monitor_inner = inner.clone();
185        let monitor_config = config.clone();
186
187        let monitor_handle = tokio::spawn(async move {
188            health_monitor_loop(monitor_inner, monitor_config, health_tx).await;
189        });
190
191        Ok(Self {
192            inner,
193            config: config.clone(),
194            health_rx,
195            monitor_handle,
196        })
197    }
198
199    /// Get the MCP client for routing tool calls through this sandbox.
200    pub async fn client(&self) -> Arc<McpClient> {
201        self.inner.read().await.client.clone()
202    }
203
204    /// Get the tools available from this sandbox.
205    pub async fn tools(&self) -> Vec<stakai::Tool> {
206        self.inner.read().await.tools.clone()
207    }
208
209    /// Get the current health status (non-blocking snapshot).
210    pub fn health(&self) -> SandboxHealth {
211        self.health_rx.borrow().clone()
212    }
213
214    /// Get the sandbox mode from the config.
215    pub fn mode(&self) -> &SandboxMode {
216        &self.config.mode
217    }
218
219    /// Shut down the persistent sandbox and its health monitor.
220    pub async fn shutdown(self) {
221        tracing::info!("Shutting down persistent sandbox");
222        self.monitor_handle.abort();
223        // Try to take exclusive ownership of the inner sandbox for clean shutdown.
224        // If other sessions still hold references, the RwLock + Arc will be dropped
225        // when all references are gone; the container process will be cleaned up by
226        // the OS when the host process exits.
227        if let Ok(inner) = Arc::try_unwrap(self.inner) {
228            let sandbox = inner.into_inner();
229            sandbox.shutdown().await;
230        } else {
231            tracing::warn!(
232                "Other references to persistent sandbox still exist; container will be cleaned up on process exit"
233            );
234        }
235    }
236
237    /// Force-kill the sandbox container and abort the health monitor.
238    ///
239    /// Unlike `shutdown(self)`, this works through a shared reference so it can
240    /// be called from the graceful-shutdown handler where only `Arc<Self>` is
241    /// available. The container process is killed via the write lock.
242    pub async fn kill(&self) {
243        tracing::warn!(
244            "Killing persistent sandbox container — in-flight sessions using this sandbox will fail"
245        );
246        self.monitor_handle.abort();
247        self.inner.write().await.teardown().await;
248        tracing::info!("Persistent sandbox container killed");
249    }
250}
251
/// Background loop that periodically health-checks the persistent sandbox
/// and attempts to respawn it after consecutive failures.
///
/// Publishes a `SandboxHealth` snapshot on `health_tx` at the end of each
/// iteration and returns only when the respawn budget
/// (`MAX_RESPAWN_ATTEMPTS`) is exhausted.
async fn health_monitor_loop(
    inner: Arc<tokio::sync::RwLock<SandboxedMcpServer>>,
    config: SandboxConfig,
    health_tx: watch::Sender<SandboxHealth>,
) {
    // Local accumulator; starts from `Default` (healthy, zero counters) rather
    // than from the initial snapshot the spawner already put on the channel.
    let mut health = SandboxHealth::default();

    loop {
        tokio::time::sleep(HEALTH_CHECK_INTERVAL).await;

        let check_result = {
            let sandbox = inner.read().await;
            // Use list_tools as a health probe — it exercises the full
            // mTLS → proxy → container → MCP server path.
            tokio::time::timeout(
                std::time::Duration::from_secs(10),
                stakpak_mcp_client::get_tools(&sandbox.client),
            )
            .await
        }; // read lock released here so the respawn path can take the write lock

        match check_result {
            // Probe round-tripped successfully.
            Ok(Ok(_tools)) => {
                health.healthy = true;
                health.consecutive_ok = health.consecutive_ok.saturating_add(1);
                health.consecutive_failures = 0;
                health.last_ok = Some(chrono::Utc::now().to_rfc3339());
                health.last_error = None;
                tracing::debug!(
                    consecutive_ok = health.consecutive_ok,
                    "Persistent sandbox health check passed"
                );
            }
            // Probe completed in time but the MCP call itself failed.
            Ok(Err(e)) => {
                let err_msg = format!("MCP error: {e}");
                health.healthy = false;
                health.consecutive_ok = 0;
                health.consecutive_failures = health.consecutive_failures.saturating_add(1);
                health.last_error = Some(err_msg.clone());
                tracing::warn!(
                    consecutive_failures = health.consecutive_failures,
                    error = %err_msg,
                    "Persistent sandbox health check failed"
                );
            }
            // Probe did not complete within the 10s budget.
            Err(_timeout) => {
                health.healthy = false;
                health.consecutive_ok = 0;
                health.consecutive_failures = health.consecutive_failures.saturating_add(1);
                health.last_error = Some("Health check timed out (10s)".to_string());
                tracing::warn!(
                    consecutive_failures = health.consecutive_failures,
                    "Persistent sandbox health check timed out"
                );
            }
        }

        // Attempt respawn after RESPAWN_THRESHOLD consecutive failures
        if health.consecutive_failures >= RESPAWN_THRESHOLD {
            // Lifetime budget: total_respawn_attempts is intentionally never
            // reset, even when a respawn succeeds.
            health.total_respawn_attempts = health.total_respawn_attempts.saturating_add(1);

            if health.total_respawn_attempts > MAX_RESPAWN_ATTEMPTS {
                tracing::error!(
                    total_attempts = health.total_respawn_attempts,
                    "Persistent sandbox exceeded maximum respawn attempts ({}) — giving up. \
                     The server cannot operate without a healthy sandbox. Shutting down.",
                    MAX_RESPAWN_ATTEMPTS
                );
                health.last_error = Some(format!(
                    "Exceeded max respawn attempts ({}); sandbox permanently failed",
                    MAX_RESPAWN_ATTEMPTS
                ));
                let _ = health_tx.send(health);
                // Exit the monitor loop — the sandbox is unrecoverable.
                // The server health endpoint will report unhealthy, and operators
                // should investigate and restart the autopilot.
                return;
            }

            tracing::error!(
                failures = health.consecutive_failures,
                attempt = health.total_respawn_attempts,
                max_attempts = MAX_RESPAWN_ATTEMPTS,
                "Persistent sandbox unhealthy — attempting respawn"
            );

            // Take write lock to replace the sandbox
            let mut sandbox = inner.write().await;

            // Shut down the old one (best-effort)
            sandbox.teardown().await;

            match SandboxedMcpServer::spawn(&config).await {
                Ok(new_sandbox) => {
                    *sandbox = new_sandbox;
                    health.healthy = true;
                    health.consecutive_ok = 1;
                    health.consecutive_failures = 0;
                    health.last_ok = Some(chrono::Utc::now().to_rfc3339());
                    health.last_error = None;
                    tracing::info!("Persistent sandbox respawned successfully");
                }
                Err(e) => {
                    health.last_error = Some(format!("Respawn failed: {e}"));
                    tracing::error!(error = %e, "Failed to respawn persistent sandbox");
                    // Don't reset consecutive_failures — next iteration will try again
                }
            }
        }

        // Publish updated health (ignore error if all receivers dropped)
        let _ = health_tx.send(health.clone());
    }
}
368
/// A running sandboxed MCP server with its associated proxy and client.
///
/// Shut down via `shutdown()` / `teardown()`.
/// NOTE(review): there is no `Drop` impl here and the warden child does not
/// appear to be spawned with `kill_on_drop`, so merely dropping this struct
/// likely leaves the container process running — confirm before relying on
/// drop-based cleanup.
pub struct SandboxedMcpServer {
    /// MCP client connected via the per-session proxy.
    pub client: Arc<McpClient>,
    /// Tools available from the sandboxed server.
    pub tools: Vec<stakai::Tool>,
    /// Channel to shut down the per-session proxy.
    proxy_shutdown_tx: broadcast::Sender<()>,
    /// The warden container child process.
    container_process: Child,
}
382
impl SandboxedMcpServer {
    /// Spawn a sandboxed MCP server inside a warden container and connect to it.
    ///
    /// 1. Generates a client mTLS identity (private key stays in host memory)
    /// 2. Passes the client CA cert (public) to the container via env var
    /// 3. Spawns `warden wrap <image> -- stakpak mcp start`
    /// 4. Parses the server CA cert (public) from the container's stdout
    /// 5. Builds a client TLS config trusting the server CA, using the client key
    /// 6. Starts a per-session MCP proxy pointing to the container
    /// 7. Connects a client to the proxy
    ///
    /// # Errors
    /// Returns a human-readable message for the first step that failed;
    /// container bootstrap failures are enriched with the process exit status
    /// and a stderr excerpt.
    pub async fn spawn(config: &SandboxConfig) -> Result<Self, String> {
        // 1. Generate client identity — private key stays in host memory
        let client_identity = MtlsIdentity::generate_client()
            .map_err(|e| format!("Failed to generate client identity: {e}"))?;

        let client_ca_pem = client_identity
            .ca_cert_pem()
            .map_err(|e| format!("Failed to get client CA PEM: {e}"))?;

        // 2. Find a free port for the container's MCP server to expose
        let container_host_port = find_free_port()
            .await
            .map_err(|e| format!("Failed to find free port for sandbox: {e}"))?;

        // 3. Spawn warden container, passing client CA cert (public) via env var
        let mut container_process =
            spawn_warden_container(config, container_host_port, &client_ca_pem)
                .await
                .map_err(|e| format!("Failed to spawn sandbox container: {e}"))?;

        // 4. Parse the server CA cert (public) from the container's stdout
        let server_ca_pem = match parse_server_ca_from_stdout(&mut container_process).await {
            Ok(server_ca_pem) => server_ca_pem,
            Err(base_message) => {
                // Enrich the error with exit status + stderr before bailing.
                let error = sandbox_bootstrap_error(&mut container_process, &base_message).await;
                return Err(error);
            }
        };
        tracing::info!(
            "Parsed server CA from container stdout ({} bytes)",
            server_ca_pem.len()
        );

        // 5. Build client TLS config — trusts server CA, authenticates with our key
        let container_client_config = client_identity
            .create_client_config(&server_ca_pem)
            .map_err(|e| format!("Failed to create client TLS config: {e}"))?;

        // 6. Wait for the MCP server inside the container to be ready
        let server_url = format!("https://127.0.0.1:{container_host_port}/mcp");
        tracing::info!(url = %server_url, "Waiting for sandbox MCP server to be ready");
        wait_for_server_ready(&server_url, &container_client_config).await?;
        tracing::info!("Sandbox MCP server is ready");

        // 7. Start a per-session proxy connecting to the sandboxed server
        let (proxy_shutdown_tx, proxy_shutdown_rx) = broadcast::channel::<()>(1);

        let proxy_binding = find_available_binding("sandbox proxy").await?;
        let proxy_url = format!("https://{}/mcp", proxy_binding.address);

        let proxy_cert_chain = Arc::new(
            CertificateChain::generate()
                .map_err(|e| format!("Failed to generate proxy certificates: {e}"))?,
        );

        let pool_config = build_sandbox_proxy_config(server_url, Arc::new(container_client_config));

        // The proxy runs as a detached task; it exits when `proxy_shutdown_rx`
        // fires (see `teardown`).
        let proxy_chain_for_server = proxy_cert_chain.clone();
        let proxy_listener = proxy_binding.listener;
        tokio::spawn(async move {
            if let Err(e) = start_proxy_server(
                pool_config,
                proxy_listener,
                proxy_chain_for_server,
                true,  // redact_secrets
                false, // privacy_mode
                Some(proxy_shutdown_rx),
            )
            .await
            {
                tracing::error!("Sandbox proxy error: {e}");
            }
        });

        // Small delay for proxy to start
        // NOTE(review): a fixed 100ms is a race — if the proxy task is slow to
        // start, the connect below may fail; a readiness signal from
        // start_proxy_server would be more robust.
        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

        // 8. Connect client to proxy
        let client = connect_to_proxy(&proxy_url, proxy_cert_chain).await?;

        // 9. Get tools — also serves as an end-to-end smoke test of the
        // client → proxy → container path before we return success.
        let mcp_tools = stakpak_mcp_client::get_tools(&client)
            .await
            .map_err(|e| format!("Failed to get sandbox tools: {e}"))?;

        // Convert MCP tool descriptors into the agent's `stakai::Tool` shape.
        let tools = mcp_tools
            .into_iter()
            .map(|tool| stakai::Tool {
                tool_type: "function".to_string(),
                function: stakai::ToolFunction {
                    name: tool.name.as_ref().to_string(),
                    description: tool
                        .description
                        .as_ref()
                        .map(std::string::ToString::to_string)
                        .unwrap_or_default(),
                    parameters: serde_json::Value::Object((*tool.input_schema).clone()),
                },
                provider_options: None,
            })
            .collect();

        Ok(Self {
            client,
            tools,
            proxy_shutdown_tx,
            container_process,
        })
    }

    /// Shut down the sandbox: stop the proxy and kill the container.
    pub async fn shutdown(mut self) {
        self.teardown().await;
    }

    /// Stop the proxy and gracefully shut down the warden container process.
    ///
    /// Sends SIGINT (not SIGTERM) because warden listens for `ctrl_c` (SIGINT)
    /// to trigger its cleanup — stopping the user container and sidecar.
    /// Waits up to 10s for warden to finish, then force-kills.
    pub async fn teardown(&mut self) {
        // Stop the per-session proxy (ignore error if it already exited).
        let _ = self.proxy_shutdown_tx.send(());

        // Send SIGINT so warden's `signal::ctrl_c()` handler fires and runs
        // its container cleanup (stop_user_container + cleanup sidecar).
        #[cfg(unix)]
        if let Some(pid) = self.container_process.id() {
            let _ = tokio::process::Command::new("kill")
                .args(["-INT", &pid.to_string()])
                .output()
                .await;
        }

        // Give warden up to 10s to stop containers and exit cleanly.
        match tokio::time::timeout(
            std::time::Duration::from_secs(10),
            self.container_process.wait(),
        )
        .await
        {
            Ok(Ok(status)) => {
                tracing::debug!(exit_status = ?status, "Warden process exited gracefully");
            }
            _ => {
                // Timed out or wait failed — force-kill, then reap.
                tracing::warn!("Warden process did not exit in 10s — force killing");
                let _ = self.container_process.kill().await;
                let _ = self.container_process.wait().await;
            }
        }
    }
}
544
545/// Build the argument list for `warden wrap`.
546///
547/// All options come before the positional `<IMAGE>` argument, then `--` and the
548/// command.  Extracting this into a pure function makes it testable without
549/// spawning a real process.
550fn build_warden_argv(
551    config: &SandboxConfig,
552    host_port: u16,
553    client_ca_pem: &str,
554    env_overrides: &[(&str, &str)],
555) -> Vec<String> {
556    use stakpak_shared::container::{expand_volume_path, is_named_volume};
557
558    let mut args: Vec<String> = vec!["wrap".to_string()];
559
560    // --- options (must come before positional <IMAGE>) ---
561
562    for vol in &config.volumes {
563        let expanded = expand_volume_path(vol);
564        let host_path = expanded.split(':').next().unwrap_or(&expanded);
565        if is_named_volume(host_path) || Path::new(host_path).exists() {
566            args.push("--volume".to_string());
567            args.push(expanded);
568        }
569    }
570
571    // When host-user mapping is requested, start the container as root so the
572    // entrypoint script can patch /etc/passwd, chown $HOME, and then drop
573    // privileges via gosu.  The target UID/GID are passed as env vars.
574    match &config.user_mapping {
575        SandboxUserMapping::HostUser { uid, gid } => {
576            args.push("--user".to_string());
577            args.push("0:0".to_string());
578            args.push("--env".to_string());
579            args.push(format!("STAKPAK_TARGET_UID={uid}"));
580            args.push("--env".to_string());
581            args.push(format!("STAKPAK_TARGET_GID={gid}"));
582        }
583        SandboxUserMapping::ImageDefault => {}
584    }
585
586    args.push("-p".to_string());
587    args.push(format!("127.0.0.1:{host_port}:8080"));
588
589    args.push("--env".to_string());
590    args.push("STAKPAK_SKIP_WARDEN=1".to_string());
591
592    args.push("--env".to_string());
593    args.push("STAKPAK_MCP_PORT=8080".to_string());
594
595    args.push("--env".to_string());
596    args.push(format!("{TRUSTED_CLIENT_CA_ENV}={client_ca_pem}"));
597
598    for (key, value) in env_overrides {
599        args.push("--env".to_string());
600        args.push(format!("{key}={value}"));
601    }
602
603    // --- positional image ---
604    args.push(config.image.clone());
605
606    // --- command after `--` ---
607    // Run through the entrypoint script so that /etc/passwd is patched when
608    // the container is started as root with STAKPAK_TARGET_UID/GID set.
609    args.push("--".to_string());
610    args.push(CONTAINER_ENTRYPOINT.to_string());
611    args.push("/usr/local/bin/stakpak".to_string());
612    args.push("mcp".to_string());
613    args.push("start".to_string());
614
615    args
616}
617
618async fn spawn_warden_container(
619    config: &SandboxConfig,
620    host_port: u16,
621    client_ca_pem: &str,
622) -> Result<Child, String> {
623    // Collect env-var pass-throughs
624    let mut env_pairs: Vec<(String, String)> = Vec::new();
625    for var in &["STAKPAK_API_KEY", "STAKPAK_PROFILE", "STAKPAK_API_ENDPOINT"] {
626        if let Ok(val) = std::env::var(var) {
627            env_pairs.push((var.to_string(), val));
628        }
629    }
630
631    let env_refs: Vec<(&str, &str)> = env_pairs
632        .iter()
633        .map(|(k, v)| (k.as_str(), v.as_str()))
634        .collect();
635    let argv = build_warden_argv(config, host_port, client_ca_pem, &env_refs);
636
637    let mut cmd = tokio::process::Command::new(&config.warden_path);
638    cmd.args(&argv);
639    cmd.stdout(std::process::Stdio::piped());
640    cmd.stderr(std::process::Stdio::piped());
641    cmd.stdin(std::process::Stdio::null());
642
643    let child = cmd
644        .spawn()
645        .map_err(|e| format!("Failed to spawn warden process: {e}"))?;
646
647    Ok(child)
648}
649
650/// Parse the server CA certificate PEM from the container's stdout.
651///
652/// The MCP server outputs the server CA cert between structured delimiters:
653/// ```text
654/// ---BEGIN STAKPAK SERVER CA---
655/// -----BEGIN CERTIFICATE-----
656/// ...
657/// -----END CERTIFICATE-----
658/// ---END STAKPAK SERVER CA---
659/// ```
660async fn parse_server_ca_from_stdout(process: &mut Child) -> Result<String, String> {
661    let stdout = process
662        .stdout
663        .take()
664        .ok_or_else(|| "Container stdout not captured".to_string())?;
665
666    let mut reader = tokio::io::BufReader::new(stdout);
667    let mut server_ca_pem = String::new();
668    let mut in_server_ca = false;
669    let mut line = String::new();
670
671    let timeout_duration = tokio::time::Duration::from_secs(60);
672    let deadline = tokio::time::Instant::now() + timeout_duration;
673
674    tracing::debug!("Starting to read container stdout for server CA...");
675
676    loop {
677        line.clear();
678        let bytes_read = tokio::time::timeout_at(deadline, reader.read_line(&mut line))
679            .await
680            .map_err(|_| {
681                "Timed out waiting for container to output server CA certificate".to_string()
682            })?
683            .map_err(|e| format!("Failed to read container stdout: {e}"))?;
684
685        if bytes_read == 0 {
686            tracing::error!("Container stdout EOF before server CA was found");
687            return Err("Container exited before outputting server CA certificate".to_string());
688        }
689
690        let trimmed = line.trim();
691        tracing::debug!(line = %trimmed, bytes = bytes_read, "Read line from container stdout");
692
693        if trimmed == "---BEGIN STAKPAK SERVER CA---" {
694            in_server_ca = true;
695            continue;
696        }
697
698        if trimmed == "---END STAKPAK SERVER CA---" {
699            tracing::debug!("Found end of server CA block");
700            break;
701        }
702
703        if in_server_ca {
704            server_ca_pem.push_str(trimmed);
705            server_ca_pem.push('\n');
706        }
707    }
708
709    let server_ca_pem = server_ca_pem.trim().to_string();
710
711    if server_ca_pem.is_empty() {
712        return Err("Failed to parse server CA certificate from container output".to_string());
713    }
714
715    Ok(server_ca_pem)
716}
717
718async fn sandbox_bootstrap_error(process: &mut Child, base_message: &str) -> String {
719    let exit_status = ensure_process_exited(process).await;
720    let stderr_excerpt = read_stderr_excerpt(process, 4096).await;
721    format_bootstrap_error(base_message, exit_status, stderr_excerpt.as_deref())
722}
723
724async fn ensure_process_exited(process: &mut Child) -> Option<ExitStatus> {
725    if let Ok(Some(status)) = process.try_wait() {
726        return Some(status);
727    }
728
729    let _ = process.kill().await;
730    process.wait().await.ok()
731}
732
733async fn read_stderr_excerpt(process: &mut Child, max_bytes: usize) -> Option<String> {
734    let stderr = process.stderr.take()?;
735    let mut limited = stderr.take(max_bytes as u64);
736    let mut bytes = Vec::with_capacity(max_bytes.min(8192));
737
738    if limited.read_to_end(&mut bytes).await.is_err() {
739        return None;
740    }
741
742    if bytes.is_empty() {
743        return None;
744    }
745
746    let text = String::from_utf8_lossy(&bytes).trim().to_string();
747    if text.is_empty() {
748        return None;
749    }
750
751    Some(truncate_chars(&text, max_bytes))
752}
753
/// Compose the bootstrap error message from the base message, the container's
/// exit status (if known), and a non-empty stderr excerpt (if any).
fn format_bootstrap_error(
    base_message: &str,
    exit_status: Option<ExitStatus>,
    stderr_excerpt: Option<&str>,
) -> String {
    let mut message = base_message.to_string();

    if let Some(status) = exit_status {
        let rendered = match status.code() {
            Some(code) => code.to_string(),
            // No code on Unix means the process was signal-terminated.
            None => "terminated by signal".to_string(),
        };
        message.push_str("\nExit status: ");
        message.push_str(&rendered);
    }

    if let Some(excerpt) = stderr_excerpt.filter(|value| !value.trim().is_empty()) {
        message.push_str("\n\nContainer stderr:\n");
        message.push_str(excerpt);
    }

    message
}
777
/// Truncate `value` to at most `max_chars` characters (not bytes), appending
/// `"..."` when truncation occurs.
///
/// The result never exceeds `max_chars` characters: when `max_chars <= 3`
/// there is no room for the ellipsis, so the string is hard-truncated without
/// one. (The previous implementation returned the 3-char `"..."` even for
/// `max_chars < 3`, exceeding the requested limit.)
fn truncate_chars(value: &str, max_chars: usize) -> String {
    if value.chars().count() <= max_chars {
        return value.to_string();
    }

    if max_chars <= 3 {
        // Ellipsis alone would not fit — hard truncate to the limit.
        return value.chars().take(max_chars).collect();
    }

    let mut truncated: String = value.chars().take(max_chars - 3).collect();
    truncated.push_str("...");
    truncated
}
787
788async fn wait_for_server_ready(
789    url: &str,
790    client_config: &rustls::ClientConfig,
791) -> Result<(), String> {
792    let http_client = reqwest::Client::builder()
793        .use_preconfigured_tls(client_config.clone())
794        .build()
795        .map_err(|e| format!("Failed to build readiness check client: {e}"))?;
796
797    let mut last_error = String::new();
798    for attempt in 0..30 {
799        tokio::time::sleep(tokio::time::Duration::from_millis(if attempt < 5 {
800            500
801        } else {
802            1000
803        }))
804        .await;
805
806        match http_client.get(url).send().await {
807            Ok(_) => {
808                tracing::info!(attempt, "Sandbox MCP server ready");
809                return Ok(());
810            }
811            Err(e) => {
812                last_error = format!("{e:?}");
813                tracing::debug!(attempt, error = %last_error, "Readiness check failed");
814            }
815        }
816    }
817
818    Err(format!(
819        "Sandbox MCP server failed to become ready after 30 attempts: {last_error}"
820    ))
821}
822
/// A pre-bound localhost listener paired with its printable address.
struct ProxyBinding {
    // "127.0.0.1:<port>" — suitable for building URLs for the proxy.
    address: String,
    // Kept open so the port stays reserved until handed to the proxy server.
    listener: TcpListener,
}
827
828async fn find_available_binding(purpose: &str) -> Result<ProxyBinding, String> {
829    let listener = TcpListener::bind("127.0.0.1:0")
830        .await
831        .map_err(|e| format!("Failed to bind port for {purpose}: {e}"))?;
832    let addr = listener
833        .local_addr()
834        .map_err(|e| format!("Failed to get address for {purpose}: {e}"))?;
835    Ok(ProxyBinding {
836        address: addr.to_string(),
837        listener,
838    })
839}
840
841// TODO: TOCTOU race — between dropping the listener and Docker binding the port,
842// another process could claim it. Consider retrying with a different port on bind
843// failure, or passing the listener fd directly if Docker supports it.
844async fn find_free_port() -> Result<u16, String> {
845    let listener = TcpListener::bind("127.0.0.1:0")
846        .await
847        .map_err(|e| format!("Failed to bind ephemeral port: {e}"))?;
848    let port = listener
849        .local_addr()
850        .map_err(|e| format!("Failed to get ephemeral port: {e}"))?
851        .port();
852    // Drop the listener to free the port for Docker to use
853    drop(listener);
854    Ok(port)
855}
856
857fn build_sandbox_proxy_config(
858    sandbox_server_url: String,
859    client_tls_config: Arc<rustls::ClientConfig>,
860) -> ClientPoolConfig {
861    let mut servers: HashMap<String, ServerConfig> = HashMap::new();
862
863    // Register the sandboxed MCP server under the same name ("stakpak") so
864    // tool names like `stakpak__run_command` route correctly through the proxy.
865    servers.insert(
866        "stakpak".to_string(),
867        ServerConfig::Http {
868            url: sandbox_server_url,
869            headers: None,
870            certificate_chain: Arc::new(None),
871            client_tls_config: Some(client_tls_config),
872        },
873    );
874
875    // Keep the external paks server accessible
876    servers.insert(
877        "paks".to_string(),
878        ServerConfig::Http {
879            url: "https://apiv2.stakpak.dev/v1/paks/mcp".to_string(),
880            headers: None,
881            certificate_chain: Arc::new(None),
882            client_tls_config: None,
883        },
884    );
885
886    ClientPoolConfig::with_servers(servers)
887}
888
889async fn connect_to_proxy(
890    proxy_url: &str,
891    cert_chain: Arc<CertificateChain>,
892) -> Result<Arc<McpClient>, String> {
893    const MAX_RETRIES: u32 = 5;
894    let mut retry_delay = tokio::time::Duration::from_millis(50);
895    let mut last_error = None;
896
897    for attempt in 1..=MAX_RETRIES {
898        match stakpak_mcp_client::connect_https(proxy_url, Some(cert_chain.clone()), None).await {
899            Ok(client) => return Ok(Arc::new(client)),
900            Err(e) => {
901                last_error = Some(e);
902                if attempt < MAX_RETRIES {
903                    tokio::time::sleep(retry_delay).await;
904                    retry_delay *= 2;
905                }
906            }
907        }
908    }
909
910    Err(format!(
911        "Failed to connect to sandbox proxy after {MAX_RETRIES} retries: {}",
912        last_error.map(|e| e.to_string()).unwrap_or_default()
913    ))
914}
915
#[cfg(test)]
mod tests {
    // Mirrors the sentinel-based parsing in `parse_server_ca_from_stdout`:
    // only lines strictly between the BEGIN/END STAKPAK SERVER CA markers
    // belong to the CA PEM. NOTE(review): this re-implements the parse loop
    // rather than calling the real function — keep the two in sync.
    #[test]
    fn parse_server_ca_from_structured_output() {
        let output = "\
🔐 mTLS enabled - independent identity (sandbox mode)
---BEGIN STAKPAK SERVER CA---
-----BEGIN CERTIFICATE-----
MIIB0zCCAXmgAwIBAgIUFAKE=
-----END CERTIFICATE-----
---END STAKPAK SERVER CA---
MCP server started at https://0.0.0.0:8080/mcp
";

        let expected_ca = "\
-----BEGIN CERTIFICATE-----
MIIB0zCCAXmgAwIBAgIUFAKE=
-----END CERTIFICATE-----";

        // Parse the same way parse_server_ca_from_stdout does
        let mut server_ca_pem = String::new();
        let mut in_server_ca = false;

        for line in output.lines() {
            let trimmed = line.trim();
            if trimmed == "---BEGIN STAKPAK SERVER CA---" {
                in_server_ca = true;
                continue;
            }
            if trimmed == "---END STAKPAK SERVER CA---" {
                break;
            }
            if in_server_ca {
                server_ca_pem.push_str(trimmed);
                server_ca_pem.push('\n');
            }
        }

        assert_eq!(server_ca_pem.trim(), expected_ca);
    }

    // Stderr excerpts must be appended under a "Container stderr:" header.
    #[test]
    fn format_bootstrap_error_includes_stderr() {
        let message = super::format_bootstrap_error(
            "Container exited before outputting server CA certificate",
            None,
            Some("Failed to load config: Permission denied (os error 13)"),
        );

        assert!(message.contains("Container stderr:"));
        assert!(message.contains("Permission denied"));
    }

    // Unix-only: ExitStatus::from_raw lets us fabricate a wait status.
    #[cfg(unix)]
    #[test]
    fn format_bootstrap_error_includes_exit_status() {
        use std::os::unix::process::ExitStatusExt;

        // 7 << 8 is a raw Unix wait status encoding exit code 7.
        let status = std::process::ExitStatus::from_raw(7 << 8);
        let message = super::format_bootstrap_error("bootstrap failed", Some(status), None);

        assert!(message.contains("Exit status: 7"));
    }

    // End-to-end check of the mTLS key-exchange design described in the
    // module docs: each side generates its own identity and only public CA
    // certs cross the boundary.
    #[test]
    fn mtls_identity_cross_trust() {
        use stakpak_shared::cert_utils::MtlsIdentity;

        // Ensure a crypto provider is installed for rustls
        let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();

        // Simulate the sandbox mTLS exchange
        let client_identity = MtlsIdentity::generate_client().expect("generate client identity");
        let server_identity = MtlsIdentity::generate_server().expect("generate server identity");

        let client_ca_pem = client_identity.ca_cert_pem().expect("client CA PEM");
        let server_ca_pem = server_identity.ca_cert_pem().expect("server CA PEM");

        // Server trusts client CA, client trusts server CA
        let _server_config = server_identity
            .create_server_config(&client_ca_pem)
            .expect("server config with client CA trust");
        let _client_config = client_identity
            .create_client_config(&server_ca_pem)
            .expect("client config with server CA trust");

        // Only public CA certs were exchanged — private keys stayed in their
        // respective MtlsIdentity structs.
        assert!(client_ca_pem.contains("BEGIN CERTIFICATE"));
        assert!(server_ca_pem.contains("BEGIN CERTIFICATE"));
        assert!(!client_ca_pem.contains("PRIVATE KEY"));
        assert!(!server_ca_pem.contains("PRIVATE KEY"));
    }

    // ── Named volume detection in expand_volume_path / mount filter ────

    // Docker named volumes (e.g. "vol:/path") have no host path to expand.
    #[test]
    fn expand_volume_path_leaves_named_volumes_unchanged() {
        use stakpak_shared::container::expand_volume_path;
        let named = "stakpak-aqua-cache:/home/agent/.local/share/aquaproj-aqua";
        assert_eq!(expand_volume_path(named), named);
    }

    /// Named volumes (no `/` or `.` prefix in the host part) must pass the
    /// mount filter even though they don't exist on the host filesystem.
    #[test]
    fn named_volume_is_detected_correctly() {
        use stakpak_shared::container::is_named_volume;
        let cases = vec![
            ("stakpak-aqua-cache", true),
            ("my-volume", true),
            ("./relative/path", false),
            ("/absolute/path", false),
            ("relative/with/slash", false),
            (".", false),
        ];
        for (host_part, expected) in cases {
            assert_eq!(
                is_named_volume(host_part),
                expected,
                "host_part={host_part:?} expected named={expected}"
            );
        }
    }

    // HostUser mapping: the container must start as root (0:0) so the
    // entrypoint can set up the user, with the real UID/GID passed via env.
    #[test]
    fn host_user_mapping_starts_as_root_with_target_env() {
        let config = super::SandboxConfig {
            warden_path: "warden".to_string(),
            image: "img:latest".to_string(),
            volumes: vec![],
            mode: super::SandboxMode::Persistent,
            user_mapping: super::SandboxUserMapping::HostUser {
                uid: 1001,
                gid: 1001,
            },
        };
        let argv = super::build_warden_argv(&config, 8080, "CA", &[]);
        // Container starts as root
        assert!(argv.contains(&"0:0".to_string()));
        // Target UID/GID passed as env vars for the entrypoint
        assert!(argv.contains(&"STAKPAK_TARGET_UID=1001".to_string()));
        assert!(argv.contains(&"STAKPAK_TARGET_GID=1001".to_string()));
    }

    // Matches the "Persistent (default)" claim in the module docs.
    #[test]
    fn sandbox_mode_default_is_persistent() {
        assert_eq!(
            super::SandboxMode::default(),
            super::SandboxMode::Persistent
        );
    }

    // Serde names, the #[serde(default)] path, and Display formatting.
    #[test]
    fn sandbox_mode_serde_roundtrip() {
        #[derive(serde::Serialize, serde::Deserialize)]
        struct Wrapper {
            #[serde(default)]
            mode: super::SandboxMode,
        }

        // Explicit persistent
        let json = serde_json::json!({"mode": "persistent"});
        let w: Wrapper = serde_json::from_value(json).expect("deserialize persistent");
        assert_eq!(w.mode, super::SandboxMode::Persistent);

        // Explicit ephemeral
        let json = serde_json::json!({"mode": "ephemeral"});
        let w: Wrapper = serde_json::from_value(json).expect("deserialize ephemeral");
        assert_eq!(w.mode, super::SandboxMode::Ephemeral);

        // Missing field → default (persistent)
        let json = serde_json::json!({});
        let w: Wrapper = serde_json::from_value(json).expect("deserialize default");
        assert_eq!(w.mode, super::SandboxMode::Persistent);

        // Display
        assert_eq!(super::SandboxMode::Persistent.to_string(), "persistent");
        assert_eq!(super::SandboxMode::Ephemeral.to_string(), "ephemeral");
    }

    // ── Warden argument ordering ───────────────────────────────────────

    // Warden/docker-style CLIs require: [options...] IMAGE -- COMMAND...
    // This pins that ordering so a refactor can't silently break spawning.
    #[test]
    fn warden_argv_options_before_image() {
        let config = super::SandboxConfig {
            warden_path: "warden".to_string(),
            image: "ghcr.io/stakpak/agent:latest".to_string(),
            volumes: vec!["named-vol:/data".to_string()],
            mode: super::SandboxMode::Persistent,
            user_mapping: super::SandboxUserMapping::HostUser {
                uid: 1000,
                gid: 1001,
            },
        };

        let argv = super::build_warden_argv(&config, 9999, "FAKE_CA_PEM", &[("MY_VAR", "val")]);

        // Find where the positional image argument sits
        let image_pos = argv
            .iter()
            .position(|a| a == "ghcr.io/stakpak/agent:latest")
            .expect("image arg must be present");

        // Find where `--` sits (start of the command section)
        let dash_pos = argv
            .iter()
            .position(|a| a == "--")
            .expect("-- separator must be present");

        // All option flags must come before the image
        for (i, arg) in argv.iter().enumerate() {
            if arg.starts_with('-') && arg != "--" {
                assert!(
                    i < image_pos,
                    "option '{}' at position {} must come before image at position {}",
                    arg,
                    i,
                    image_pos
                );
            }
        }

        // Image must come before `--`
        assert!(image_pos < dash_pos, "image must come before -- separator");

        // Command after `--` must be `entrypoint.sh stakpak mcp start`
        let command_section: Vec<&str> = argv[dash_pos + 1..].iter().map(|s| s.as_str()).collect();
        assert_eq!(
            command_section,
            vec![
                "/home/agent/.local/bin/entrypoint.sh",
                "/usr/local/bin/stakpak",
                "mcp",
                "start"
            ]
        );

        // Verify specific options are present
        assert!(argv.contains(&"--volume".to_string()));
        // HostUser mapping starts container as root; target UID/GID are env vars
        assert!(argv.contains(&"--user".to_string()));
        assert!(argv.contains(&"0:0".to_string()));
        assert!(argv.contains(&"STAKPAK_TARGET_UID=1000".to_string()));
        assert!(argv.contains(&"STAKPAK_TARGET_GID=1001".to_string()));
        assert!(argv.contains(&"-p".to_string()));
    }

    // ImageDefault mapping must not inject --user or target UID/GID env vars.
    #[test]
    fn warden_argv_no_user_when_image_default() {
        let config = super::SandboxConfig {
            warden_path: "warden".to_string(),
            image: "ghcr.io/stakpak/agent:latest".to_string(),
            volumes: vec![],
            mode: super::SandboxMode::Persistent,
            user_mapping: super::SandboxUserMapping::ImageDefault,
        };

        let argv = super::build_warden_argv(&config, 8080, "CA", &[]);
        assert!(!argv.contains(&"--user".to_string()));
        // No target UID/GID env vars either
        assert!(!argv.iter().any(|a| a.starts_with("STAKPAK_TARGET_UID")));
        assert!(!argv.iter().any(|a| a.starts_with("STAKPAK_TARGET_GID")));
    }

    // A fresh SandboxHealth must assume health with zeroed counters, so the
    // monitor doesn't respawn a container that was never observed failing.
    #[test]
    fn sandbox_health_default_is_healthy() {
        let h = super::SandboxHealth::default();
        assert!(h.healthy);
        assert_eq!(h.consecutive_ok, 0);
        assert_eq!(h.consecutive_failures, 0);
        assert!(h.last_ok.is_none());
        assert!(h.last_error.is_none());
        assert_eq!(h.total_respawn_attempts, 0);
    }
}