Skip to main content

stakpak_server/
sandbox.rs

1//! Sandboxed MCP server management.
2//!
3//! When sandbox mode is enabled for a session, a `stakpak mcp start` server
4//! is spawned inside a warden container. The host-side proxy connects to it
5//! via HTTPS/mTLS, and tool calls from the agent loop are routed through the
6//! containerized server — executing `run_command`, file I/O, etc. inside the
7//! sandbox.
8//!
9//! ## Sandbox Modes
10//!
11//! - **Persistent** (default): A single sandbox container is spawned at process startup and
12//!   reused across all sessions. Near-zero per-session overhead, slightly less
13//!   isolation (sessions share the same container filesystem).
14//!
15//! - **Ephemeral**: A new sandbox container is spawned for each session
16//!   and destroyed when the session ends. Maximum isolation, ~5-10s startup overhead.
17//!
18//! ## mTLS key exchange
19//!
20//! Each side generates its own identity independently:
21//!
22//! 1. Host generates a client identity (CA + leaf cert + key, all in memory)
23//! 2. Host passes the client **CA cert** (public only) to the container via env var
24//! 3. Container generates a server identity (CA + leaf cert + key, all in memory)
25//! 4. Container outputs the server **CA cert** (public only) to stdout
26//! 5. Host parses the server CA cert and builds a client TLS config
27//!
28//! Private keys never leave their respective processes.
29
30use serde::{Deserialize, Serialize};
31use stakpak_mcp_client::McpClient;
32use stakpak_mcp_proxy::client::{ClientPoolConfig, ServerConfig};
33use stakpak_mcp_proxy::server::start_proxy_server;
34use stakpak_shared::cert_utils::{CertificateChain, MtlsIdentity};
35use std::collections::HashMap;
36use std::path::Path;
37use std::sync::Arc;
38use tokio::io::AsyncBufReadExt;
39use tokio::net::TcpListener;
40use tokio::process::Child;
41use tokio::sync::{broadcast, watch};
42
43/// Environment variable used to pass the client CA cert PEM to the container.
44const TRUSTED_CLIENT_CA_ENV: &str = "STAKPAK_MCP_CLIENT_CA";
45
46// ── Sandbox mode ────────────────────────────────────────────────────────────
47
48/// Controls how sandbox containers are managed across sessions.
/// Controls how sandbox containers are managed across sessions.
///
/// Serialized in lowercase (`"ephemeral"` / `"persistent"`), matching the
/// `Display` impl for this enum.
#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum SandboxMode {
    /// Spawn a new sandbox container for each session and destroy it when the
    /// session ends. Maximum isolation, ~5-10s startup overhead per session.
    Ephemeral,
    /// Spawn a single sandbox container at process startup and reuse it for all
    /// sessions. Near-zero per-session overhead, shared container filesystem.
    #[default]
    Persistent,
}
60
61impl std::fmt::Display for SandboxMode {
62    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
63        match self {
64            SandboxMode::Ephemeral => write!(f, "ephemeral"),
65            SandboxMode::Persistent => write!(f, "persistent"),
66        }
67    }
68}
69
70// ── Sandbox config ──────────────────────────────────────────────────────────
71
/// Configuration for spawning sandboxed MCP servers.
#[derive(Clone, Debug)]
pub struct SandboxConfig {
    /// Path to the warden binary on the host.
    pub warden_path: String,
    /// Container image for the sandbox (e.g., `ghcr.io/stakpak/agent:v1.2.3`).
    pub image: String,
    /// Volume mounts for the container (e.g., `["./:/agent:ro"]`). Bind mounts
    /// are only applied when the host path exists; named volumes are always
    /// mounted (see `spawn_warden_container`).
    pub volumes: Vec<String>,
    /// How sandbox containers are managed across sessions.
    pub mode: SandboxMode,
}
84
85// ── Sandbox health ──────────────────────────────────────────────────────────
86
/// Health status of a persistent sandbox, updated by the background monitor.
///
/// Published through a `tokio::sync::watch` channel so readers always see the
/// latest snapshot without blocking the monitor.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SandboxHealth {
    /// Whether the last health check succeeded.
    pub healthy: bool,
    /// Number of consecutive successful health checks.
    pub consecutive_ok: u64,
    /// Number of consecutive failed health checks (reset on success or
    /// successful respawn).
    pub consecutive_failures: u64,
    /// ISO 8601 timestamp of the last successful health check.
    pub last_ok: Option<String>,
    /// Error message from the last failed health check, if any.
    pub last_error: Option<String>,
    /// Total number of respawn attempts since the sandbox was started.
    pub total_respawn_attempts: u64,
}
103
impl Default for SandboxHealth {
    /// Start optimistically healthy.
    ///
    /// Not derivable: `healthy` must begin `true` (the monitor assumes the
    /// sandbox is fine until a check actually fails), while `bool::default()`
    /// is `false`.
    fn default() -> Self {
        Self {
            healthy: true,
            consecutive_ok: 0,
            consecutive_failures: 0,
            last_ok: None,
            last_error: None,
            total_respawn_attempts: 0,
        }
    }
}
116
/// Interval between health checks for persistent sandboxes.
const HEALTH_CHECK_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);

/// Number of consecutive failures before attempting a respawn.
/// With a 30s check interval this means ~90s of sustained failure before the
/// first respawn attempt.
const RESPAWN_THRESHOLD: u64 = 3;

/// Maximum number of total respawn attempts before giving up and shutting down.
const MAX_RESPAWN_ATTEMPTS: u64 = 5;
125
126// ── Persistent sandbox ──────────────────────────────────────────────────────
127
/// A long-lived sandbox that persists across sessions.
///
/// Spawned once at process startup (when `SandboxMode::Persistent` is configured)
/// and shared by all sessions. Includes a background health monitor that
/// periodically pings the sandbox and attempts to respawn it on failure.
pub struct PersistentSandbox {
    /// The sandbox itself; write-locked while the monitor replaces it or
    /// `kill` tears it down.
    inner: Arc<tokio::sync::RwLock<SandboxedMcpServer>>,
    /// Kept so accessors can report the configured mode.
    config: SandboxConfig,
    /// Receiving side of the monitor's health snapshots.
    health_rx: watch::Receiver<SandboxHealth>,
    /// Handle to the background health monitor task.
    monitor_handle: tokio::task::JoinHandle<()>,
}
140
141impl PersistentSandbox {
142    /// Spawn a persistent sandbox with a background health monitor.
143    pub async fn spawn(config: &SandboxConfig) -> Result<Self, String> {
144        tracing::info!(image = %config.image, "Spawning persistent sandbox container");
145        let inner = SandboxedMcpServer::spawn(config).await?;
146        tracing::info!("Persistent sandbox ready");
147
148        let initial_health = SandboxHealth {
149            healthy: true,
150            consecutive_ok: 1,
151            consecutive_failures: 0,
152            last_ok: Some(chrono::Utc::now().to_rfc3339()),
153            last_error: None,
154            total_respawn_attempts: 0,
155        };
156        let (health_tx, health_rx) = watch::channel(initial_health);
157
158        let inner = Arc::new(tokio::sync::RwLock::new(inner));
159        let monitor_inner = inner.clone();
160        let monitor_config = config.clone();
161
162        let monitor_handle = tokio::spawn(async move {
163            health_monitor_loop(monitor_inner, monitor_config, health_tx).await;
164        });
165
166        Ok(Self {
167            inner,
168            config: config.clone(),
169            health_rx,
170            monitor_handle,
171        })
172    }
173
174    /// Get the MCP client for routing tool calls through this sandbox.
175    pub async fn client(&self) -> Arc<McpClient> {
176        self.inner.read().await.client.clone()
177    }
178
179    /// Get the tools available from this sandbox.
180    pub async fn tools(&self) -> Vec<stakai::Tool> {
181        self.inner.read().await.tools.clone()
182    }
183
184    /// Get the current health status (non-blocking snapshot).
185    pub fn health(&self) -> SandboxHealth {
186        self.health_rx.borrow().clone()
187    }
188
189    /// Get the sandbox mode from the config.
190    pub fn mode(&self) -> &SandboxMode {
191        &self.config.mode
192    }
193
194    /// Shut down the persistent sandbox and its health monitor.
195    pub async fn shutdown(self) {
196        tracing::info!("Shutting down persistent sandbox");
197        self.monitor_handle.abort();
198        // Try to take exclusive ownership of the inner sandbox for clean shutdown.
199        // If other sessions still hold references, the RwLock + Arc will be dropped
200        // when all references are gone; the container process will be cleaned up by
201        // the OS when the host process exits.
202        if let Ok(inner) = Arc::try_unwrap(self.inner) {
203            let sandbox = inner.into_inner();
204            sandbox.shutdown().await;
205        } else {
206            tracing::warn!(
207                "Other references to persistent sandbox still exist; container will be cleaned up on process exit"
208            );
209        }
210    }
211
212    /// Force-kill the sandbox container and abort the health monitor.
213    ///
214    /// Unlike `shutdown(self)`, this works through a shared reference so it can
215    /// be called from the graceful-shutdown handler where only `Arc<Self>` is
216    /// available. The container process is killed via the write lock.
217    pub async fn kill(&self) {
218        tracing::warn!(
219            "Killing persistent sandbox container — in-flight sessions using this sandbox will fail"
220        );
221        self.monitor_handle.abort();
222        self.inner.write().await.teardown().await;
223        tracing::info!("Persistent sandbox container killed");
224    }
225}
226
/// Background loop that periodically health-checks the persistent sandbox
/// and attempts to respawn it after consecutive failures.
///
/// Runs forever unless the task is aborted or `MAX_RESPAWN_ATTEMPTS` is
/// exceeded — in which case the loop returns and the sandbox stays dead.
/// Snapshots are published via `health_tx`; send errors (all receivers
/// dropped) are ignored.
async fn health_monitor_loop(
    inner: Arc<tokio::sync::RwLock<SandboxedMcpServer>>,
    config: SandboxConfig,
    health_tx: watch::Sender<SandboxHealth>,
) {
    // Local accumulator, published to the watch channel after each iteration.
    // NOTE(review): starts from Default (counters zeroed) rather than the
    // initial snapshot spawn() published (consecutive_ok = 1); the first
    // successful check publishes consecutive_ok = 1 again, so the discrepancy
    // is cosmetic.
    let mut health = SandboxHealth::default();

    loop {
        tokio::time::sleep(HEALTH_CHECK_INTERVAL).await;

        // Probe under the read lock so an in-progress respawn (write lock)
        // is never observed half-done.
        let check_result = {
            let sandbox = inner.read().await;
            // Use list_tools as a health probe — it exercises the full
            // mTLS → proxy → container → MCP server path.
            tokio::time::timeout(
                std::time::Duration::from_secs(10),
                stakpak_mcp_client::get_tools(&sandbox.client),
            )
            .await
        };

        // Outer Result is the timeout, inner Result is the MCP call itself.
        match check_result {
            Ok(Ok(_tools)) => {
                health.healthy = true;
                health.consecutive_ok = health.consecutive_ok.saturating_add(1);
                health.consecutive_failures = 0;
                health.last_ok = Some(chrono::Utc::now().to_rfc3339());
                health.last_error = None;
                tracing::debug!(
                    consecutive_ok = health.consecutive_ok,
                    "Persistent sandbox health check passed"
                );
            }
            Ok(Err(e)) => {
                let err_msg = format!("MCP error: {e}");
                health.healthy = false;
                health.consecutive_ok = 0;
                health.consecutive_failures = health.consecutive_failures.saturating_add(1);
                health.last_error = Some(err_msg.clone());
                tracing::warn!(
                    consecutive_failures = health.consecutive_failures,
                    error = %err_msg,
                    "Persistent sandbox health check failed"
                );
            }
            Err(_timeout) => {
                health.healthy = false;
                health.consecutive_ok = 0;
                health.consecutive_failures = health.consecutive_failures.saturating_add(1);
                health.last_error = Some("Health check timed out (10s)".to_string());
                tracing::warn!(
                    consecutive_failures = health.consecutive_failures,
                    "Persistent sandbox health check timed out"
                );
            }
        }

        // Attempt respawn after RESPAWN_THRESHOLD consecutive failures
        if health.consecutive_failures >= RESPAWN_THRESHOLD {
            health.total_respawn_attempts = health.total_respawn_attempts.saturating_add(1);

            if health.total_respawn_attempts > MAX_RESPAWN_ATTEMPTS {
                tracing::error!(
                    total_attempts = health.total_respawn_attempts,
                    "Persistent sandbox exceeded maximum respawn attempts ({}) — giving up. \
                     The server cannot operate without a healthy sandbox. Shutting down.",
                    MAX_RESPAWN_ATTEMPTS
                );
                health.last_error = Some(format!(
                    "Exceeded max respawn attempts ({}); sandbox permanently failed",
                    MAX_RESPAWN_ATTEMPTS
                ));
                let _ = health_tx.send(health);
                // Exit the monitor loop — the sandbox is unrecoverable.
                // The server health endpoint will report unhealthy, and operators
                // should investigate and restart the autopilot.
                return;
            }

            tracing::error!(
                failures = health.consecutive_failures,
                attempt = health.total_respawn_attempts,
                max_attempts = MAX_RESPAWN_ATTEMPTS,
                "Persistent sandbox unhealthy — attempting respawn"
            );

            // Take write lock to replace the sandbox. This blocks `client()` /
            // `tools()` readers for the full duration of the respawn.
            let mut sandbox = inner.write().await;

            // Shut down the old one (best-effort)
            sandbox.teardown().await;

            match SandboxedMcpServer::spawn(&config).await {
                Ok(new_sandbox) => {
                    *sandbox = new_sandbox;
                    health.healthy = true;
                    health.consecutive_ok = 1;
                    health.consecutive_failures = 0;
                    health.last_ok = Some(chrono::Utc::now().to_rfc3339());
                    health.last_error = None;
                    tracing::info!("Persistent sandbox respawned successfully");
                }
                Err(e) => {
                    health.last_error = Some(format!("Respawn failed: {e}"));
                    tracing::error!(error = %e, "Failed to respawn persistent sandbox");
                    // Don't reset consecutive_failures — next iteration will try again
                }
            }
        }

        // Publish updated health (ignore error if all receivers dropped)
        let _ = health_tx.send(health.clone());
    }
}
343
/// A running sandboxed MCP server with its associated proxy and client.
///
/// Shut down via [`SandboxedMcpServer::shutdown`] (consuming) or
/// [`SandboxedMcpServer::teardown`] (in place). NOTE(review): there is no
/// `Drop` impl in this file — merely dropping this struct does NOT stop the
/// warden container process (the `Child` is not configured to kill on drop);
/// the process is only reaped when the host process exits. Confirm a `Drop`
/// impl does not exist elsewhere before relying on drop-based cleanup.
pub struct SandboxedMcpServer {
    /// MCP client connected via the per-session proxy.
    pub client: Arc<McpClient>,
    /// Tools available from the sandboxed server.
    pub tools: Vec<stakai::Tool>,
    /// Channel to shut down the per-session proxy.
    proxy_shutdown_tx: broadcast::Sender<()>,
    /// The warden container child process.
    container_process: Child,
}
357
impl SandboxedMcpServer {
    /// Spawn a sandboxed MCP server inside a warden container and connect to it.
    ///
    /// 1. Generates a client mTLS identity (private key stays in host memory)
    /// 2. Passes the client CA cert (public) to the container via env var
    /// 3. Spawns `warden wrap <image> -- stakpak mcp start`
    /// 4. Parses the server CA cert (public) from the container's stdout
    /// 5. Builds a client TLS config trusting the server CA, using the client key
    /// 6. Starts a per-session MCP proxy pointing to the container
    /// 7. Connects a client to the proxy
    ///
    /// # Errors
    ///
    /// Returns a human-readable `String` describing the first step that failed.
    /// NOTE(review): if a step after the container spawn fails, the early `?`
    /// return drops `container_process` without killing it — the warden process
    /// is leaked until host-process exit. Consider tearing it down on the error
    /// path.
    pub async fn spawn(config: &SandboxConfig) -> Result<Self, String> {
        // 1. Generate client identity — private key stays in host memory
        let client_identity = MtlsIdentity::generate_client()
            .map_err(|e| format!("Failed to generate client identity: {e}"))?;

        let client_ca_pem = client_identity
            .ca_cert_pem()
            .map_err(|e| format!("Failed to get client CA PEM: {e}"))?;

        // 2. Find a free port for the container's MCP server to expose
        let container_host_port = find_free_port()
            .await
            .map_err(|e| format!("Failed to find free port for sandbox: {e}"))?;

        // 3. Spawn warden container, passing client CA cert (public) via env var
        let mut container_process =
            spawn_warden_container(config, container_host_port, &client_ca_pem)
                .await
                .map_err(|e| format!("Failed to spawn sandbox container: {e}"))?;

        // 4. Parse the server CA cert (public) from the container's stdout.
        // NOTE(review): the child's stderr is piped but never drained here or
        // below; if warden writes more than the OS pipe buffer to stderr before
        // the CA appears on stdout, the child can block — consider draining
        // stderr in a background task.
        let server_ca_pem = parse_server_ca_from_stdout(&mut container_process).await?;
        tracing::info!(
            "Parsed server CA from container stdout ({} bytes)",
            server_ca_pem.len()
        );

        // 5. Build client TLS config — trusts server CA, authenticates with our key
        let container_client_config = client_identity
            .create_client_config(&server_ca_pem)
            .map_err(|e| format!("Failed to create client TLS config: {e}"))?;

        // 6. Wait for the MCP server inside the container to be ready
        let server_url = format!("https://127.0.0.1:{container_host_port}/mcp");
        tracing::info!(url = %server_url, "Waiting for sandbox MCP server to be ready");
        wait_for_server_ready(&server_url, &container_client_config).await?;
        tracing::info!("Sandbox MCP server is ready");

        // 7. Start a per-session proxy connecting to the sandboxed server
        let (proxy_shutdown_tx, proxy_shutdown_rx) = broadcast::channel::<()>(1);

        // Bind the proxy's listener up front so the port can't be stolen
        // between selection and use.
        let proxy_binding = find_available_binding("sandbox proxy").await?;
        let proxy_url = format!("https://{}/mcp", proxy_binding.address);

        let proxy_cert_chain = Arc::new(
            CertificateChain::generate()
                .map_err(|e| format!("Failed to generate proxy certificates: {e}"))?,
        );

        let pool_config = build_sandbox_proxy_config(server_url, Arc::new(container_client_config));

        let proxy_chain_for_server = proxy_cert_chain.clone();
        let proxy_listener = proxy_binding.listener;
        tokio::spawn(async move {
            if let Err(e) = start_proxy_server(
                pool_config,
                proxy_listener,
                proxy_chain_for_server,
                true,  // redact_secrets
                false, // privacy_mode
                Some(proxy_shutdown_rx),
            )
            .await
            {
                tracing::error!("Sandbox proxy error: {e}");
            }
        });

        // Small delay for proxy to start (best-effort; connect_to_proxy below
        // retries with backoff regardless)
        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

        // 8. Connect client to proxy
        let client = connect_to_proxy(&proxy_url, proxy_cert_chain).await?;

        // 9. Get tools
        let mcp_tools = stakpak_mcp_client::get_tools(&client)
            .await
            .map_err(|e| format!("Failed to get sandbox tools: {e}"))?;

        // Convert MCP tool descriptors into the agent-loop `stakai::Tool` shape.
        let tools = mcp_tools
            .into_iter()
            .map(|tool| stakai::Tool {
                tool_type: "function".to_string(),
                function: stakai::ToolFunction {
                    name: tool.name.as_ref().to_string(),
                    description: tool
                        .description
                        .as_ref()
                        .map(std::string::ToString::to_string)
                        .unwrap_or_default(),
                    parameters: serde_json::Value::Object((*tool.input_schema).clone()),
                },
                provider_options: None,
            })
            .collect();

        Ok(Self {
            client,
            tools,
            proxy_shutdown_tx,
            container_process,
        })
    }

    /// Shut down the sandbox: stop the proxy and kill the container.
    pub async fn shutdown(mut self) {
        self.teardown().await;
    }

    /// Stop the proxy and gracefully shut down the warden container process.
    ///
    /// Sends SIGINT (not SIGTERM) because warden listens for `ctrl_c` (SIGINT)
    /// to trigger its cleanup — stopping the user container and sidecar.
    /// Waits up to 10s for warden to finish, then force-kills.
    pub async fn teardown(&mut self) {
        // Stop the per-session proxy first (ignore error if already stopped).
        let _ = self.proxy_shutdown_tx.send(());

        // Send SIGINT so warden's `signal::ctrl_c()` handler fires and runs
        // its container cleanup (stop_user_container + cleanup sidecar).
        // On non-unix targets no signal is sent; we fall straight through to
        // the timed wait + force-kill below.
        #[cfg(unix)]
        if let Some(pid) = self.container_process.id() {
            let _ = tokio::process::Command::new("kill")
                .args(["-INT", &pid.to_string()])
                .output()
                .await;
        }

        // Give warden up to 10s to stop containers and exit cleanly.
        match tokio::time::timeout(
            std::time::Duration::from_secs(10),
            self.container_process.wait(),
        )
        .await
        {
            Ok(Ok(status)) => {
                tracing::debug!(exit_status = ?status, "Warden process exited gracefully");
            }
            _ => {
                // Timed out or wait() itself failed — force-kill and reap.
                tracing::warn!("Warden process did not exit in 10s — force killing");
                let _ = self.container_process.kill().await;
                let _ = self.container_process.wait().await;
            }
        }
    }
}
513
514async fn spawn_warden_container(
515    config: &SandboxConfig,
516    host_port: u16,
517    client_ca_pem: &str,
518) -> Result<Child, String> {
519    use stakpak_shared::container::{expand_volume_path, is_named_volume};
520
521    let mut cmd = tokio::process::Command::new(&config.warden_path);
522    cmd.arg("wrap");
523    cmd.arg(&config.image);
524
525    // Mount configured volumes
526    for vol in &config.volumes {
527        let expanded = expand_volume_path(vol);
528        let host_path = expanded.split(':').next().unwrap_or(&expanded);
529        // Named volumes (e.g. "stakpak-aqua-cache:/container/path") don't have a
530        // host filesystem path — mount them unconditionally. Bind mounts are only
531        // added when the host path actually exists.
532        if is_named_volume(host_path) || Path::new(host_path).exists() {
533            cmd.args(["--volume", &expanded]);
534        }
535    }
536
537    // Port forwarding for the MCP server — publish on the sidecar so the
538    // host can reach the container's MCP server port directly.
539    cmd.args(["-p", &format!("127.0.0.1:{host_port}:8080")]);
540
541    // Prevent warden re-entry
542    cmd.args(["--env", "STAKPAK_SKIP_WARDEN=1"]);
543
544    // Tell the MCP server to bind to a fixed port inside the container
545    // so it matches the published port on the sidecar.
546    cmd.args(["--env", "STAKPAK_MCP_PORT=8080"]);
547
548    // Pass the client CA cert (public only) so the server can trust the client.
549    cmd.args(["--env", &format!("{TRUSTED_CLIENT_CA_ENV}={client_ca_pem}")]);
550
551    // Pass through API credentials if set
552    if let Ok(api_key) = std::env::var("STAKPAK_API_KEY") {
553        cmd.args(["--env", &format!("STAKPAK_API_KEY={api_key}")]);
554    }
555    if let Ok(profile) = std::env::var("STAKPAK_PROFILE") {
556        cmd.args(["--env", &format!("STAKPAK_PROFILE={profile}")]);
557    }
558    if let Ok(endpoint) = std::env::var("STAKPAK_API_ENDPOINT") {
559        cmd.args(["--env", &format!("STAKPAK_API_ENDPOINT={endpoint}")]);
560    }
561
562    // The MCP server detects STAKPAK_MCP_CLIENT_CA and generates its own
563    // server identity, outputting the server CA cert to stdout.
564    cmd.args(["--", "stakpak", "mcp", "start"]);
565
566    cmd.stdout(std::process::Stdio::piped());
567    cmd.stderr(std::process::Stdio::piped());
568    cmd.stdin(std::process::Stdio::null());
569
570    let child = cmd
571        .spawn()
572        .map_err(|e| format!("Failed to spawn warden process: {e}"))?;
573
574    Ok(child)
575}
576
577/// Parse the server CA certificate PEM from the container's stdout.
578///
579/// The MCP server outputs the server CA cert between structured delimiters:
580/// ```text
581/// ---BEGIN STAKPAK SERVER CA---
582/// -----BEGIN CERTIFICATE-----
583/// ...
584/// -----END CERTIFICATE-----
585/// ---END STAKPAK SERVER CA---
586/// ```
587async fn parse_server_ca_from_stdout(process: &mut Child) -> Result<String, String> {
588    let stdout = process
589        .stdout
590        .take()
591        .ok_or_else(|| "Container stdout not captured".to_string())?;
592
593    let mut reader = tokio::io::BufReader::new(stdout);
594    let mut server_ca_pem = String::new();
595    let mut in_server_ca = false;
596    let mut line = String::new();
597
598    let timeout_duration = tokio::time::Duration::from_secs(60);
599    let deadline = tokio::time::Instant::now() + timeout_duration;
600
601    tracing::debug!("Starting to read container stdout for server CA...");
602
603    loop {
604        line.clear();
605        let bytes_read = tokio::time::timeout_at(deadline, reader.read_line(&mut line))
606            .await
607            .map_err(|_| {
608                "Timed out waiting for container to output server CA certificate".to_string()
609            })?
610            .map_err(|e| format!("Failed to read container stdout: {e}"))?;
611
612        if bytes_read == 0 {
613            tracing::error!("Container stdout EOF before server CA was found");
614            return Err("Container exited before outputting server CA certificate".to_string());
615        }
616
617        let trimmed = line.trim();
618        tracing::debug!(line = %trimmed, bytes = bytes_read, "Read line from container stdout");
619
620        if trimmed == "---BEGIN STAKPAK SERVER CA---" {
621            in_server_ca = true;
622            continue;
623        }
624
625        if trimmed == "---END STAKPAK SERVER CA---" {
626            tracing::debug!("Found end of server CA block");
627            break;
628        }
629
630        if in_server_ca {
631            server_ca_pem.push_str(trimmed);
632            server_ca_pem.push('\n');
633        }
634    }
635
636    let server_ca_pem = server_ca_pem.trim().to_string();
637
638    if server_ca_pem.is_empty() {
639        return Err("Failed to parse server CA certificate from container output".to_string());
640    }
641
642    Ok(server_ca_pem)
643}
644
645async fn wait_for_server_ready(
646    url: &str,
647    client_config: &rustls::ClientConfig,
648) -> Result<(), String> {
649    let http_client = reqwest::Client::builder()
650        .use_preconfigured_tls(client_config.clone())
651        .build()
652        .map_err(|e| format!("Failed to build readiness check client: {e}"))?;
653
654    let mut last_error = String::new();
655    for attempt in 0..30 {
656        tokio::time::sleep(tokio::time::Duration::from_millis(if attempt < 5 {
657            500
658        } else {
659            1000
660        }))
661        .await;
662
663        match http_client.get(url).send().await {
664            Ok(_) => {
665                tracing::info!(attempt, "Sandbox MCP server ready");
666                return Ok(());
667            }
668            Err(e) => {
669                last_error = format!("{e:?}");
670                tracing::debug!(attempt, error = %last_error, "Readiness check failed");
671            }
672        }
673    }
674
675    Err(format!(
676        "Sandbox MCP server failed to become ready after 30 attempts: {last_error}"
677    ))
678}
679
/// A bound-but-unused localhost listener paired with its rendered address,
/// handed to the proxy server so the port cannot be stolen between selection
/// and use.
struct ProxyBinding {
    /// `host:port` string of the bound listener.
    address: String,
    /// The live listener; ownership transfers to the proxy server.
    listener: TcpListener,
}
684
685async fn find_available_binding(purpose: &str) -> Result<ProxyBinding, String> {
686    let listener = TcpListener::bind("127.0.0.1:0")
687        .await
688        .map_err(|e| format!("Failed to bind port for {purpose}: {e}"))?;
689    let addr = listener
690        .local_addr()
691        .map_err(|e| format!("Failed to get address for {purpose}: {e}"))?;
692    Ok(ProxyBinding {
693        address: addr.to_string(),
694        listener,
695    })
696}
697
698// TODO: TOCTOU race — between dropping the listener and Docker binding the port,
699// another process could claim it. Consider retrying with a different port on bind
700// failure, or passing the listener fd directly if Docker supports it.
701async fn find_free_port() -> Result<u16, String> {
702    let listener = TcpListener::bind("127.0.0.1:0")
703        .await
704        .map_err(|e| format!("Failed to bind ephemeral port: {e}"))?;
705    let port = listener
706        .local_addr()
707        .map_err(|e| format!("Failed to get ephemeral port: {e}"))?
708        .port();
709    // Drop the listener to free the port for Docker to use
710    drop(listener);
711    Ok(port)
712}
713
714fn build_sandbox_proxy_config(
715    sandbox_server_url: String,
716    client_tls_config: Arc<rustls::ClientConfig>,
717) -> ClientPoolConfig {
718    let mut servers: HashMap<String, ServerConfig> = HashMap::new();
719
720    // Register the sandboxed MCP server under the same name ("stakpak") so
721    // tool names like `stakpak__run_command` route correctly through the proxy.
722    servers.insert(
723        "stakpak".to_string(),
724        ServerConfig::Http {
725            url: sandbox_server_url,
726            headers: None,
727            certificate_chain: Arc::new(None),
728            client_tls_config: Some(client_tls_config),
729        },
730    );
731
732    // Keep the external paks server accessible
733    servers.insert(
734        "paks".to_string(),
735        ServerConfig::Http {
736            url: "https://apiv2.stakpak.dev/v1/paks/mcp".to_string(),
737            headers: None,
738            certificate_chain: Arc::new(None),
739            client_tls_config: None,
740        },
741    );
742
743    ClientPoolConfig::with_servers(servers)
744}
745
746async fn connect_to_proxy(
747    proxy_url: &str,
748    cert_chain: Arc<CertificateChain>,
749) -> Result<Arc<McpClient>, String> {
750    const MAX_RETRIES: u32 = 5;
751    let mut retry_delay = tokio::time::Duration::from_millis(50);
752    let mut last_error = None;
753
754    for attempt in 1..=MAX_RETRIES {
755        match stakpak_mcp_client::connect_https(proxy_url, Some(cert_chain.clone()), None).await {
756            Ok(client) => return Ok(Arc::new(client)),
757            Err(e) => {
758                last_error = Some(e);
759                if attempt < MAX_RETRIES {
760                    tokio::time::sleep(retry_delay).await;
761                    retry_delay *= 2;
762                }
763            }
764        }
765    }
766
767    Err(format!(
768        "Failed to connect to sandbox proxy after {MAX_RETRIES} retries: {}",
769        last_error.map(|e| e.to_string()).unwrap_or_default()
770    ))
771}
772
#[cfg(test)]
mod tests {
    #[test]
    fn parse_server_ca_from_structured_output() {
        let output = "\
🔐 mTLS enabled - independent identity (sandbox mode)
---BEGIN STAKPAK SERVER CA---
-----BEGIN CERTIFICATE-----
MIIB0zCCAXmgAwIBAgIUFAKE=
-----END CERTIFICATE-----
---END STAKPAK SERVER CA---
MCP server started at https://0.0.0.0:8080/mcp
";

        let expected_ca = "\
-----BEGIN CERTIFICATE-----
MIIB0zCCAXmgAwIBAgIUFAKE=
-----END CERTIFICATE-----";

        // Mirror the extraction that parse_server_ca_from_stdout performs:
        // accumulate trimmed lines strictly between the BEGIN/END markers.
        let mut collected = String::new();
        let mut inside_markers = false;

        for raw_line in output.lines() {
            let line = raw_line.trim();
            match line {
                "---BEGIN STAKPAK SERVER CA---" => inside_markers = true,
                "---END STAKPAK SERVER CA---" => break,
                _ if inside_markers => {
                    collected.push_str(line);
                    collected.push('\n');
                }
                _ => {}
            }
        }

        assert_eq!(collected.trim(), expected_ca);
    }

    #[test]
    fn mtls_identity_cross_trust() {
        use stakpak_shared::cert_utils::MtlsIdentity;

        // rustls needs a process-wide crypto provider before configs can be built.
        let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();

        // Replay the sandbox handshake: each side mints its own identity...
        let client_identity = MtlsIdentity::generate_client().expect("generate client identity");
        let server_identity = MtlsIdentity::generate_server().expect("generate server identity");

        // ...and only the public CA certificates cross the process boundary.
        let client_ca_pem = client_identity.ca_cert_pem().expect("client CA PEM");
        let server_ca_pem = server_identity.ca_cert_pem().expect("server CA PEM");

        // Cross-trust: the server verifies clients against the client CA,
        // and the client verifies the server against the server CA.
        let _server_config = server_identity
            .create_server_config(&client_ca_pem)
            .expect("server config with client CA trust");
        let _client_config = client_identity
            .create_client_config(&server_ca_pem)
            .expect("client config with server CA trust");

        // Sanity: the exchanged PEMs are certificates and carry no key material —
        // private keys stayed inside their respective MtlsIdentity structs.
        for pem in [&client_ca_pem, &server_ca_pem] {
            assert!(pem.contains("BEGIN CERTIFICATE"));
            assert!(!pem.contains("PRIVATE KEY"));
        }
    }

    // ── Named volume detection in expand_volume_path / mount filter ────

    #[test]
    fn expand_volume_path_leaves_named_volumes_unchanged() {
        use stakpak_shared::container::expand_volume_path;
        let spec = "stakpak-aqua-cache:/home/agent/.local/share/aquaproj-aqua";
        assert_eq!(expand_volume_path(spec), spec);
    }

    /// Named volumes (no `/` or `.` prefix in the host part) must pass the
    /// mount filter even though they don't exist on the host filesystem.
    #[test]
    fn named_volume_is_detected_correctly() {
        use stakpak_shared::container::is_named_volume;
        let cases = [
            ("stakpak-aqua-cache", true),
            ("my-volume", true),
            ("./relative/path", false),
            ("/absolute/path", false),
            ("relative/with/slash", false),
            (".", false),
        ];
        for (host_part, expected) in cases {
            assert_eq!(
                is_named_volume(host_part),
                expected,
                "host_part={host_part:?} expected named={expected}"
            );
        }
    }

    #[test]
    fn sandbox_mode_default_is_persistent() {
        assert_eq!(
            super::SandboxMode::default(),
            super::SandboxMode::Persistent
        );
    }

    #[test]
    fn sandbox_mode_serde_roundtrip() {
        use super::SandboxMode;

        #[derive(serde::Serialize, serde::Deserialize)]
        struct Wrapper {
            #[serde(default)]
            mode: SandboxMode,
        }

        // "persistent" deserializes to the Persistent variant.
        let parsed: Wrapper = serde_json::from_value(serde_json::json!({"mode": "persistent"}))
            .expect("deserialize persistent");
        assert_eq!(parsed.mode, SandboxMode::Persistent);

        // "ephemeral" deserializes to the Ephemeral variant.
        let parsed: Wrapper = serde_json::from_value(serde_json::json!({"mode": "ephemeral"}))
            .expect("deserialize ephemeral");
        assert_eq!(parsed.mode, SandboxMode::Ephemeral);

        // A missing field falls back to the default (persistent).
        let parsed: Wrapper =
            serde_json::from_value(serde_json::json!({})).expect("deserialize default");
        assert_eq!(parsed.mode, SandboxMode::Persistent);

        // Display renders the lowercase wire names.
        assert_eq!(SandboxMode::Persistent.to_string(), "persistent");
        assert_eq!(SandboxMode::Ephemeral.to_string(), "ephemeral");
    }

    #[test]
    fn sandbox_health_default_is_healthy() {
        // A fresh health record starts optimistic: healthy, zero counters,
        // and no recorded activity or errors.
        let health = super::SandboxHealth::default();
        assert!(health.healthy);
        assert_eq!(health.consecutive_ok, 0);
        assert_eq!(health.consecutive_failures, 0);
        assert!(health.last_ok.is_none());
        assert!(health.last_error.is_none());
        assert_eq!(health.total_respawn_attempts, 0);
    }
}