Skip to main content

fakecloud_ec2/runtime/
mod.rs

1//! Backing-container runtime for EC2 instances.
2//!
3//! `RunInstances` spins a real container per instance; the instance
4//! lifecycle (`Start`/`Stop`/`Reboot`/`Terminate`) maps onto the container
5//! lifecycle, and `DescribeInstances` reports the container's real private
6//! IP. The container can run either as a local Docker/Podman container (the
7//! default) or as a native Kubernetes Pod (`FAKECLOUD_EC2_BACKEND=k8s` or the
8//! global `FAKECLOUD_CONTAINER_BACKEND=k8s`).
9//!
10//! Operations are keyed by **instance id**, not the backend handle: a
11//! Kubernetes Pod can't be stopped and restarted in place, so `Stop` deletes
12//! the Pod and `Start`/`Reboot` recreate it. The runtime therefore keeps,
13//! per instance, the handle plus enough of the original request (image,
14//! user-data) to recreate the backing container deterministically.
15//!
16//! The runtime is strictly additive: when no container backend is available
17//! the control plane keeps its metadata-faithful behaviour (synthesized IPs,
18//! state transitions) so every API call still succeeds. Real container
19//! backing is best-effort fidelity layered on top.
20
21pub mod firewall;
22mod k8s;
23pub mod netpolicy;
24
25use std::collections::{BTreeMap, HashMap};
26use std::sync::Arc;
27
28use parking_lot::RwLock;
29
30use firewall::{
31    render_ruleset, resolve_enforcement_mode, EnforcementMode, InstanceRules, SubnetFirewall,
32};
33
/// Name of the environment variable that overrides the base image an
/// instance's container runs (e.g. to point CI at a lighter image).
const DEFAULT_IMAGE_ENV: &str = "FAKECLOUD_EC2_DEFAULT_IMAGE";
/// Default base image an instance's container runs. AMIs don't map to a
/// concrete OS image, so we boot a real Amazon Linux container by default.
/// The container is kept alive with `tail -f /dev/null` — EC2 instances are
/// long-running hosts, not one-shot tasks. `tail` is used rather than
/// `sleep infinity` so any base image works (busybox `sleep` rejects
/// `infinity`).
const DEFAULT_IMAGE: &str = "amazonlinux:2023";
43
/// Failure to bring up an instance's backing container.
#[derive(Debug, thiserror::Error)]
pub enum RuntimeError {
    /// The backend could not start the container. The payload is a
    /// human-readable reason; the Docker backend fills it with either the
    /// error from spawning the CLI or the CLI's trimmed stderr.
    #[error("container failed to start: {0}")]
    ContainerStartFailed(String),
}
49
/// Error initializing the Kubernetes backend at startup. Surfaced to the
/// operator so a misconfigured cluster fails fast rather than silently
/// falling back to Docker.
#[derive(Debug, thiserror::Error)]
pub enum BackendInitError {
    /// A `fakecloud_k8s::K8sEnvError`, displayed transparently (the wrapped
    /// error's own message is shown as-is).
    #[error(transparent)]
    Env(#[from] fakecloud_k8s::K8sEnvError),
    /// A `fakecloud_k8s::K8sPodConfigError`, displayed transparently.
    #[error(transparent)]
    PodConfig(#[from] fakecloud_k8s::K8sPodConfigError),
    /// The cluster could not be reached; the payload is the reason text.
    #[error("failed to connect to the Kubernetes cluster: {0}")]
    Connect(String),
}
62
/// A running instance's backing container, as returned by the runtime's
/// run / start / reboot operations.
#[derive(Debug, Clone)]
pub struct RunningInstance {
    /// Backend-specific handle: a Docker container id, or a Pod name.
    pub container_id: String,
    /// The instance's private IP — the container's address on the daemon
    /// network (Docker) or the Pod IP (k8s). At launch the Docker backend
    /// substitutes the placeholder `10.0.0.1` when `inspect` reports no
    /// address.
    pub private_ip: String,
    /// Name of the backing daemon network the container was attached to
    /// (`fakecloud-subnet-<id>`), or `None` when it ran on the default bridge
    /// (no network spec, or creation failed and we fell back). Surfaced for
    /// introspection (#1745 phase 5).
    pub network: Option<String>,
}
77
/// The L3 placement of an instance's backing container: which subnet it lands
/// in and whether that subnet is private.
///
/// Per-subnet networks give the isolation #1745 wants for free: two instances
/// in the same subnet share a bridge and can talk; instances in different
/// subnets / VPCs land on different bridges and cannot route to each other.
///
/// In this module only the Docker backend consumes the placement at launch;
/// the k8s spawn path is not handed it.
#[derive(Debug, Clone)]
pub struct InstanceNetwork {
    /// The EC2 subnet id the instance launched into. Also determines the
    /// backing network's name (`fakecloud-subnet-<subnet_id>`).
    pub subnet_id: String,
    /// True when the subnet has no `0.0.0.0/0 -> igw` route (private): the
    /// backing network is created `--internal` (no NAT to host/internet).
    pub internal: bool,
}
92
/// Derive the daemon network name that backs an EC2 subnet.
///
/// The name is a pure function of `subnet_id` (`fakecloud-subnet-<id>`), so
/// every instance launched into the same subnet resolves to — and attaches
/// to — the same bridge.
pub fn subnet_network_name(subnet_id: &str) -> String {
    let mut name = String::from("fakecloud-subnet-");
    name.push_str(subnet_id);
    name
}
98
/// How this runtime isolates instance traffic, surfaced by the
/// `/_fakecloud/ec2/instance-networks` introspection endpoint so users can
/// answer "why can't X reach Y" — which backend, which SG-enforcement
/// mechanism, and whether it's actually active vs degraded to metadata-only.
#[derive(Debug, Clone)]
pub struct NetworkIsolationSummary {
    /// `docker` | `podman` | `kubernetes`.
    pub backend: &'static str,
    /// `nftables` (Docker host firewall) | `networkpolicy` (k8s) | `disabled`.
    /// The k8s backend always reports `networkpolicy`; whether those policies
    /// take effect is carried by `enforced`.
    pub sg_enforcement: &'static str,
    /// Whether security-group rules are actually enforced. False means rules
    /// are tracked but not applied (no `CAP_NET_ADMIN`, or a CNI that ignores
    /// NetworkPolicy) — phase-2 L3 isolation still holds.
    pub enforced: bool,
}
114
/// What the runtime remembers per instance so it can drive the backing
/// container's lifecycle and recreate it (k8s `Start`/`Reboot`).
///
/// Records live in the runtime's instance table, keyed by EC2 instance id.
#[derive(Debug, Clone)]
struct InstanceRecord {
    /// Docker container id, or Pod name. Rewritten after a k8s recreate,
    /// because the new Pod gets a new name.
    handle: String,
    /// Resolved base image, captured at `RunInstances` so a recreate is
    /// identical even if `FAKECLOUD_EC2_DEFAULT_IMAGE` later changes.
    image: String,
    /// Base64 user-data to re-run on recreate, if any.
    user_data: Option<String>,
    /// The instance's tags, captured at `RunInstances`. Reserved
    /// `fakecloud-k8s/*` entries drive per-instance Pod scheduling and must
    /// survive a k8s `Start`/`Reboot` recreate, so they're stored here
    /// rather than re-read from the control plane.
    tags: BTreeMap<String, String>,
    /// The instance's subnet placement, captured at `RunInstances` so a k8s
    /// `Start`/`Reboot` recreate re-applies the same network and phase-5
    /// introspection can report the backing network. `None` in metadata-only
    /// network mode.
    ///
    /// NOTE(review): in this file the only reader is the Docker `Start` path,
    /// which turns it back into the subnet network name; the k8s recreate
    /// calls shown here do not pass it on — confirm the k8s claim above.
    network: Option<InstanceNetwork>,
}
137
/// The selected backing-container backend. Chosen once at construction; every
/// lifecycle operation dispatches on it.
#[derive(Debug, Clone)]
enum InstanceBackend {
    /// Local Docker/Podman containers, driven by shelling out to the CLI.
    Docker(DockerInstances),
    /// Kubernetes Pods, driven by the `k8s` submodule.
    K8s(k8s::K8sInstances),
}
144
/// Host firewall enforcement for security groups + NACLs (#1745 phase 3).
///
/// The network-driver abstraction the issue asks for: today there is one real
/// driver (nftables) plus the degraded no-op, selected once at construction.
/// Branching on podman vs docker isn't needed explicitly — rootless podman
/// can't touch the host firewall, so the `nft list ruleset` capability probe
/// already degrades it; rootful podman with netavark passes the same probe.
#[derive(Debug, Clone)]
pub struct FirewallEnforcer {
    /// The enforcement mechanism resolved at construction. It is never
    /// changed afterwards; `EnforcementMode::Disabled` turns every reconcile
    /// into a no-op.
    mode: EnforcementMode,
}
156
157impl FirewallEnforcer {
158    /// Resolve the enforcement mode from `FAKECLOUD_EC2_SG_ENFORCEMENT` and an
159    /// `nft` capability probe, warning once when enforcement was requested but
160    /// can't be backed (so the operator knows it degraded, not silently).
161    fn detect() -> Self {
162        let requested = std::env::var("FAKECLOUD_EC2_SG_ENFORCEMENT").ok();
163        let mode = resolve_enforcement_mode(
164            requested.as_deref(),
165            firewall::host_shares_daemon_netns(),
166            firewall::nft_available,
167        );
168        if requested.is_some() && mode == EnforcementMode::Disabled {
169            tracing::warn!(
170                "EC2 security-group enforcement was requested but it can't take effect here \
171                 (needs nftables + CAP_NET_ADMIN on a native-Linux host whose daemon shares this \
172                 network namespace — Docker Desktop / podman-machine run the daemon in a VM); \
173                 falling back to metadata-only (phase-2 L3 isolation stays active, security-group \
174                 rules are tracked but not enforced)"
175            );
176        } else if mode == EnforcementMode::Nftables {
177            tracing::info!("EC2 security-group enforcement active via nftables");
178        }
179        Self { mode }
180    }
181
182    /// Disabled enforcer (k8s backend, or no container runtime).
183    fn disabled() -> Self {
184        Self {
185            mode: EnforcementMode::Disabled,
186        }
187    }
188
189    pub fn mode(&self) -> EnforcementMode {
190        self.mode
191    }
192
193    pub fn enabled(&self) -> bool {
194        self.mode != EnforcementMode::Disabled
195    }
196
197    /// Atomically swap in the rendered ruleset via `nft -f -`. No-op when
198    /// disabled. Best-effort: a failed apply logs and leaves the previous
199    /// ruleset in place rather than erroring the originating API call.
200    async fn reconcile(&self, subnets: &[SubnetFirewall]) {
201        if self.mode == EnforcementMode::Disabled {
202            return;
203        }
204        // Instances in the same subnet share one Linux bridge; their traffic is
205        // L2-switched and only traverses the `forward` chain (where our SG rules
206        // live) when bridge netfilter is enabled. Without this, same-subnet SG
207        // rules silently filter nothing — exactly what the real-packet E2E
208        // caught. Needs CAP_NET_ADMIN (which the enforcer holds) and the
209        // `modprobe`/`sysctl` binaries (shipped via kmod/procps in the image).
210        // Warn rather than swallow the error: a missing binary or a failed call
211        // means enforcement degrades to filtering nothing, and the operator who
212        // opted in deserves to know (bug-audit 2026-06-20, 0.B1).
213        match tokio::process::Command::new("modprobe")
214            .arg("br_netfilter")
215            .output()
216            .await
217        {
218            Ok(o) if o.status.success() => {}
219            Ok(o) => tracing::warn!(
220                stderr = %String::from_utf8_lossy(&o.stderr).trim(),
221                "modprobe br_netfilter failed; same-subnet security-group enforcement may filter nothing"
222            ),
223            Err(e) => tracing::warn!(
224                error = %e,
225                "could not run modprobe (is kmod installed?); same-subnet security-group enforcement may filter nothing"
226            ),
227        }
228        match tokio::process::Command::new("sysctl")
229            .args(["-w", "net.bridge.bridge-nf-call-iptables=1"])
230            .output()
231            .await
232        {
233            Ok(o) if o.status.success() => {}
234            Ok(o) => tracing::warn!(
235                stderr = %String::from_utf8_lossy(&o.stderr).trim(),
236                "sysctl bridge-nf-call-iptables=1 failed; same-subnet security-group enforcement may filter nothing"
237            ),
238            Err(e) => tracing::warn!(
239                error = %e,
240                "could not run sysctl (is procps installed?); same-subnet security-group enforcement may filter nothing"
241            ),
242        }
243        let ruleset = render_ruleset(subnets);
244        use tokio::io::AsyncWriteExt;
245        let mut child = match tokio::process::Command::new("nft")
246            .args(["-f", "-"])
247            .stdin(std::process::Stdio::piped())
248            .stdout(std::process::Stdio::null())
249            .stderr(std::process::Stdio::piped())
250            .spawn()
251        {
252            Ok(c) => c,
253            Err(e) => {
254                tracing::warn!(error = %e, "failed to spawn nft; security-group ruleset not applied");
255                return;
256            }
257        };
258        if let Some(mut stdin) = child.stdin.take() {
259            let _ = stdin.write_all(ruleset.as_bytes()).await;
260            let _ = stdin.shutdown().await;
261        }
262        match child.wait_with_output().await {
263            Ok(out) if out.status.success() => {
264                tracing::debug!(
265                    subnets = subnets.len(),
266                    "applied EC2 security-group nft ruleset"
267                );
268            }
269            Ok(out) => {
270                tracing::warn!(
271                    stderr = %String::from_utf8_lossy(&out.stderr).trim(),
272                    "nft rejected the security-group ruleset; leaving the previous ruleset in place"
273                );
274            }
275            Err(e) => tracing::warn!(error = %e, "nft apply failed"),
276        }
277    }
278}
279
/// The backing-container runtime for EC2 instances: one selected backend plus
/// the per-instance records needed to drive container lifecycles.
///
/// Clones share the instance table and the reconcile lock (both are behind
/// `Arc`), so a clone observes and updates the same set of instances.
#[derive(Debug, Clone)]
pub struct Ec2Runtime {
    /// The backend every lifecycle operation dispatches to.
    backend: InstanceBackend,
    /// Per-instance backing records, keyed by EC2 instance id, so the
    /// lifecycle operations and reset/shutdown teardown work without
    /// consulting service state.
    instances: Arc<RwLock<HashMap<String, InstanceRecord>>>,
    /// Host firewall enforcer for security groups + NACLs.
    firewall: FirewallEnforcer,
    /// Serializes firewall reconciles. Reconcile is fired from many concurrent
    /// background tasks (per SG/NACL/lifecycle event); without this, two
    /// reconciles built from divergent state could interleave so the k8s
    /// apply+prune of one deletes a policy the other just applied (bug-hunt
    /// 2026-06-18 finding 4.3). Holding it across the whole reconcile makes the
    /// last-started reconcile the last-applied for both backends.
    reconcile_lock: Arc<tokio::sync::Mutex<()>>,
}
297
298impl Ec2Runtime {
299    /// Construct the Docker/Podman backend. Returns `None` when no container
300    /// CLI is available — callers then run in metadata-only mode.
301    pub fn new() -> Option<Self> {
302        let cli = fakecloud_core::container_net::detect_container_cli()?;
303        Some(Self {
304            backend: InstanceBackend::Docker(DockerInstances {
305                cli,
306                instance_id: format!("fakecloud-{}", std::process::id()),
307            }),
308            instances: Arc::new(RwLock::new(HashMap::new())),
309            firewall: FirewallEnforcer::detect(),
310            reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
311        })
312    }
313
314    /// Construct the Kubernetes backend. `server_port` is fakecloud's bound
315    /// port (used when `FAKECLOUD_K8S_SELF_URL` omits one). Fails fast on
316    /// misconfiguration — never silently degrades to Docker.
317    pub async fn new_k8s(server_port: u16) -> Result<Self, BackendInitError> {
318        let backend = k8s::K8sInstances::from_env(server_port).await?;
319        Ok(Self {
320            backend: InstanceBackend::K8s(backend),
321            instances: Arc::new(RwLock::new(HashMap::new())),
322            // k8s isolation is a NetworkPolicy concern (phase 4), not host nft.
323            firewall: FirewallEnforcer::disabled(),
324            reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
325        })
326    }
327
328    /// The firewall enforcer, so the control plane can skip building the model
329    /// when enforcement is disabled and report the mode for introspection.
330    pub fn firewall(&self) -> &FirewallEnforcer {
331        &self.firewall
332    }
333
334    /// Re-render and atomically apply the security-group/NACL ruleset for the
335    /// given per-subnet model. No-op (cheap) when enforcement is disabled.
336    /// Serialized against other reconciles (finding 4.3).
337    pub async fn reconcile_firewall(&self, subnets: Vec<SubnetFirewall>) {
338        let _guard = self.reconcile_lock.lock().await;
339        self.firewall.reconcile(&subnets).await;
340    }
341
342    /// Whether this runtime backs network isolation with real enforcement —
343    /// host nftables (Docker, opt-in) or k8s NetworkPolicy. Lets the control
344    /// plane skip building the firewall model entirely when neither applies.
345    pub fn network_isolation_enforced(&self) -> bool {
346        self.firewall.enabled() || self.is_k8s()
347    }
348
349    /// True for the Kubernetes backend (isolation via NetworkPolicy).
350    pub fn is_k8s(&self) -> bool {
351        matches!(self.backend, InstanceBackend::K8s(_))
352    }
353
354    /// Apply one NetworkPolicy per instance for the k8s backend. No-op on the
355    /// Docker backend (which uses nftables instead). Serialized against other
356    /// reconciles so a concurrent apply+prune can't delete a just-applied
357    /// policy (finding 4.3).
358    pub async fn reconcile_network_policies(&self, rules: Vec<InstanceRules>) {
359        if let InstanceBackend::K8s(k) = &self.backend {
360            let _guard = self.reconcile_lock.lock().await;
361            k.reconcile_network_policies(&rules).await;
362        }
363    }
364
365    /// A snapshot of how this runtime isolates instance traffic, for the
366    /// `/_fakecloud/ec2/instance-networks` introspection endpoint (#1745 ph5).
367    pub fn network_isolation_summary(&self) -> NetworkIsolationSummary {
368        match &self.backend {
369            InstanceBackend::Docker(d) => NetworkIsolationSummary {
370                backend: if fakecloud_core::container_net::is_podman_binary(&d.cli) {
371                    "podman"
372                } else {
373                    "docker"
374                },
375                sg_enforcement: match self.firewall.mode() {
376                    EnforcementMode::Nftables => "nftables",
377                    EnforcementMode::Disabled => "disabled",
378                },
379                enforced: self.firewall.enabled(),
380            },
381            InstanceBackend::K8s(k) => NetworkIsolationSummary {
382                backend: "kubernetes",
383                sg_enforcement: "networkpolicy",
384                // NetworkPolicies are always created; "enforced" reflects
385                // whether the detected CNI actually applies them.
386                enforced: k.cni_enforces(),
387            },
388        }
389    }
390
391    /// Name of the active backend, for logging.
392    pub fn cli_name(&self) -> &str {
393        match &self.backend {
394            InstanceBackend::Docker(d) => &d.cli,
395            InstanceBackend::K8s(_) => "kubernetes",
396        }
397    }
398
399    /// Boot a container for an instance. `user_data` is the base64-encoded
400    /// user-data as received on the wire (RunInstances `UserData`), run at
401    /// boot the way cloud-init would, if present.
402    pub async fn run_instance(
403        &self,
404        instance_id: &str,
405        user_data: Option<&str>,
406        tags: &BTreeMap<String, String>,
407        network: Option<&InstanceNetwork>,
408    ) -> Result<RunningInstance, RuntimeError> {
409        let image = default_image();
410        let running = match &self.backend {
411            // Docker attaches the container to the subnet's per-VPC bridge for
412            // L3 isolation. k8s pods share a flat network; isolation there is a
413            // NetworkPolicy concern handled separately (#1745 phase 4).
414            InstanceBackend::Docker(d) => {
415                d.run_instance(instance_id, &image, user_data, network)
416                    .await?
417            }
418            InstanceBackend::K8s(k) => k.spawn_pod(instance_id, &image, user_data, tags).await?,
419        };
420        self.instances.write().insert(
421            instance_id.to_string(),
422            InstanceRecord {
423                handle: running.container_id.clone(),
424                image,
425                user_data: user_data.map(str::to_string),
426                tags: tags.clone(),
427                network: network.cloned(),
428            },
429        );
430        Ok(running)
431    }
432
433    /// Stop an instance's backing container (maps to `StopInstances`).
434    /// Docker stops the container in place; k8s deletes the Pod (recreated
435    /// on the next `Start`).
436    pub async fn stop_instance(&self, instance_id: &str) {
437        let Some(handle) = self.handle_of(instance_id) else {
438            return;
439        };
440        match &self.backend {
441            InstanceBackend::Docker(d) => d.stop(&handle).await,
442            InstanceBackend::K8s(k) => k.delete_pod(&handle).await,
443        }
444    }
445
446    /// Start a previously-stopped instance (maps to `StartInstances`).
447    /// Returns the running container's (possibly new) handle and private IP.
448    /// Docker starts the existing container; k8s recreates the Pod under a new
449    /// unique name, so the handle changes — callers should persist it.
450    pub async fn start_instance(&self, instance_id: &str) -> Option<RunningInstance> {
451        let record = self.instances.read().get(instance_id)?.clone();
452        match &self.backend {
453            InstanceBackend::Docker(d) => {
454                // Same container; only the IP may change. The subnet network the
455                // container was created on persists across stop/start.
456                let private_ip = d.start(&record.handle).await?;
457                Some(RunningInstance {
458                    container_id: record.handle,
459                    private_ip,
460                    network: record
461                        .network
462                        .as_ref()
463                        .map(|n| subnet_network_name(&n.subnet_id)),
464                })
465            }
466            InstanceBackend::K8s(k) => {
467                let running = k
468                    .spawn_pod(
469                        instance_id,
470                        &record.image,
471                        record.user_data.as_deref(),
472                        &record.tags,
473                    )
474                    .await
475                    .ok()?;
476                self.update_handle(instance_id, &running.container_id);
477                Some(running)
478            }
479        }
480    }
481
482    /// Restart an instance's backing container (maps to `RebootInstances`).
483    /// Docker restarts in place; k8s deletes and recreates the Pod under a new
484    /// name. Returns the running container's handle + IP when it changed (k8s),
485    /// so callers can persist the new handle; `None` when nothing to update.
486    pub async fn reboot_instance(&self, instance_id: &str) -> Option<RunningInstance> {
487        let record = self.instances.read().get(instance_id).cloned()?;
488        match &self.backend {
489            InstanceBackend::Docker(d) => {
490                d.reboot(&record.handle).await;
491                None
492            }
493            InstanceBackend::K8s(k) => {
494                k.delete_pod(&record.handle).await;
495                let running = k
496                    .spawn_pod(
497                        instance_id,
498                        &record.image,
499                        record.user_data.as_deref(),
500                        &record.tags,
501                    )
502                    .await
503                    .ok()?;
504                self.update_handle(instance_id, &running.container_id);
505                Some(running)
506            }
507        }
508    }
509
510    /// Remove an instance's backing container (maps to `TerminateInstances`).
511    pub async fn terminate_instance(&self, instance_id: &str) {
512        let record = self.instances.write().remove(instance_id);
513        if let Some(record) = record {
514            match &self.backend {
515                InstanceBackend::Docker(d) => d.remove(&record.handle).await,
516                InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
517            }
518        }
519    }
520
521    /// Tear down every container this runtime spawned (used on reset and
522    /// shutdown). The Docker backend leans on the shared reaper for any
523    /// container it loses track of.
524    pub async fn stop_all(&self) {
525        let records: Vec<InstanceRecord> = {
526            let mut instances = self.instances.write();
527            instances.drain().map(|(_, r)| r).collect()
528        };
529        for record in records {
530            match &self.backend {
531                InstanceBackend::Docker(d) => d.remove(&record.handle).await,
532                InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
533            }
534        }
535    }
536
537    /// Sweep instance Pods orphaned by a previous fakecloud process (k8s
538    /// only; the Docker backend relies on the shared reaper).
539    pub async fn reap_stale(&self) {
540        if let InstanceBackend::K8s(k) = &self.backend {
541            k.reap_stale().await;
542        }
543    }
544
545    /// The backing container's console log — its combined stdout/stderr, which
546    /// includes anything user-data printed at boot (maps to `GetConsoleOutput`).
547    /// `None` for an unbacked instance or when logs can't be read.
548    pub async fn console_output(&self, instance_id: &str) -> Option<Vec<u8>> {
549        let handle = self.handle_of(instance_id)?;
550        match &self.backend {
551            InstanceBackend::Docker(d) => d.logs(&handle).await,
552            InstanceBackend::K8s(k) => k.logs(&handle).await,
553        }
554    }
555
556    fn handle_of(&self, instance_id: &str) -> Option<String> {
557        self.instances
558            .read()
559            .get(instance_id)
560            .map(|r| r.handle.clone())
561    }
562
563    fn update_handle(&self, instance_id: &str, handle: &str) {
564        if let Some(record) = self.instances.write().get_mut(instance_id) {
565            record.handle = handle.to_string();
566        }
567    }
568}
569
570fn default_image() -> String {
571    std::env::var(DEFAULT_IMAGE_ENV).unwrap_or_else(|_| DEFAULT_IMAGE.to_string())
572}
573
/// Keep-alive command + user-data wrapper for a base image. Shared by both
/// backends so they boot identical containers.
///
/// Without user-data (absent or empty) the command is just
/// `tail -f /dev/null`. With user-data (base64) the command is `sh -c` running
/// a fixed script that decodes the payload and runs it as a shell script,
/// backgrounded so a slow script never blocks readiness, then `exec`s the same
/// `tail` so the container stays up.
///
/// The payload is handed to the script as positional parameter `$1` (the
/// element after it, `sh`, becomes `$0`) instead of being spliced into the
/// script text. User-data arrives on the wire unvalidated; a stray `'` inside
/// a quoted splice would make the whole script a syntax error, so the
/// keep-alive `tail` would never run and the container would exit at boot.
fn boot_command(user_data: Option<&str>) -> Vec<String> {
    match user_data.filter(|s| !s.is_empty()) {
        Some(b64) => vec![
            "sh".to_string(),
            "-c".to_string(),
            "printf %s \"$1\" | base64 -d | sh & exec tail -f /dev/null".to_string(),
            // $0 for the script above.
            "sh".to_string(),
            // $1: the payload, never interpreted as shell syntax.
            b64.to_string(),
        ],
        None => vec![
            "tail".to_string(),
            "-f".to_string(),
            "/dev/null".to_string(),
        ],
    }
}
591
/// Docker/Podman backend: shells out to the container CLI.
#[derive(Debug, Clone)]
struct DockerInstances {
    /// The container CLI binary to invoke for every operation.
    cli: String,
    /// Ownership id of this fakecloud process (`fakecloud-<pid>`), attached as
    /// the `fakecloud-instance` label to every container and network created.
    /// Despite the name, this is not an EC2 instance id.
    instance_id: String,
}
598
599impl DockerInstances {
600    async fn run_instance(
601        &self,
602        instance_id: &str,
603        image: &str,
604        user_data: Option<&str>,
605        network: Option<&InstanceNetwork>,
606    ) -> Result<RunningInstance, RuntimeError> {
607        // Ensure the subnet's bridge exists and attach to it for L3 isolation.
608        // Network creation is best-effort: on failure we fall back to the
609        // default bridge so the instance still boots (no regression vs today).
610        let attached_network = match network {
611            Some(net) => self.ensure_subnet_network(net).await,
612            None => None,
613        };
614
615        let mut args: Vec<String> = vec![
616            "run".to_string(),
617            "-d".to_string(),
618            "--label".to_string(),
619            format!("fakecloud-ec2={instance_id}"),
620            "--label".to_string(),
621            format!("fakecloud-instance={}", self.instance_id),
622        ];
623        if let Some(name) = &attached_network {
624            args.push("--network".to_string());
625            args.push(name.clone());
626        }
627        args.push(image.to_string());
628        args.extend(boot_command(user_data));
629
630        let output = tokio::process::Command::new(&self.cli)
631            .args(&args)
632            .output()
633            .await
634            .map_err(|e| RuntimeError::ContainerStartFailed(e.to_string()))?;
635
636        if !output.status.success() {
637            return Err(RuntimeError::ContainerStartFailed(
638                String::from_utf8_lossy(&output.stderr).trim().to_string(),
639            ));
640        }
641
642        let container_id = String::from_utf8_lossy(&output.stdout).trim().to_string();
643        let private_ip = self
644            .inspect_ip(&container_id)
645            .await
646            .unwrap_or_else(|| "10.0.0.1".to_string());
647
648        Ok(RunningInstance {
649            container_id,
650            private_ip,
651            network: attached_network,
652        })
653    }
654
655    /// Create (idempotently) the daemon network backing a subnet and return its
656    /// name, or `None` if creation failed (caller falls back to the default
657    /// bridge). The network carries the shared `fakecloud-instance` ownership
658    /// label so the startup reaper prunes it after an ungraceful restart, plus
659    /// a `fakecloud-subnet=<id>` label for introspection. Private subnets get
660    /// an `--internal` network (no NAT to the host/internet).
661    async fn ensure_subnet_network(&self, net: &InstanceNetwork) -> Option<String> {
662        let name = subnet_network_name(&net.subnet_id);
663        let mut args = vec!["network".to_string(), "create".to_string()];
664        if net.internal {
665            args.push("--internal".to_string());
666        }
667        args.push("--label".to_string());
668        args.push(format!("fakecloud-subnet={}", net.subnet_id));
669        args.push("--label".to_string());
670        args.push(format!("fakecloud-instance={}", self.instance_id));
671        args.push(name.clone());
672
673        let output = tokio::process::Command::new(&self.cli)
674            .args(&args)
675            .output()
676            .await;
677        match output {
678            // Created fresh.
679            Ok(out) if out.status.success() => Some(name),
680            // Already exists (another instance in the same subnet created it):
681            // a benign race — the network is there, so attach to it.
682            Ok(out) => {
683                let err = String::from_utf8_lossy(&out.stderr);
684                if err.contains("already exists") || err.contains("exists") {
685                    Some(name)
686                } else {
687                    tracing::warn!(
688                        subnet = %net.subnet_id,
689                        network = %name,
690                        error = %err.trim(),
691                        "subnet network creation failed; falling back to default bridge"
692                    );
693                    None
694                }
695            }
696            Err(e) => {
697                tracing::warn!(
698                    subnet = %net.subnet_id,
699                    network = %name,
700                    error = %e,
701                    "subnet network creation failed; falling back to default bridge"
702                );
703                None
704            }
705        }
706    }
707
708    /// Read the container's private IP from `inspect`. Returns `None` if the
709    /// container has no address (e.g. host networking) — the caller falls
710    /// back to a synthesized IP.
711    async fn inspect_ip(&self, container_id: &str) -> Option<String> {
712        let output = tokio::process::Command::new(&self.cli)
713            .args([
714                "inspect",
715                "-f",
716                "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
717                container_id,
718            ])
719            .output()
720            .await
721            .ok()?;
722        if !output.status.success() {
723            return None;
724        }
725        let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
726        if ip.is_empty() {
727            None
728        } else {
729            Some(ip)
730        }
731    }
732
733    async fn stop(&self, container_id: &str) {
734        let _ = tokio::process::Command::new(&self.cli)
735            .args(["stop", container_id])
736            .output()
737            .await;
738    }
739
740    async fn start(&self, container_id: &str) -> Option<String> {
741        let started = tokio::process::Command::new(&self.cli)
742            .args(["start", container_id])
743            .output()
744            .await
745            .map(|o| o.status.success())
746            .unwrap_or(false);
747        if !started {
748            return None;
749        }
750        self.inspect_ip(container_id).await
751    }
752
753    async fn reboot(&self, container_id: &str) {
754        let _ = tokio::process::Command::new(&self.cli)
755            .args(["restart", container_id])
756            .output()
757            .await;
758    }
759
760    async fn remove(&self, container_id: &str) {
761        let _ = tokio::process::Command::new(&self.cli)
762            .args(["rm", "-f", container_id])
763            .output()
764            .await;
765    }
766
767    /// The container's combined stdout+stderr (`docker logs`). `None` if the
768    /// command fails; an empty log is `Some(vec![])`.
769    async fn logs(&self, container_id: &str) -> Option<Vec<u8>> {
770        let output = tokio::process::Command::new(&self.cli)
771            .args(["logs", container_id])
772            .output()
773            .await
774            .ok()?;
775        if !output.status.success() {
776            return None;
777        }
778        // `docker logs` writes the container's stdout to ours and its stderr to
779        // ours; concatenate so the console output carries both streams.
780        let mut buf = output.stdout;
781        buf.extend_from_slice(&output.stderr);
782        Some(buf)
783    }
784}