Skip to main content

fakecloud_ec2/runtime/
mod.rs

1//! Backing-container runtime for EC2 instances.
2//!
3//! `RunInstances` spins a real container per instance; the instance
4//! lifecycle (`Start`/`Stop`/`Reboot`/`Terminate`) maps onto the container
5//! lifecycle, and `DescribeInstances` reports the container's real private
6//! IP. The container can run either as a local Docker/Podman container (the
7//! default) or as a native Kubernetes Pod (`FAKECLOUD_EC2_BACKEND=k8s` or the
8//! global `FAKECLOUD_CONTAINER_BACKEND=k8s`).
9//!
10//! Operations are keyed by **instance id**, not the backend handle: a
11//! Kubernetes Pod can't be stopped and restarted in place, so `Stop` deletes
12//! the Pod and `Start`/`Reboot` recreate it. The runtime therefore keeps,
13//! per instance, the handle plus enough of the original request (image,
14//! user-data) to recreate the backing container deterministically.
15//!
16//! The runtime is strictly additive: when no container backend is available
17//! the control plane keeps its metadata-faithful behaviour (synthesized IPs,
18//! state transitions) so every API call still succeeds. Real container
19//! backing is best-effort fidelity layered on top.
20
21pub mod firewall;
22mod k8s;
23pub mod netpolicy;
24
25use std::collections::{BTreeMap, HashMap};
26use std::sync::Arc;
27
28use parking_lot::RwLock;
29
30use firewall::{
31    render_bridge_ruleset, render_ruleset, resolve_enforcement_mode, EnforcementMode,
32    InstanceRules, SubnetFirewall,
33};
34
/// Environment variable that overrides the base image an instance's
/// container runs (e.g. to point CI at a lighter image).
const DEFAULT_IMAGE_ENV: &str = "FAKECLOUD_EC2_DEFAULT_IMAGE";
/// Default base image an instance's container runs. AMIs don't map to a
/// concrete OS image, so we boot a real Amazon Linux container by default.
/// The container is kept alive with `tail -f /dev/null` — EC2 instances are
/// long-running hosts, not one-shot tasks. `tail` is used rather than
/// `sleep infinity` so any base image works (busybox `sleep` rejects
/// `infinity`).
const DEFAULT_IMAGE: &str = "amazonlinux:2023";
44
/// Failure reported by the runtime when it cannot back an instance with a
/// real container.
#[derive(Debug, thiserror::Error)]
pub enum RuntimeError {
    /// The backing container could not be started. The payload is a
    /// human-readable description of the cause (for example the text of the
    /// error returned when spawning the container CLI).
    #[error("container failed to start: {0}")]
    ContainerStartFailed(String),
}
50
/// Error initializing the Kubernetes backend at startup. Surfaced to the
/// operator so a misconfigured cluster fails fast rather than silently
/// falling back to Docker.
#[derive(Debug, thiserror::Error)]
pub enum BackendInitError {
    /// Wraps a `fakecloud_k8s::K8sEnvError`. `transparent` forwards the
    /// message to the inner error; `#[from]` lets `?` convert it.
    #[error(transparent)]
    Env(#[from] fakecloud_k8s::K8sEnvError),
    /// Wraps a `fakecloud_k8s::K8sPodConfigError`, with the same
    /// transparent-message and `?`-conversion behaviour as [`Self::Env`].
    #[error(transparent)]
    PodConfig(#[from] fakecloud_k8s::K8sPodConfigError),
    /// The cluster could not be reached; the payload is the failure text.
    #[error("failed to connect to the Kubernetes cluster: {0}")]
    Connect(String),
}
63
/// A running instance's backing container, as reported back to the control
/// plane after a launch, start, or (k8s) reboot.
#[derive(Debug, Clone)]
pub struct RunningInstance {
    /// Backend-specific handle: a Docker container id, or a Pod name.
    pub container_id: String,
    /// The instance's private IP — the container's address on the daemon
    /// network (Docker) or the Pod IP (k8s).
    pub private_ip: String,
    /// Name of the backing daemon network the container was attached to
    /// (`fakecloud-subnet-<id>`), or `None` when it ran on the default bridge
    /// (no network spec, or creation failed and we fell back). Surfaced for
    /// introspection (#1745 phase 5).
    pub network: Option<String>,
}
78
/// The L3 placement of an instance's backing container: which subnet it lands
/// in and whether that subnet is private.
///
/// Per-subnet networks give the isolation #1745 wants for free: two instances
/// in the same subnet share a bridge and can talk; instances in different
/// subnets / VPCs land on different bridges and cannot route to each other.
#[derive(Debug, Clone)]
pub struct InstanceNetwork {
    /// The EC2 subnet id the instance launched into. Also determines the
    /// backing network's name (see [`subnet_network_name`]).
    pub subnet_id: String,
    /// True when the subnet has no `0.0.0.0/0 -> igw` route (private): the
    /// backing network is created `--internal` (no NAT to host/internet).
    pub internal: bool,
}
93
/// Name of the daemon network that backs an EC2 subnet.
///
/// The name is a pure function of `subnet_id` (a fixed `fakecloud-subnet-`
/// prefix followed by the id), so every instance launched into the same
/// subnet resolves to — and attaches to — the same bridge.
pub fn subnet_network_name(subnet_id: &str) -> String {
    const PREFIX: &str = "fakecloud-subnet-";
    let mut name = String::with_capacity(PREFIX.len() + subnet_id.len());
    name.push_str(PREFIX);
    name.push_str(subnet_id);
    name
}
99
/// How this runtime isolates instance traffic, surfaced by the
/// `/_fakecloud/ec2/instance-networks` introspection endpoint so users can
/// answer "why can't X reach Y" — which backend, which SG-enforcement
/// mechanism, and whether it's actually active vs degraded to metadata-only.
#[derive(Debug, Clone)]
pub struct NetworkIsolationSummary {
    /// `docker` | `podman` | `kubernetes`.
    pub backend: &'static str,
    /// `nftables` (Docker host firewall) | `networkpolicy` (k8s) | `disabled`.
    pub sg_enforcement: &'static str,
    /// Whether security-group rules are actually enforced. False means rules
    /// are tracked but not applied (no `CAP_NET_ADMIN`, or a CNI that ignores
    /// NetworkPolicy) — phase-2 L3 isolation still holds.
    pub enforced: bool,
}
115
/// What the runtime remembers per instance so it can drive the backing
/// container's lifecycle and recreate it (k8s `Start`/`Reboot`).
///
/// A record is inserted by `run_instance`, has its `handle` rewritten when a
/// k8s Pod is recreated, and is removed by `terminate_instance` / `stop_all`.
#[derive(Debug, Clone)]
struct InstanceRecord {
    /// Docker container id, or Pod name.
    handle: String,
    /// The owning account id, captured at `RunInstances`. Keys the durable
    /// data volume (see [`data_volume_name`]) so `TerminateInstances` can
    /// remove the right volume without re-consulting the control plane.
    account_id: String,
    /// Resolved base image, captured at `RunInstances` so a recreate is
    /// identical even if `FAKECLOUD_EC2_DEFAULT_IMAGE` later changes.
    image: String,
    /// Base64 user-data to re-run on recreate, if any.
    user_data: Option<String>,
    /// The instance's tags, captured at `RunInstances`. Reserved
    /// `fakecloud-k8s/*` entries drive per-instance Pod scheduling and must
    /// survive a k8s `Start`/`Reboot` recreate, so they're stored here
    /// rather than re-read from the control plane.
    tags: BTreeMap<String, String>,
    /// The instance's subnet placement, captured at `RunInstances` so a k8s
    /// `Start`/`Reboot` recreate re-applies the same network and phase-5
    /// introspection can report the backing network. `None` in metadata-only
    /// network mode.
    network: Option<InstanceNetwork>,
}
142
/// The selected backing-container backend. Chosen once when the runtime is
/// constructed; every lifecycle operation dispatches on it.
#[derive(Debug, Clone)]
enum InstanceBackend {
    /// Local Docker/Podman containers, driven by shelling out to the CLI.
    Docker(DockerInstances),
    /// Kubernetes Pods, one per instance.
    K8s(k8s::K8sInstances),
}
149
/// Host firewall enforcement for security groups + NACLs (#1745 phase 3).
///
/// The network-driver abstraction the issue asks for: today there is one real
/// driver (nftables) plus the degraded no-op, selected once at construction.
/// Branching on podman vs docker isn't needed explicitly — rootless podman
/// can't touch the host firewall, so the `nft list ruleset` capability probe
/// already degrades it; rootful podman with netavark passes the same probe.
#[derive(Debug, Clone)]
pub struct FirewallEnforcer {
    /// The enforcement mechanism resolved at construction; never changes
    /// afterwards (no method in this type mutates it).
    mode: EnforcementMode,
}
161
162impl FirewallEnforcer {
163    /// Resolve the enforcement mode from `FAKECLOUD_EC2_SG_ENFORCEMENT` and an
164    /// `nft` capability probe, warning once when enforcement was requested but
165    /// can't be backed (so the operator knows it degraded, not silently).
166    fn detect() -> Self {
167        let requested = std::env::var("FAKECLOUD_EC2_SG_ENFORCEMENT").ok();
168        let mode = resolve_enforcement_mode(
169            requested.as_deref(),
170            firewall::host_shares_daemon_netns(),
171            firewall::nft_available,
172        );
173        if requested.is_some() && mode == EnforcementMode::Disabled {
174            tracing::warn!(
175                "EC2 security-group enforcement was requested but it can't take effect here \
176                 (needs nftables + CAP_NET_ADMIN on a native-Linux host whose daemon shares this \
177                 network namespace — Docker Desktop / podman-machine run the daemon in a VM); \
178                 falling back to metadata-only (phase-2 L3 isolation stays active, security-group \
179                 rules are tracked but not enforced)"
180            );
181        } else if mode == EnforcementMode::Nftables {
182            tracing::info!("EC2 security-group enforcement active via nftables");
183        }
184        Self { mode }
185    }
186
187    /// Disabled enforcer (k8s backend, or no container runtime).
188    fn disabled() -> Self {
189        Self {
190            mode: EnforcementMode::Disabled,
191        }
192    }
193
    /// The enforcement mode this enforcer was constructed with.
    pub fn mode(&self) -> EnforcementMode {
        self.mode
    }
197
198    pub fn enabled(&self) -> bool {
199        self.mode != EnforcementMode::Disabled
200    }
201
    /// Atomically swap in the rendered ruleset via `nft -f -`. No-op when
    /// disabled. Best-effort: a failed apply logs and leaves the previous
    /// ruleset in place rather than erroring the originating API call.
    ///
    /// Steps, in order: load `br_netfilter`, set the bridge-netfilter sysctl,
    /// apply the `inet` table, then apply the `bridge` table. Every step logs
    /// on failure and the function carries on; nothing is returned to the
    /// caller.
    ///
    /// `subnets` is the per-subnet firewall model both rulesets are rendered
    /// from.
    async fn reconcile(&self, subnets: &[SubnetFirewall]) {
        if self.mode == EnforcementMode::Disabled {
            return;
        }
        // Instances in the same subnet share one Linux bridge; their traffic is
        // L2-switched and only traverses the `forward` chain (where our SG rules
        // live) when bridge netfilter is enabled. Without this, same-subnet SG
        // rules silently filter nothing — exactly what the real-packet E2E
        // caught. Needs CAP_NET_ADMIN (which the enforcer holds) and the
        // `modprobe`/`sysctl` binaries (shipped via kmod/procps in the image).
        // Warn rather than swallow the error: a missing binary or a failed call
        // means enforcement degrades to filtering nothing, and the operator who
        // opted in deserves to know (bug-audit 2026-06-20, 0.B1).
        match tokio::process::Command::new("modprobe")
            .arg("br_netfilter")
            .output()
            .await
        {
            Ok(o) if o.status.success() => {}
            // The command ran but exited non-zero: report its stderr.
            Ok(o) => tracing::warn!(
                stderr = %String::from_utf8_lossy(&o.stderr).trim(),
                "modprobe br_netfilter failed; same-subnet security-group enforcement may filter nothing"
            ),
            // The command could not be spawned at all.
            Err(e) => tracing::warn!(
                error = %e,
                "could not run modprobe (is kmod installed?); same-subnet security-group enforcement may filter nothing"
            ),
        }
        match tokio::process::Command::new("sysctl")
            .args(["-w", "net.bridge.bridge-nf-call-iptables=1"])
            .output()
            .await
        {
            Ok(o) if o.status.success() => {}
            Ok(o) => tracing::warn!(
                stderr = %String::from_utf8_lossy(&o.stderr).trim(),
                "sysctl bridge-nf-call-iptables=1 failed; same-subnet security-group enforcement may filter nothing"
            ),
            Err(e) => tracing::warn!(
                error = %e,
                "could not run sysctl (is procps installed?); same-subnet security-group enforcement may filter nothing"
            ),
        }
        use tokio::io::AsyncWriteExt;
        // Load a rendered ruleset via `nft -f -`. `required=false` marks the
        // best-effort same-subnet bridge table: a kernel without
        // `nf_conntrack_bridge` rejects its `ct state` line, and since the inet
        // table is applied independently first, that rejection is logged at
        // debug (degraded same-subnet enforcement) rather than warn.
        //
        // `label` names the table in log output; `subnets` is only the count
        // attached to the success log line.
        async fn load_nft(label: &str, ruleset: &str, subnets: usize, required: bool) {
            let mut child = match tokio::process::Command::new("nft")
                .args(["-f", "-"])
                .stdin(std::process::Stdio::piped())
                .stdout(std::process::Stdio::null())
                .stderr(std::process::Stdio::piped())
                .spawn()
            {
                Ok(c) => c,
                Err(e) => {
                    tracing::warn!(error = %e, table = label, "failed to spawn nft; security-group ruleset not applied");
                    return;
                }
            };
            // Feed the ruleset on stdin. Write/shutdown errors are ignored
            // here on purpose; a failed load is reported from the exit status
            // below. `stdin` is dropped at the end of this block, which
            // closes the pipe.
            if let Some(mut stdin) = child.stdin.take() {
                let _ = stdin.write_all(ruleset.as_bytes()).await;
                let _ = stdin.shutdown().await;
            }
            match child.wait_with_output().await {
                Ok(out) if out.status.success() => {
                    tracing::debug!(
                        subnets,
                        table = label,
                        "applied EC2 security-group nft ruleset"
                    );
                }
                Ok(out) => {
                    let stderr = String::from_utf8_lossy(&out.stderr);
                    let stderr = stderr.trim();
                    if required {
                        tracing::warn!(table = label, stderr = %stderr, "nft rejected the security-group ruleset; leaving the previous ruleset in place");
                    } else {
                        tracing::debug!(table = label, stderr = %stderr, "bridge-family SG ruleset not applied (kernel may lack nf_conntrack_bridge); same-subnet enforcement degraded to inet table only");
                    }
                }
                Err(e) => tracing::warn!(error = %e, table = label, "nft apply failed"),
            }
        }
        let n = subnets.len();
        // inet: cross-subnet routed enforcement (required). bridge: same-subnet
        // L2 enforcement that the inet forward hook misses for bridged frames.
        load_nft("inet fakecloud_ec2", &render_ruleset(subnets), n, true).await;
        load_nft(
            "bridge fakecloud_ec2_l2",
            &render_bridge_ruleset(subnets),
            n,
            false,
        )
        .await;
    }
304}
305
/// Handle to the backing-container runtime for EC2 instances.
///
/// Cloning is cheap and shares state: the instance table and the reconcile
/// lock both live behind `Arc`, so every clone sees the same instances and
/// serializes against the same lock.
#[derive(Debug, Clone)]
pub struct Ec2Runtime {
    /// Which backend (Docker/Podman or Kubernetes) backs instances.
    backend: InstanceBackend,
    /// Per-instance backing records, keyed by EC2 instance id, so the
    /// lifecycle operations and reset/shutdown teardown work without
    /// consulting service state.
    instances: Arc<RwLock<HashMap<String, InstanceRecord>>>,
    /// Host firewall enforcer for security groups + NACLs.
    firewall: FirewallEnforcer,
    /// Serializes firewall reconciles. Reconcile is fired from many concurrent
    /// background tasks (per SG/NACL/lifecycle event); without this, two
    /// reconciles built from divergent state could interleave so the k8s
    /// apply+prune of one deletes a policy the other just applied (bug-hunt
    /// 2026-06-18 finding 4.3). Holding it across the whole reconcile makes the
    /// last-started reconcile the last-applied for both backends.
    reconcile_lock: Arc<tokio::sync::Mutex<()>>,
}
323
324impl Ec2Runtime {
325    /// Construct the Docker/Podman backend. Returns `None` when no container
326    /// CLI is available — callers then run in metadata-only mode.
327    pub fn new() -> Option<Self> {
328        let cli = fakecloud_core::container_net::detect_container_cli()?;
329        Some(Self {
330            backend: InstanceBackend::Docker(DockerInstances {
331                cli,
332                instance_id: format!("fakecloud-{}", std::process::id()),
333            }),
334            instances: Arc::new(RwLock::new(HashMap::new())),
335            firewall: FirewallEnforcer::detect(),
336            reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
337        })
338    }
339
340    /// Construct the Kubernetes backend. `server_port` is fakecloud's bound
341    /// port (used when `FAKECLOUD_K8S_SELF_URL` omits one). Fails fast on
342    /// misconfiguration — never silently degrades to Docker.
343    pub async fn new_k8s(server_port: u16) -> Result<Self, BackendInitError> {
344        let backend = k8s::K8sInstances::from_env(server_port).await?;
345        Ok(Self {
346            backend: InstanceBackend::K8s(backend),
347            instances: Arc::new(RwLock::new(HashMap::new())),
348            // k8s isolation is a NetworkPolicy concern (phase 4), not host nft.
349            firewall: FirewallEnforcer::disabled(),
350            reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
351        })
352    }
353
    /// Borrow the firewall enforcer, so the control plane can skip building
    /// the model when enforcement is disabled and report the mode for
    /// introspection.
    pub fn firewall(&self) -> &FirewallEnforcer {
        &self.firewall
    }
359
360    /// Re-render and atomically apply the security-group/NACL ruleset for the
361    /// given per-subnet model. No-op (cheap) when enforcement is disabled.
362    /// Serialized against other reconciles (finding 4.3).
363    pub async fn reconcile_firewall(&self, subnets: Vec<SubnetFirewall>) {
364        let _guard = self.reconcile_lock.lock().await;
365        self.firewall.reconcile(&subnets).await;
366    }
367
368    /// Whether this runtime backs network isolation with real enforcement —
369    /// host nftables (Docker, opt-in) or k8s NetworkPolicy. Lets the control
370    /// plane skip building the firewall model entirely when neither applies.
371    pub fn network_isolation_enforced(&self) -> bool {
372        self.firewall.enabled() || self.is_k8s()
373    }
374
375    /// True for the Kubernetes backend (isolation via NetworkPolicy).
376    pub fn is_k8s(&self) -> bool {
377        matches!(self.backend, InstanceBackend::K8s(_))
378    }
379
380    /// Apply one NetworkPolicy per instance for the k8s backend. No-op on the
381    /// Docker backend (which uses nftables instead). Serialized against other
382    /// reconciles so a concurrent apply+prune can't delete a just-applied
383    /// policy (finding 4.3).
384    pub async fn reconcile_network_policies(&self, rules: Vec<InstanceRules>) {
385        if let InstanceBackend::K8s(k) = &self.backend {
386            let _guard = self.reconcile_lock.lock().await;
387            k.reconcile_network_policies(&rules).await;
388        }
389    }
390
391    /// A snapshot of how this runtime isolates instance traffic, for the
392    /// `/_fakecloud/ec2/instance-networks` introspection endpoint (#1745 ph5).
393    pub fn network_isolation_summary(&self) -> NetworkIsolationSummary {
394        match &self.backend {
395            InstanceBackend::Docker(d) => NetworkIsolationSummary {
396                backend: if fakecloud_core::container_net::is_podman_binary(&d.cli) {
397                    "podman"
398                } else {
399                    "docker"
400                },
401                sg_enforcement: match self.firewall.mode() {
402                    EnforcementMode::Nftables => "nftables",
403                    EnforcementMode::Disabled => "disabled",
404                },
405                enforced: self.firewall.enabled(),
406            },
407            InstanceBackend::K8s(k) => NetworkIsolationSummary {
408                backend: "kubernetes",
409                sg_enforcement: "networkpolicy",
410                // NetworkPolicies are always created; "enforced" reflects
411                // whether the detected CNI actually applies them.
412                enforced: k.cni_enforces(),
413            },
414        }
415    }
416
417    /// Name of the active backend, for logging.
418    pub fn cli_name(&self) -> &str {
419        match &self.backend {
420            InstanceBackend::Docker(d) => &d.cli,
421            InstanceBackend::K8s(_) => "kubernetes",
422        }
423    }
424
425    /// Boot a container for an instance. `user_data` is the base64-encoded
426    /// user-data as received on the wire (RunInstances `UserData`), run at
427    /// boot the way cloud-init would, if present.
428    pub async fn run_instance(
429        &self,
430        account_id: &str,
431        instance_id: &str,
432        user_data: Option<&str>,
433        tags: &BTreeMap<String, String>,
434        network: Option<&InstanceNetwork>,
435    ) -> Result<RunningInstance, RuntimeError> {
436        let image = default_image();
437        let running = match &self.backend {
438            // Docker attaches the container to the subnet's per-VPC bridge for
439            // L3 isolation. k8s pods share a flat network; isolation there is a
440            // NetworkPolicy concern handled separately (#1745 phase 4).
441            InstanceBackend::Docker(d) => {
442                d.run_instance(account_id, instance_id, &image, user_data, network)
443                    .await?
444            }
445            InstanceBackend::K8s(k) => k.spawn_pod(instance_id, &image, user_data, tags).await?,
446        };
447        self.instances.write().insert(
448            instance_id.to_string(),
449            InstanceRecord {
450                handle: running.container_id.clone(),
451                account_id: account_id.to_string(),
452                image,
453                user_data: user_data.map(str::to_string),
454                tags: tags.clone(),
455                network: network.cloned(),
456            },
457        );
458        Ok(running)
459    }
460
461    /// Stop an instance's backing container (maps to `StopInstances`).
462    /// Docker stops the container in place; k8s deletes the Pod (recreated
463    /// on the next `Start`).
464    pub async fn stop_instance(&self, instance_id: &str) {
465        let Some(handle) = self.handle_of(instance_id) else {
466            return;
467        };
468        match &self.backend {
469            InstanceBackend::Docker(d) => d.stop(&handle).await,
470            InstanceBackend::K8s(k) => k.delete_pod(&handle).await,
471        }
472    }
473
474    /// Start a previously-stopped instance (maps to `StartInstances`).
475    /// Returns the running container's (possibly new) handle and private IP.
476    /// Docker starts the existing container; k8s recreates the Pod under a new
477    /// unique name, so the handle changes — callers should persist it.
478    pub async fn start_instance(&self, instance_id: &str) -> Option<RunningInstance> {
479        let record = self.instances.read().get(instance_id)?.clone();
480        match &self.backend {
481            InstanceBackend::Docker(d) => {
482                // Same container; only the IP may change. The subnet network the
483                // container was created on persists across stop/start.
484                let private_ip = d.start(&record.handle).await?;
485                Some(RunningInstance {
486                    container_id: record.handle,
487                    private_ip,
488                    network: record
489                        .network
490                        .as_ref()
491                        .map(|n| subnet_network_name(&n.subnet_id)),
492                })
493            }
494            InstanceBackend::K8s(k) => {
495                let running = k
496                    .spawn_pod(
497                        instance_id,
498                        &record.image,
499                        record.user_data.as_deref(),
500                        &record.tags,
501                    )
502                    .await
503                    .ok()?;
504                self.update_handle(instance_id, &running.container_id);
505                Some(running)
506            }
507        }
508    }
509
510    /// Restart an instance's backing container (maps to `RebootInstances`).
511    /// Docker restarts in place; k8s deletes and recreates the Pod under a new
512    /// name. Returns the running container's handle + IP when it changed (k8s),
513    /// so callers can persist the new handle; `None` when nothing to update.
514    pub async fn reboot_instance(&self, instance_id: &str) -> Option<RunningInstance> {
515        let record = self.instances.read().get(instance_id).cloned()?;
516        match &self.backend {
517            InstanceBackend::Docker(d) => {
518                d.reboot(&record.handle).await;
519                None
520            }
521            InstanceBackend::K8s(k) => {
522                k.delete_pod(&record.handle).await;
523                let running = k
524                    .spawn_pod(
525                        instance_id,
526                        &record.image,
527                        record.user_data.as_deref(),
528                        &record.tags,
529                    )
530                    .await
531                    .ok()?;
532                self.update_handle(instance_id, &running.container_id);
533                Some(running)
534            }
535        }
536    }
537
538    /// Remove an instance's backing container (maps to `TerminateInstances`).
539    pub async fn terminate_instance(&self, instance_id: &str) {
540        let record = self.instances.write().remove(instance_id);
541        if let Some(record) = record {
542            match &self.backend {
543                InstanceBackend::Docker(d) => {
544                    d.remove(&record.handle).await;
545                    // Drop the durable root-disk volume so a later instance
546                    // reusing this id starts clean (terminate = volume gone,
547                    // matching a deleted EBS root volume). No-op when volumes
548                    // are disabled.
549                    d.remove_data_volume(&record.account_id, instance_id).await;
550                }
551                InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
552            }
553        }
554    }
555
556    /// Tear down every container this runtime spawned (used on reset and
557    /// shutdown). The Docker backend leans on the shared reaper for any
558    /// container it loses track of.
559    pub async fn stop_all(&self) {
560        let records: Vec<InstanceRecord> = {
561            let mut instances = self.instances.write();
562            instances.drain().map(|(_, r)| r).collect()
563        };
564        for record in records {
565            match &self.backend {
566                InstanceBackend::Docker(d) => d.remove(&record.handle).await,
567                InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
568            }
569        }
570    }
571
572    /// Sweep instance Pods orphaned by a previous fakecloud process (k8s
573    /// only; the Docker backend relies on the shared reaper).
574    pub async fn reap_stale(&self) {
575        if let InstanceBackend::K8s(k) = &self.backend {
576            k.reap_stale().await;
577        }
578    }
579
580    /// The backing container's console log — its combined stdout/stderr, which
581    /// includes anything user-data printed at boot (maps to `GetConsoleOutput`).
582    /// `None` for an unbacked instance or when logs can't be read.
583    pub async fn console_output(&self, instance_id: &str) -> Option<Vec<u8>> {
584        let handle = self.handle_of(instance_id)?;
585        match &self.backend {
586            InstanceBackend::Docker(d) => d.logs(&handle).await,
587            InstanceBackend::K8s(k) => k.logs(&handle).await,
588        }
589    }
590
591    fn handle_of(&self, instance_id: &str) -> Option<String> {
592        self.instances
593            .read()
594            .get(instance_id)
595            .map(|r| r.handle.clone())
596    }
597
598    fn update_handle(&self, instance_id: &str, handle: &str) {
599        if let Some(record) = self.instances.write().get_mut(instance_id) {
600            record.handle = handle.to_string();
601        }
602    }
603}
604
605fn default_image() -> String {
606    std::env::var(DEFAULT_IMAGE_ENV).unwrap_or_else(|_| DEFAULT_IMAGE.to_string())
607}
608
/// The in-instance directory backed by the durable data volume. Defaults to
/// `/var/lib/fakecloud/ec2`; override with `FAKECLOUD_EC2_INSTANCE_DATA_DIR`
/// to capture whichever path the instance's workload writes its long-lived
/// state to. This is fakecloud's persistent-instance-data convention rather
/// than a full root-filesystem snapshot: data written here survives restart
/// and stop/start; the rest of the container's ephemeral filesystem does not.
///
/// An override that is empty or whitespace-only is treated as unset: it
/// would otherwise produce a `volume:` mount spec with no target path.
fn instance_data_dir() -> String {
    std::env::var("FAKECLOUD_EC2_INSTANCE_DATA_DIR")
        .ok()
        .filter(|dir| !dir.trim().is_empty())
        .unwrap_or_else(|| "/var/lib/fakecloud/ec2".to_string())
}
619
/// Whether EC2 instance data should survive a fakecloud restart via a durable
/// named volume. OFF by default (ephemeral, keeping test/CI runs clean and
/// avoiding a stale volume bleeding into a later instance that reuses an id);
/// opt in with `FAKECLOUD_PERSIST_EC2_VOLUMES=1`. A follow-up flips this
/// default to on in persistent storage mode by exporting the same env var.
///
/// Truthy values are `1`, `true` and `yes`, compared case-insensitively and
/// with surrounding whitespace ignored, so `True` or `YES` behave the same
/// as `TRUE`. Anything else — including an unset or non-Unicode variable —
/// means disabled.
fn ec2_volumes_enabled() -> bool {
    const TRUTHY: [&str; 3] = ["1", "true", "yes"];
    std::env::var("FAKECLOUD_PERSIST_EC2_VOLUMES")
        .map(|raw| {
            let value = raw.trim();
            TRUTHY
                .iter()
                .any(|truthy| value.eq_ignore_ascii_case(truthy))
        })
        .unwrap_or(false)
}
630
/// Build the Docker volume name for an instance's data directory.
///
/// The name depends only on the account id and the instance id — not on the
/// per-process fakecloud instance id — so a container recreated after a
/// fakecloud restart reattaches the same volume. Any character outside
/// Docker's `[a-zA-Z0-9_.-]` volume-name set is replaced by `-`.
fn data_volume_name(account_id: &str, instance_id: &str) -> String {
    // Append `raw` to `out`, swapping every disallowed character for `-`.
    fn push_sanitized(out: &mut String, raw: &str) {
        out.extend(raw.chars().map(|ch| match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '.' | '-' => ch,
            _ => '-',
        }));
    }

    let mut name = String::from("fakecloud-ec2-data-");
    push_sanitized(&mut name, account_id);
    name.push('-');
    push_sanitized(&mut name, instance_id);
    name
}
653
/// Keep-alive command + user-data wrapper for a base image. Shared by both
/// backends so they boot identical containers. When `user_data` (base64) is
/// present it is decoded and run as a root shell script, backgrounded so a
/// slow script never blocks readiness, then the container tails forever.
///
/// `None` and the empty string both mean "no user-data" and yield a plain
/// `tail -f /dev/null`.
///
/// The user-data is embedded in a single-quoted shell word. Valid base64
/// never contains `'`, but the value arrives unvalidated from the wire, so
/// any `'` is escaped as `'\''`: malformed input then reaches `base64 -d`
/// as data (which rejects it) instead of terminating the quoted word and
/// being parsed as shell syntax. For valid base64 the script is unchanged.
fn boot_command(user_data: Option<&str>) -> Vec<String> {
    match user_data.filter(|s| !s.is_empty()) {
        Some(b64) => {
            // POSIX single-quote escaping: close the quote, emit an escaped
            // quote, reopen the quote.
            let quoted = b64.replace('\'', r"'\''");
            let script =
                format!("printf %s '{quoted}' | base64 -d | sh & exec tail -f /dev/null");
            vec!["sh".to_string(), "-c".to_string(), script]
        }
        None => vec![
            "tail".to_string(),
            "-f".to_string(),
            "/dev/null".to_string(),
        ],
    }
}
671
/// Docker/Podman backend: shells out to the container CLI.
#[derive(Debug, Clone)]
struct DockerInstances {
    /// The container CLI program to invoke for every operation.
    cli: String,
    /// Identifier of this fakecloud process (`fakecloud-<pid>`), attached to
    /// each container as the `fakecloud-instance` label.
    instance_id: String,
}
678
679impl DockerInstances {
680    async fn run_instance(
681        &self,
682        account_id: &str,
683        instance_id: &str,
684        image: &str,
685        user_data: Option<&str>,
686        network: Option<&InstanceNetwork>,
687    ) -> Result<RunningInstance, RuntimeError> {
688        // Ensure the subnet's bridge exists and attach to it for L3 isolation.
689        // Network creation is best-effort: on failure we fall back to the
690        // default bridge so the instance still boots (no regression vs today).
691        let attached_network = match network {
692            Some(net) => self.ensure_subnet_network(net).await,
693            None => None,
694        };
695
696        let mut args: Vec<String> = vec![
697            "run".to_string(),
698            "-d".to_string(),
699            "--label".to_string(),
700            format!("fakecloud-ec2={instance_id}"),
701            "--label".to_string(),
702            format!("fakecloud-instance={}", self.instance_id),
703        ];
704        // Optionally back the instance's writable data directory with a durable
705        // named volume keyed on account + instance id, so the filesystem state
706        // an instance writes there survives a fakecloud restart (the recovery
707        // path recreates the container, which reattaches the same volume) and a
708        // stop/start (Docker reuses the same container, so the volume persists
709        // regardless). OFF by default to keep test/CI runs ephemeral and avoid
710        // a stale volume bleeding into a later instance that reuses an id;
711        // enabled by `FAKECLOUD_PERSIST_EC2_VOLUMES=1` (and, in a follow-up,
712        // default-on in persistent storage mode). The volume is dropped on
713        // TerminateInstances. See [`data_volume_name`] / [`instance_data_dir`].
714        if ec2_volumes_enabled() {
715            args.push("-v".to_string());
716            args.push(format!(
717                "{}:{}",
718                data_volume_name(account_id, instance_id),
719                instance_data_dir()
720            ));
721        }
722        if let Some(name) = &attached_network {
723            args.push("--network".to_string());
724            args.push(name.clone());
725        }
726        args.push(image.to_string());
727        args.extend(boot_command(user_data));
728
729        let output = tokio::process::Command::new(&self.cli)
730            .args(&args)
731            .output()
732            .await
733            .map_err(|e| RuntimeError::ContainerStartFailed(e.to_string()))?;
734
735        if !output.status.success() {
736            return Err(RuntimeError::ContainerStartFailed(
737                String::from_utf8_lossy(&output.stderr).trim().to_string(),
738            ));
739        }
740
741        let container_id = String::from_utf8_lossy(&output.stdout).trim().to_string();
742        let private_ip = self
743            .inspect_ip(&container_id)
744            .await
745            .unwrap_or_else(|| "10.0.0.1".to_string());
746
747        Ok(RunningInstance {
748            container_id,
749            private_ip,
750            network: attached_network,
751        })
752    }
753
754    /// Create (idempotently) the daemon network backing a subnet and return its
755    /// name, or `None` if creation failed (caller falls back to the default
756    /// bridge). The network carries the shared `fakecloud-instance` ownership
757    /// label so the startup reaper prunes it after an ungraceful restart, plus
758    /// a `fakecloud-subnet=<id>` label for introspection. Private subnets get
759    /// an `--internal` network (no NAT to the host/internet).
760    async fn ensure_subnet_network(&self, net: &InstanceNetwork) -> Option<String> {
761        let name = subnet_network_name(&net.subnet_id);
762        let mut args = vec!["network".to_string(), "create".to_string()];
763        if net.internal {
764            args.push("--internal".to_string());
765        }
766        args.push("--label".to_string());
767        args.push(format!("fakecloud-subnet={}", net.subnet_id));
768        args.push("--label".to_string());
769        args.push(format!("fakecloud-instance={}", self.instance_id));
770        args.push(name.clone());
771
772        let output = tokio::process::Command::new(&self.cli)
773            .args(&args)
774            .output()
775            .await;
776        match output {
777            // Created fresh.
778            Ok(out) if out.status.success() => Some(name),
779            // Already exists (another instance in the same subnet created it):
780            // a benign race — the network is there, so attach to it.
781            Ok(out) => {
782                let err = String::from_utf8_lossy(&out.stderr);
783                if err.contains("already exists") || err.contains("exists") {
784                    Some(name)
785                } else {
786                    tracing::warn!(
787                        subnet = %net.subnet_id,
788                        network = %name,
789                        error = %err.trim(),
790                        "subnet network creation failed; falling back to default bridge"
791                    );
792                    None
793                }
794            }
795            Err(e) => {
796                tracing::warn!(
797                    subnet = %net.subnet_id,
798                    network = %name,
799                    error = %e,
800                    "subnet network creation failed; falling back to default bridge"
801                );
802                None
803            }
804        }
805    }
806
807    /// Read the container's private IP from `inspect`. Returns `None` if the
808    /// container has no address (e.g. host networking) — the caller falls
809    /// back to a synthesized IP.
810    async fn inspect_ip(&self, container_id: &str) -> Option<String> {
811        let output = tokio::process::Command::new(&self.cli)
812            .args([
813                "inspect",
814                "-f",
815                "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
816                container_id,
817            ])
818            .output()
819            .await
820            .ok()?;
821        if !output.status.success() {
822            return None;
823        }
824        let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
825        if ip.is_empty() {
826            None
827        } else {
828            Some(ip)
829        }
830    }
831
832    async fn stop(&self, container_id: &str) {
833        let _ = tokio::process::Command::new(&self.cli)
834            .args(["stop", container_id])
835            .output()
836            .await;
837    }
838
839    async fn start(&self, container_id: &str) -> Option<String> {
840        let started = tokio::process::Command::new(&self.cli)
841            .args(["start", container_id])
842            .output()
843            .await
844            .map(|o| o.status.success())
845            .unwrap_or(false);
846        if !started {
847            return None;
848        }
849        self.inspect_ip(container_id).await
850    }
851
852    async fn reboot(&self, container_id: &str) {
853        let _ = tokio::process::Command::new(&self.cli)
854            .args(["restart", container_id])
855            .output()
856            .await;
857    }
858
859    async fn remove(&self, container_id: &str) {
860        let _ = tokio::process::Command::new(&self.cli)
861            .args(["rm", "-f", container_id])
862            .output()
863            .await;
864    }
865
866    /// Remove the durable data volume for an instance (called on
867    /// `TerminateInstances`) so a later instance reusing the same id starts
868    /// clean rather than inheriting the terminated instance's filesystem. A
869    /// no-op when volume persistence is disabled (no such volume was created).
870    async fn remove_data_volume(&self, account_id: &str, instance_id: &str) {
871        if !ec2_volumes_enabled() {
872            return;
873        }
874        let _ = tokio::process::Command::new(&self.cli)
875            .args([
876                "volume",
877                "rm",
878                "-f",
879                &data_volume_name(account_id, instance_id),
880            ])
881            .output()
882            .await;
883    }
884
885    /// The container's combined stdout+stderr (`docker logs`). `None` if the
886    /// command fails; an empty log is `Some(vec![])`.
887    async fn logs(&self, container_id: &str) -> Option<Vec<u8>> {
888        let output = tokio::process::Command::new(&self.cli)
889            .args(["logs", container_id])
890            .output()
891            .await
892            .ok()?;
893        if !output.status.success() {
894            return None;
895        }
896        // `docker logs` writes the container's stdout to ours and its stderr to
897        // ours; concatenate so the console output carries both streams.
898        let mut buf = output.stdout;
899        buf.extend_from_slice(&output.stderr);
900        Some(buf)
901    }
902}
903
#[cfg(test)]
mod volume_tests {
    use super::*;

    /// The volume name is derived from account + instance id only (not the
    /// per-process instance id), and any character outside the allowed set
    /// is replaced with '-'.
    #[test]
    fn data_volume_name_is_stable_and_sanitized() {
        let cases = [
            // Already-clean ids pass through unchanged.
            (
                ("123456789012", "i-0abc123"),
                "fakecloud-ec2-data-123456789012-i-0abc123",
            ),
            // Characters outside Docker's volume-name set become '-'.
            (
                ("1234/5678", "i-0abc:1"),
                "fakecloud-ec2-data-1234-5678-i-0abc-1",
            ),
        ];
        for ((account, instance), expected) in cases {
            assert_eq!(data_volume_name(account, instance), expected);
        }
    }

    /// Two instances in the same account never share a data volume, so
    /// terminating one cannot wipe another's filesystem.
    #[test]
    fn distinct_instances_get_distinct_volumes() {
        let account = "123456789012";
        let first = data_volume_name(account, "i-aaaa");
        let second = data_volume_name(account, "i-bbbb");
        assert_ne!(first, second);
    }
}