Skip to main content

fakecloud_ec2/runtime/
mod.rs

1//! Backing-container runtime for EC2 instances.
2//!
3//! `RunInstances` spins a real container per instance; the instance
4//! lifecycle (`Start`/`Stop`/`Reboot`/`Terminate`) maps onto the container
5//! lifecycle, and `DescribeInstances` reports the container's real private
6//! IP. The container can run either as a local Docker/Podman container (the
7//! default) or as a native Kubernetes Pod (`FAKECLOUD_EC2_BACKEND=k8s` or the
8//! global `FAKECLOUD_CONTAINER_BACKEND=k8s`).
9//!
10//! Operations are keyed by **instance id**, not the backend handle: a
11//! Kubernetes Pod can't be stopped and restarted in place, so `Stop` deletes
12//! the Pod and `Start`/`Reboot` recreate it. The runtime therefore keeps,
13//! per instance, the handle plus enough of the original request (image,
14//! user-data) to recreate the backing container deterministically.
15//!
16//! The runtime is strictly additive: when no container backend is available
17//! the control plane keeps its metadata-faithful behaviour (synthesized IPs,
18//! state transitions) so every API call still succeeds. Real container
19//! backing is best-effort fidelity layered on top.
20
21pub mod firewall;
22mod k8s;
23pub mod netpolicy;
24
25use std::collections::{BTreeMap, HashMap};
26use std::sync::Arc;
27
28use parking_lot::RwLock;
29
30use firewall::{
31    render_ruleset, resolve_enforcement_mode, EnforcementMode, InstanceRules, SubnetFirewall,
32};
33
/// Image booted when [`DEFAULT_IMAGE_ENV`] is not set. AMIs don't map to a
/// concrete OS image, so a real Amazon Linux container stands in for every
/// instance by default. Whatever the image, the container is kept alive with
/// `tail -f /dev/null` (EC2 instances are long-running hosts, not one-shot
/// tasks); `tail` is preferred over `sleep infinity` so any base image works,
/// since busybox `sleep` rejects `infinity`.
const DEFAULT_IMAGE: &str = "amazonlinux:2023";

/// Environment variable that overrides [`DEFAULT_IMAGE`], e.g. to select a
/// lighter image in CI.
const DEFAULT_IMAGE_ENV: &str = "FAKECLOUD_EC2_DEFAULT_IMAGE";
43
44#[derive(Debug, thiserror::Error)]
45pub enum RuntimeError {
46    #[error("container failed to start: {0}")]
47    ContainerStartFailed(String),
48}
49
50/// Error initializing the Kubernetes backend at startup. Surfaced to the
51/// operator so a misconfigured cluster fails fast rather than silently
52/// falling back to Docker.
53#[derive(Debug, thiserror::Error)]
54pub enum BackendInitError {
55    #[error(transparent)]
56    Env(#[from] fakecloud_k8s::K8sEnvError),
57    #[error(transparent)]
58    PodConfig(#[from] fakecloud_k8s::K8sPodConfigError),
59    #[error("failed to connect to the Kubernetes cluster: {0}")]
60    Connect(String),
61}
62
/// The live backing container of an instance, as reported by the backend.
#[derive(Clone, Debug)]
pub struct RunningInstance {
    /// Handle understood by the backend: the Docker container id, or the
    /// name of the Pod.
    pub container_id: String,
    /// Private IP of the instance: the container's address on the daemon
    /// network (Docker) or the Pod IP (k8s).
    pub private_ip: String,
    /// The backing daemon network (`fakecloud-subnet-<id>`) the container is
    /// attached to. `None` means it ran on the default bridge, either because
    /// no network spec was given or because network creation failed and the
    /// runtime fell back. Exposed for introspection (#1745 phase 5).
    pub network: Option<String>,
}
77
/// Where an instance's backing container sits at L3: the subnet it joins and
/// whether that subnet is private.
///
/// One network per subnet yields the isolation #1745 asks for without extra
/// work: instances of the same subnet share a bridge and can reach each
/// other, while instances of different subnets / VPCs sit on different
/// bridges with no route between them.
#[derive(Clone, Debug)]
pub struct InstanceNetwork {
    /// Id of the EC2 subnet the instance was launched into.
    pub subnet_id: String,
    /// Set for a private subnet (no `0.0.0.0/0 -> igw` route): the backing
    /// network is then created `--internal`, i.e. without NAT to the
    /// host/internet.
    pub internal: bool,
}
92
/// Name of the daemon network that backs the EC2 subnet `subnet_id`.
///
/// The name depends on the subnet id alone, so every instance launched into
/// the same subnet resolves to (and attaches to) the same bridge.
pub fn subnet_network_name(subnet_id: &str) -> String {
    ["fakecloud-subnet-", subnet_id].concat()
}
98
/// Description of how the runtime isolates instance traffic. Returned by the
/// `/_fakecloud/ec2/instance-networks` introspection endpoint so a user can
/// work out "why can't X reach Y": which backend is in use, which
/// security-group mechanism applies, and whether that mechanism is really
/// active or has degraded to metadata-only.
#[derive(Clone, Debug)]
pub struct NetworkIsolationSummary {
    /// One of `docker`, `podman`, `kubernetes`.
    pub backend: &'static str,
    /// One of `nftables` (Docker host firewall), `networkpolicy` (k8s),
    /// `disabled`.
    pub sg_enforcement: &'static str,
    /// True when security-group rules are actually applied. When false the
    /// rules are tracked only (no `CAP_NET_ADMIN`, or a CNI that ignores
    /// NetworkPolicy); phase-2 L3 isolation still holds in that case.
    pub enforced: bool,
}
114
115/// What the runtime remembers per instance so it can drive the backing
116/// container's lifecycle and recreate it (k8s `Start`/`Reboot`).
117#[derive(Debug, Clone)]
118struct InstanceRecord {
119    /// Docker container id, or Pod name.
120    handle: String,
121    /// The owning account id, captured at `RunInstances`. Keys the durable
122    /// data volume (see [`data_volume_name`]) so `TerminateInstances` can
123    /// remove the right volume without re-consulting the control plane.
124    account_id: String,
125    /// Resolved base image, captured at `RunInstances` so a recreate is
126    /// identical even if `FAKECLOUD_EC2_DEFAULT_IMAGE` later changes.
127    image: String,
128    /// Base64 user-data to re-run on recreate, if any.
129    user_data: Option<String>,
130    /// The instance's tags, captured at `RunInstances`. Reserved
131    /// `fakecloud-k8s/*` entries drive per-instance Pod scheduling and must
132    /// survive a k8s `Start`/`Reboot` recreate, so they're stored here
133    /// rather than re-read from the control plane.
134    tags: BTreeMap<String, String>,
135    /// The instance's subnet placement, captured at `RunInstances` so a k8s
136    /// `Start`/`Reboot` recreate re-applies the same network and phase-5
137    /// introspection can report the backing network. `None` in metadata-only
138    /// network mode.
139    network: Option<InstanceNetwork>,
140}
141
142/// The selected backing-container backend.
143#[derive(Debug, Clone)]
144enum InstanceBackend {
145    Docker(DockerInstances),
146    K8s(k8s::K8sInstances),
147}
148
149/// Host firewall enforcement for security groups + NACLs (#1745 phase 3).
150///
151/// The network-driver abstraction the issue asks for: today there is one real
152/// driver (nftables) plus the degraded no-op, selected once at construction.
153/// Branching on podman vs docker isn't needed explicitly — rootless podman
154/// can't touch the host firewall, so the `nft list ruleset` capability probe
155/// already degrades it; rootful podman with netavark passes the same probe.
156#[derive(Debug, Clone)]
157pub struct FirewallEnforcer {
158    mode: EnforcementMode,
159}
160
161impl FirewallEnforcer {
162    /// Resolve the enforcement mode from `FAKECLOUD_EC2_SG_ENFORCEMENT` and an
163    /// `nft` capability probe, warning once when enforcement was requested but
164    /// can't be backed (so the operator knows it degraded, not silently).
165    fn detect() -> Self {
166        let requested = std::env::var("FAKECLOUD_EC2_SG_ENFORCEMENT").ok();
167        let mode = resolve_enforcement_mode(
168            requested.as_deref(),
169            firewall::host_shares_daemon_netns(),
170            firewall::nft_available,
171        );
172        if requested.is_some() && mode == EnforcementMode::Disabled {
173            tracing::warn!(
174                "EC2 security-group enforcement was requested but it can't take effect here \
175                 (needs nftables + CAP_NET_ADMIN on a native-Linux host whose daemon shares this \
176                 network namespace — Docker Desktop / podman-machine run the daemon in a VM); \
177                 falling back to metadata-only (phase-2 L3 isolation stays active, security-group \
178                 rules are tracked but not enforced)"
179            );
180        } else if mode == EnforcementMode::Nftables {
181            tracing::info!("EC2 security-group enforcement active via nftables");
182        }
183        Self { mode }
184    }
185
186    /// Disabled enforcer (k8s backend, or no container runtime).
187    fn disabled() -> Self {
188        Self {
189            mode: EnforcementMode::Disabled,
190        }
191    }
192
193    pub fn mode(&self) -> EnforcementMode {
194        self.mode
195    }
196
197    pub fn enabled(&self) -> bool {
198        self.mode != EnforcementMode::Disabled
199    }
200
201    /// Atomically swap in the rendered ruleset via `nft -f -`. No-op when
202    /// disabled. Best-effort: a failed apply logs and leaves the previous
203    /// ruleset in place rather than erroring the originating API call.
204    async fn reconcile(&self, subnets: &[SubnetFirewall]) {
205        if self.mode == EnforcementMode::Disabled {
206            return;
207        }
208        // Instances in the same subnet share one Linux bridge; their traffic is
209        // L2-switched and only traverses the `forward` chain (where our SG rules
210        // live) when bridge netfilter is enabled. Without this, same-subnet SG
211        // rules silently filter nothing — exactly what the real-packet E2E
212        // caught. Needs CAP_NET_ADMIN (which the enforcer holds) and the
213        // `modprobe`/`sysctl` binaries (shipped via kmod/procps in the image).
214        // Warn rather than swallow the error: a missing binary or a failed call
215        // means enforcement degrades to filtering nothing, and the operator who
216        // opted in deserves to know (bug-audit 2026-06-20, 0.B1).
217        match tokio::process::Command::new("modprobe")
218            .arg("br_netfilter")
219            .output()
220            .await
221        {
222            Ok(o) if o.status.success() => {}
223            Ok(o) => tracing::warn!(
224                stderr = %String::from_utf8_lossy(&o.stderr).trim(),
225                "modprobe br_netfilter failed; same-subnet security-group enforcement may filter nothing"
226            ),
227            Err(e) => tracing::warn!(
228                error = %e,
229                "could not run modprobe (is kmod installed?); same-subnet security-group enforcement may filter nothing"
230            ),
231        }
232        match tokio::process::Command::new("sysctl")
233            .args(["-w", "net.bridge.bridge-nf-call-iptables=1"])
234            .output()
235            .await
236        {
237            Ok(o) if o.status.success() => {}
238            Ok(o) => tracing::warn!(
239                stderr = %String::from_utf8_lossy(&o.stderr).trim(),
240                "sysctl bridge-nf-call-iptables=1 failed; same-subnet security-group enforcement may filter nothing"
241            ),
242            Err(e) => tracing::warn!(
243                error = %e,
244                "could not run sysctl (is procps installed?); same-subnet security-group enforcement may filter nothing"
245            ),
246        }
247        let ruleset = render_ruleset(subnets);
248        use tokio::io::AsyncWriteExt;
249        let mut child = match tokio::process::Command::new("nft")
250            .args(["-f", "-"])
251            .stdin(std::process::Stdio::piped())
252            .stdout(std::process::Stdio::null())
253            .stderr(std::process::Stdio::piped())
254            .spawn()
255        {
256            Ok(c) => c,
257            Err(e) => {
258                tracing::warn!(error = %e, "failed to spawn nft; security-group ruleset not applied");
259                return;
260            }
261        };
262        if let Some(mut stdin) = child.stdin.take() {
263            let _ = stdin.write_all(ruleset.as_bytes()).await;
264            let _ = stdin.shutdown().await;
265        }
266        match child.wait_with_output().await {
267            Ok(out) if out.status.success() => {
268                tracing::debug!(
269                    subnets = subnets.len(),
270                    "applied EC2 security-group nft ruleset"
271                );
272            }
273            Ok(out) => {
274                tracing::warn!(
275                    stderr = %String::from_utf8_lossy(&out.stderr).trim(),
276                    "nft rejected the security-group ruleset; leaving the previous ruleset in place"
277                );
278            }
279            Err(e) => tracing::warn!(error = %e, "nft apply failed"),
280        }
281    }
282}
283
284#[derive(Debug, Clone)]
285pub struct Ec2Runtime {
286    backend: InstanceBackend,
287    /// Per-instance backing records, keyed by EC2 instance id, so the
288    /// lifecycle operations and reset/shutdown teardown work without
289    /// consulting service state.
290    instances: Arc<RwLock<HashMap<String, InstanceRecord>>>,
291    /// Host firewall enforcer for security groups + NACLs.
292    firewall: FirewallEnforcer,
293    /// Serializes firewall reconciles. Reconcile is fired from many concurrent
294    /// background tasks (per SG/NACL/lifecycle event); without this, two
295    /// reconciles built from divergent state could interleave so the k8s
296    /// apply+prune of one deletes a policy the other just applied (bug-hunt
297    /// 2026-06-18 finding 4.3). Holding it across the whole reconcile makes the
298    /// last-started reconcile the last-applied for both backends.
299    reconcile_lock: Arc<tokio::sync::Mutex<()>>,
300}
301
302impl Ec2Runtime {
303    /// Construct the Docker/Podman backend. Returns `None` when no container
304    /// CLI is available — callers then run in metadata-only mode.
305    pub fn new() -> Option<Self> {
306        let cli = fakecloud_core::container_net::detect_container_cli()?;
307        Some(Self {
308            backend: InstanceBackend::Docker(DockerInstances {
309                cli,
310                instance_id: format!("fakecloud-{}", std::process::id()),
311            }),
312            instances: Arc::new(RwLock::new(HashMap::new())),
313            firewall: FirewallEnforcer::detect(),
314            reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
315        })
316    }
317
318    /// Construct the Kubernetes backend. `server_port` is fakecloud's bound
319    /// port (used when `FAKECLOUD_K8S_SELF_URL` omits one). Fails fast on
320    /// misconfiguration — never silently degrades to Docker.
321    pub async fn new_k8s(server_port: u16) -> Result<Self, BackendInitError> {
322        let backend = k8s::K8sInstances::from_env(server_port).await?;
323        Ok(Self {
324            backend: InstanceBackend::K8s(backend),
325            instances: Arc::new(RwLock::new(HashMap::new())),
326            // k8s isolation is a NetworkPolicy concern (phase 4), not host nft.
327            firewall: FirewallEnforcer::disabled(),
328            reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
329        })
330    }
331
332    /// The firewall enforcer, so the control plane can skip building the model
333    /// when enforcement is disabled and report the mode for introspection.
334    pub fn firewall(&self) -> &FirewallEnforcer {
335        &self.firewall
336    }
337
338    /// Re-render and atomically apply the security-group/NACL ruleset for the
339    /// given per-subnet model. No-op (cheap) when enforcement is disabled.
340    /// Serialized against other reconciles (finding 4.3).
341    pub async fn reconcile_firewall(&self, subnets: Vec<SubnetFirewall>) {
342        let _guard = self.reconcile_lock.lock().await;
343        self.firewall.reconcile(&subnets).await;
344    }
345
346    /// Whether this runtime backs network isolation with real enforcement —
347    /// host nftables (Docker, opt-in) or k8s NetworkPolicy. Lets the control
348    /// plane skip building the firewall model entirely when neither applies.
349    pub fn network_isolation_enforced(&self) -> bool {
350        self.firewall.enabled() || self.is_k8s()
351    }
352
353    /// True for the Kubernetes backend (isolation via NetworkPolicy).
354    pub fn is_k8s(&self) -> bool {
355        matches!(self.backend, InstanceBackend::K8s(_))
356    }
357
358    /// Apply one NetworkPolicy per instance for the k8s backend. No-op on the
359    /// Docker backend (which uses nftables instead). Serialized against other
360    /// reconciles so a concurrent apply+prune can't delete a just-applied
361    /// policy (finding 4.3).
362    pub async fn reconcile_network_policies(&self, rules: Vec<InstanceRules>) {
363        if let InstanceBackend::K8s(k) = &self.backend {
364            let _guard = self.reconcile_lock.lock().await;
365            k.reconcile_network_policies(&rules).await;
366        }
367    }
368
369    /// A snapshot of how this runtime isolates instance traffic, for the
370    /// `/_fakecloud/ec2/instance-networks` introspection endpoint (#1745 ph5).
371    pub fn network_isolation_summary(&self) -> NetworkIsolationSummary {
372        match &self.backend {
373            InstanceBackend::Docker(d) => NetworkIsolationSummary {
374                backend: if fakecloud_core::container_net::is_podman_binary(&d.cli) {
375                    "podman"
376                } else {
377                    "docker"
378                },
379                sg_enforcement: match self.firewall.mode() {
380                    EnforcementMode::Nftables => "nftables",
381                    EnforcementMode::Disabled => "disabled",
382                },
383                enforced: self.firewall.enabled(),
384            },
385            InstanceBackend::K8s(k) => NetworkIsolationSummary {
386                backend: "kubernetes",
387                sg_enforcement: "networkpolicy",
388                // NetworkPolicies are always created; "enforced" reflects
389                // whether the detected CNI actually applies them.
390                enforced: k.cni_enforces(),
391            },
392        }
393    }
394
395    /// Name of the active backend, for logging.
396    pub fn cli_name(&self) -> &str {
397        match &self.backend {
398            InstanceBackend::Docker(d) => &d.cli,
399            InstanceBackend::K8s(_) => "kubernetes",
400        }
401    }
402
403    /// Boot a container for an instance. `user_data` is the base64-encoded
404    /// user-data as received on the wire (RunInstances `UserData`), run at
405    /// boot the way cloud-init would, if present.
406    pub async fn run_instance(
407        &self,
408        account_id: &str,
409        instance_id: &str,
410        user_data: Option<&str>,
411        tags: &BTreeMap<String, String>,
412        network: Option<&InstanceNetwork>,
413    ) -> Result<RunningInstance, RuntimeError> {
414        let image = default_image();
415        let running = match &self.backend {
416            // Docker attaches the container to the subnet's per-VPC bridge for
417            // L3 isolation. k8s pods share a flat network; isolation there is a
418            // NetworkPolicy concern handled separately (#1745 phase 4).
419            InstanceBackend::Docker(d) => {
420                d.run_instance(account_id, instance_id, &image, user_data, network)
421                    .await?
422            }
423            InstanceBackend::K8s(k) => k.spawn_pod(instance_id, &image, user_data, tags).await?,
424        };
425        self.instances.write().insert(
426            instance_id.to_string(),
427            InstanceRecord {
428                handle: running.container_id.clone(),
429                account_id: account_id.to_string(),
430                image,
431                user_data: user_data.map(str::to_string),
432                tags: tags.clone(),
433                network: network.cloned(),
434            },
435        );
436        Ok(running)
437    }
438
439    /// Stop an instance's backing container (maps to `StopInstances`).
440    /// Docker stops the container in place; k8s deletes the Pod (recreated
441    /// on the next `Start`).
442    pub async fn stop_instance(&self, instance_id: &str) {
443        let Some(handle) = self.handle_of(instance_id) else {
444            return;
445        };
446        match &self.backend {
447            InstanceBackend::Docker(d) => d.stop(&handle).await,
448            InstanceBackend::K8s(k) => k.delete_pod(&handle).await,
449        }
450    }
451
452    /// Start a previously-stopped instance (maps to `StartInstances`).
453    /// Returns the running container's (possibly new) handle and private IP.
454    /// Docker starts the existing container; k8s recreates the Pod under a new
455    /// unique name, so the handle changes — callers should persist it.
456    pub async fn start_instance(&self, instance_id: &str) -> Option<RunningInstance> {
457        let record = self.instances.read().get(instance_id)?.clone();
458        match &self.backend {
459            InstanceBackend::Docker(d) => {
460                // Same container; only the IP may change. The subnet network the
461                // container was created on persists across stop/start.
462                let private_ip = d.start(&record.handle).await?;
463                Some(RunningInstance {
464                    container_id: record.handle,
465                    private_ip,
466                    network: record
467                        .network
468                        .as_ref()
469                        .map(|n| subnet_network_name(&n.subnet_id)),
470                })
471            }
472            InstanceBackend::K8s(k) => {
473                let running = k
474                    .spawn_pod(
475                        instance_id,
476                        &record.image,
477                        record.user_data.as_deref(),
478                        &record.tags,
479                    )
480                    .await
481                    .ok()?;
482                self.update_handle(instance_id, &running.container_id);
483                Some(running)
484            }
485        }
486    }
487
488    /// Restart an instance's backing container (maps to `RebootInstances`).
489    /// Docker restarts in place; k8s deletes and recreates the Pod under a new
490    /// name. Returns the running container's handle + IP when it changed (k8s),
491    /// so callers can persist the new handle; `None` when nothing to update.
492    pub async fn reboot_instance(&self, instance_id: &str) -> Option<RunningInstance> {
493        let record = self.instances.read().get(instance_id).cloned()?;
494        match &self.backend {
495            InstanceBackend::Docker(d) => {
496                d.reboot(&record.handle).await;
497                None
498            }
499            InstanceBackend::K8s(k) => {
500                k.delete_pod(&record.handle).await;
501                let running = k
502                    .spawn_pod(
503                        instance_id,
504                        &record.image,
505                        record.user_data.as_deref(),
506                        &record.tags,
507                    )
508                    .await
509                    .ok()?;
510                self.update_handle(instance_id, &running.container_id);
511                Some(running)
512            }
513        }
514    }
515
516    /// Remove an instance's backing container (maps to `TerminateInstances`).
517    pub async fn terminate_instance(&self, instance_id: &str) {
518        let record = self.instances.write().remove(instance_id);
519        if let Some(record) = record {
520            match &self.backend {
521                InstanceBackend::Docker(d) => {
522                    d.remove(&record.handle).await;
523                    // Drop the durable root-disk volume so a later instance
524                    // reusing this id starts clean (terminate = volume gone,
525                    // matching a deleted EBS root volume). No-op when volumes
526                    // are disabled.
527                    d.remove_data_volume(&record.account_id, instance_id).await;
528                }
529                InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
530            }
531        }
532    }
533
534    /// Tear down every container this runtime spawned (used on reset and
535    /// shutdown). The Docker backend leans on the shared reaper for any
536    /// container it loses track of.
537    pub async fn stop_all(&self) {
538        let records: Vec<InstanceRecord> = {
539            let mut instances = self.instances.write();
540            instances.drain().map(|(_, r)| r).collect()
541        };
542        for record in records {
543            match &self.backend {
544                InstanceBackend::Docker(d) => d.remove(&record.handle).await,
545                InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
546            }
547        }
548    }
549
550    /// Sweep instance Pods orphaned by a previous fakecloud process (k8s
551    /// only; the Docker backend relies on the shared reaper).
552    pub async fn reap_stale(&self) {
553        if let InstanceBackend::K8s(k) = &self.backend {
554            k.reap_stale().await;
555        }
556    }
557
558    /// The backing container's console log — its combined stdout/stderr, which
559    /// includes anything user-data printed at boot (maps to `GetConsoleOutput`).
560    /// `None` for an unbacked instance or when logs can't be read.
561    pub async fn console_output(&self, instance_id: &str) -> Option<Vec<u8>> {
562        let handle = self.handle_of(instance_id)?;
563        match &self.backend {
564            InstanceBackend::Docker(d) => d.logs(&handle).await,
565            InstanceBackend::K8s(k) => k.logs(&handle).await,
566        }
567    }
568
569    fn handle_of(&self, instance_id: &str) -> Option<String> {
570        self.instances
571            .read()
572            .get(instance_id)
573            .map(|r| r.handle.clone())
574    }
575
576    fn update_handle(&self, instance_id: &str, handle: &str) {
577        if let Some(record) = self.instances.write().get_mut(instance_id) {
578            record.handle = handle.to_string();
579        }
580    }
581}
582
583fn default_image() -> String {
584    std::env::var(DEFAULT_IMAGE_ENV).unwrap_or_else(|_| DEFAULT_IMAGE.to_string())
585}
586
/// The in-instance directory backed by the durable data volume. Defaults to
/// `/var/lib/fakecloud/ec2`; override with `FAKECLOUD_EC2_INSTANCE_DATA_DIR`
/// to capture whichever path the instance's workload writes its long-lived
/// state to. This is fakecloud's persistent-instance-data convention rather
/// than a full root-filesystem snapshot: data written here survives restart
/// and stop/start; the rest of the container's ephemeral filesystem does not.
///
/// An unset, non-Unicode or blank override falls back to the default: a blank
/// path would otherwise yield a `volume:` mount spec with no target.
fn instance_data_dir() -> String {
    std::env::var("FAKECLOUD_EC2_INSTANCE_DATA_DIR")
        .ok()
        .filter(|dir| !dir.trim().is_empty())
        .unwrap_or_else(|| "/var/lib/fakecloud/ec2".to_string())
}
597
/// Whether EC2 instance data should survive a fakecloud restart via a durable
/// named volume. OFF by default (ephemeral, keeping test/CI runs clean and
/// avoiding a stale volume bleeding into a later instance that reuses an id);
/// opt in with `FAKECLOUD_PERSIST_EC2_VOLUMES=1`. A follow-up flips this
/// default to on in persistent storage mode by exporting the same env var.
///
/// Accepted truthy values are `1`, `true` and `yes`, compared
/// case-insensitively after trimming surrounding whitespace. Anything else,
/// including an unset or non-Unicode variable, means disabled.
fn ec2_volumes_enabled() -> bool {
    const TRUTHY: [&str; 3] = ["1", "true", "yes"];
    std::env::var("FAKECLOUD_PERSIST_EC2_VOLUMES")
        .map(|raw| {
            let value = raw.trim();
            TRUTHY
                .iter()
                .any(|truthy| value.eq_ignore_ascii_case(truthy))
        })
        .unwrap_or(false)
}
608
/// Build the Docker volume name that holds an instance's data directory.
///
/// The name is derived from the account id and the instance id only (NOT from
/// the per-process fakecloud instance id), so the same volume is found again
/// when a fakecloud restart recreates the container. Every character outside
/// Docker's `[a-zA-Z0-9_.-]` volume-name set is replaced by `-`.
fn data_volume_name(account_id: &str, instance_id: &str) -> String {
    /// Map each character outside the allowed set to `-`.
    fn sanitized(raw: &str) -> String {
        let mut out = String::with_capacity(raw.len());
        for ch in raw.chars() {
            let allowed = ch.is_ascii_alphanumeric() || matches!(ch, '_' | '.' | '-');
            out.push(if allowed { ch } else { '-' });
        }
        out
    }

    let account = sanitized(account_id);
    let instance = sanitized(instance_id);
    format!("fakecloud-ec2-data-{account}-{instance}")
}
631
/// Keep-alive command + user-data wrapper for a base image. Shared by both
/// backends so they boot identical containers. When `user_data` (base64) is
/// present and non-empty it is decoded and run as a shell script,
/// backgrounded so a slow script never blocks readiness, then the container
/// tails forever. Without user-data the command is just `tail -f /dev/null`.
///
/// The user-data is spliced into a single-quoted shell word. It arrives from
/// the wire and is not guaranteed to be well-formed base64, so any `'` in it
/// is escaped (`'\''`); otherwise a stray quote would end the word early and
/// break (or rewrite) the boot command. Valid base64 never contains `'`, so
/// the command produced for valid input is unchanged.
fn boot_command(user_data: Option<&str>) -> Vec<String> {
    match user_data.filter(|s| !s.is_empty()) {
        Some(b64) => {
            let quoted = b64.replace('\'', r"'\''");
            let script =
                format!("printf %s '{quoted}' | base64 -d | sh & exec tail -f /dev/null");
            vec!["sh".to_string(), "-c".to_string(), script]
        }
        None => vec![
            "tail".to_string(),
            "-f".to_string(),
            "/dev/null".to_string(),
        ],
    }
}
649
/// The Docker/Podman backend. It works by shelling out to the container CLI.
#[derive(Clone, Debug)]
struct DockerInstances {
    /// The container CLI binary to invoke.
    cli: String,
    /// Value stamped on every container as the `fakecloud-instance` label.
    instance_id: String,
}
656
657impl DockerInstances {
658    async fn run_instance(
659        &self,
660        account_id: &str,
661        instance_id: &str,
662        image: &str,
663        user_data: Option<&str>,
664        network: Option<&InstanceNetwork>,
665    ) -> Result<RunningInstance, RuntimeError> {
666        // Ensure the subnet's bridge exists and attach to it for L3 isolation.
667        // Network creation is best-effort: on failure we fall back to the
668        // default bridge so the instance still boots (no regression vs today).
669        let attached_network = match network {
670            Some(net) => self.ensure_subnet_network(net).await,
671            None => None,
672        };
673
674        let mut args: Vec<String> = vec![
675            "run".to_string(),
676            "-d".to_string(),
677            "--label".to_string(),
678            format!("fakecloud-ec2={instance_id}"),
679            "--label".to_string(),
680            format!("fakecloud-instance={}", self.instance_id),
681        ];
682        // Optionally back the instance's writable data directory with a durable
683        // named volume keyed on account + instance id, so the filesystem state
684        // an instance writes there survives a fakecloud restart (the recovery
685        // path recreates the container, which reattaches the same volume) and a
686        // stop/start (Docker reuses the same container, so the volume persists
687        // regardless). OFF by default to keep test/CI runs ephemeral and avoid
688        // a stale volume bleeding into a later instance that reuses an id;
689        // enabled by `FAKECLOUD_PERSIST_EC2_VOLUMES=1` (and, in a follow-up,
690        // default-on in persistent storage mode). The volume is dropped on
691        // TerminateInstances. See [`data_volume_name`] / [`instance_data_dir`].
692        if ec2_volumes_enabled() {
693            args.push("-v".to_string());
694            args.push(format!(
695                "{}:{}",
696                data_volume_name(account_id, instance_id),
697                instance_data_dir()
698            ));
699        }
700        if let Some(name) = &attached_network {
701            args.push("--network".to_string());
702            args.push(name.clone());
703        }
704        args.push(image.to_string());
705        args.extend(boot_command(user_data));
706
707        let output = tokio::process::Command::new(&self.cli)
708            .args(&args)
709            .output()
710            .await
711            .map_err(|e| RuntimeError::ContainerStartFailed(e.to_string()))?;
712
713        if !output.status.success() {
714            return Err(RuntimeError::ContainerStartFailed(
715                String::from_utf8_lossy(&output.stderr).trim().to_string(),
716            ));
717        }
718
719        let container_id = String::from_utf8_lossy(&output.stdout).trim().to_string();
720        let private_ip = self
721            .inspect_ip(&container_id)
722            .await
723            .unwrap_or_else(|| "10.0.0.1".to_string());
724
725        Ok(RunningInstance {
726            container_id,
727            private_ip,
728            network: attached_network,
729        })
730    }
731
732    /// Create (idempotently) the daemon network backing a subnet and return its
733    /// name, or `None` if creation failed (caller falls back to the default
734    /// bridge). The network carries the shared `fakecloud-instance` ownership
735    /// label so the startup reaper prunes it after an ungraceful restart, plus
736    /// a `fakecloud-subnet=<id>` label for introspection. Private subnets get
737    /// an `--internal` network (no NAT to the host/internet).
738    async fn ensure_subnet_network(&self, net: &InstanceNetwork) -> Option<String> {
739        let name = subnet_network_name(&net.subnet_id);
740        let mut args = vec!["network".to_string(), "create".to_string()];
741        if net.internal {
742            args.push("--internal".to_string());
743        }
744        args.push("--label".to_string());
745        args.push(format!("fakecloud-subnet={}", net.subnet_id));
746        args.push("--label".to_string());
747        args.push(format!("fakecloud-instance={}", self.instance_id));
748        args.push(name.clone());
749
750        let output = tokio::process::Command::new(&self.cli)
751            .args(&args)
752            .output()
753            .await;
754        match output {
755            // Created fresh.
756            Ok(out) if out.status.success() => Some(name),
757            // Already exists (another instance in the same subnet created it):
758            // a benign race — the network is there, so attach to it.
759            Ok(out) => {
760                let err = String::from_utf8_lossy(&out.stderr);
761                if err.contains("already exists") || err.contains("exists") {
762                    Some(name)
763                } else {
764                    tracing::warn!(
765                        subnet = %net.subnet_id,
766                        network = %name,
767                        error = %err.trim(),
768                        "subnet network creation failed; falling back to default bridge"
769                    );
770                    None
771                }
772            }
773            Err(e) => {
774                tracing::warn!(
775                    subnet = %net.subnet_id,
776                    network = %name,
777                    error = %e,
778                    "subnet network creation failed; falling back to default bridge"
779                );
780                None
781            }
782        }
783    }
784
785    /// Read the container's private IP from `inspect`. Returns `None` if the
786    /// container has no address (e.g. host networking) — the caller falls
787    /// back to a synthesized IP.
788    async fn inspect_ip(&self, container_id: &str) -> Option<String> {
789        let output = tokio::process::Command::new(&self.cli)
790            .args([
791                "inspect",
792                "-f",
793                "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
794                container_id,
795            ])
796            .output()
797            .await
798            .ok()?;
799        if !output.status.success() {
800            return None;
801        }
802        let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
803        if ip.is_empty() {
804            None
805        } else {
806            Some(ip)
807        }
808    }
809
810    async fn stop(&self, container_id: &str) {
811        let _ = tokio::process::Command::new(&self.cli)
812            .args(["stop", container_id])
813            .output()
814            .await;
815    }
816
817    async fn start(&self, container_id: &str) -> Option<String> {
818        let started = tokio::process::Command::new(&self.cli)
819            .args(["start", container_id])
820            .output()
821            .await
822            .map(|o| o.status.success())
823            .unwrap_or(false);
824        if !started {
825            return None;
826        }
827        self.inspect_ip(container_id).await
828    }
829
830    async fn reboot(&self, container_id: &str) {
831        let _ = tokio::process::Command::new(&self.cli)
832            .args(["restart", container_id])
833            .output()
834            .await;
835    }
836
837    async fn remove(&self, container_id: &str) {
838        let _ = tokio::process::Command::new(&self.cli)
839            .args(["rm", "-f", container_id])
840            .output()
841            .await;
842    }
843
844    /// Remove the durable data volume for an instance (called on
845    /// `TerminateInstances`) so a later instance reusing the same id starts
846    /// clean rather than inheriting the terminated instance's filesystem. A
847    /// no-op when volume persistence is disabled (no such volume was created).
848    async fn remove_data_volume(&self, account_id: &str, instance_id: &str) {
849        if !ec2_volumes_enabled() {
850            return;
851        }
852        let _ = tokio::process::Command::new(&self.cli)
853            .args([
854                "volume",
855                "rm",
856                "-f",
857                &data_volume_name(account_id, instance_id),
858            ])
859            .output()
860            .await;
861    }
862
863    /// The container's combined stdout+stderr (`docker logs`). `None` if the
864    /// command fails; an empty log is `Some(vec![])`.
865    async fn logs(&self, container_id: &str) -> Option<Vec<u8>> {
866        let output = tokio::process::Command::new(&self.cli)
867            .args(["logs", container_id])
868            .output()
869            .await
870            .ok()?;
871        if !output.status.success() {
872            return None;
873        }
874        // `docker logs` writes the container's stdout to ours and its stderr to
875        // ours; concatenate so the console output carries both streams.
876        let mut buf = output.stdout;
877        buf.extend_from_slice(&output.stderr);
878        Some(buf)
879    }
880}
881
#[cfg(test)]
mod volume_tests {
    use super::*;

    /// The volume name is a pure function of account + instance id (not of
    /// the per-process instance id), so recovery reattaches the same volume;
    /// characters outside Docker's volume-name set are replaced by '-'.
    #[test]
    fn data_volume_name_is_stable_and_sanitized() {
        // (account id, instance id, expected volume name)
        let cases = [
            (
                "123456789012",
                "i-0abc123",
                "fakecloud-ec2-data-123456789012-i-0abc123",
            ),
            // '/' and ':' are not valid in a volume name and become '-'.
            (
                "1234/5678",
                "i-0abc:1",
                "fakecloud-ec2-data-1234-5678-i-0abc-1",
            ),
        ];
        for (account, instance, expected) in cases {
            assert_eq!(data_volume_name(account, instance), expected);
        }
    }

    /// Two instances in the same account never share a data volume, so
    /// terminating one cannot wipe another's filesystem.
    #[test]
    fn distinct_instances_get_distinct_volumes() {
        let first = data_volume_name("123456789012", "i-aaaa");
        let second = data_volume_name("123456789012", "i-bbbb");
        assert_ne!(first, second);
    }
}