fakecloud_ec2/runtime/mod.rs
1//! Backing-container runtime for EC2 instances.
2//!
3//! `RunInstances` spins a real container per instance; the instance
4//! lifecycle (`Start`/`Stop`/`Reboot`/`Terminate`) maps onto the container
5//! lifecycle, and `DescribeInstances` reports the container's real private
6//! IP. The container can run either as a local Docker/Podman container (the
7//! default) or as a native Kubernetes Pod (`FAKECLOUD_EC2_BACKEND=k8s` or the
8//! global `FAKECLOUD_CONTAINER_BACKEND=k8s`).
9//!
10//! Operations are keyed by **instance id**, not the backend handle: a
11//! Kubernetes Pod can't be stopped and restarted in place, so `Stop` deletes
12//! the Pod and `Start`/`Reboot` recreate it. The runtime therefore keeps,
13//! per instance, the handle plus enough of the original request (image,
14//! user-data) to recreate the backing container deterministically.
15//!
16//! The runtime is strictly additive: when no container backend is available
17//! the control plane keeps its metadata-faithful behaviour (synthesized IPs,
18//! state transitions) so every API call still succeeds. Real container
19//! backing is best-effort fidelity layered on top.
20
21pub mod firewall;
22mod k8s;
23pub mod netpolicy;
24
25use std::collections::{BTreeMap, HashMap};
26use std::sync::Arc;
27
28use parking_lot::RwLock;
29
30use firewall::{
31 render_bridge_ruleset, render_ruleset, resolve_enforcement_mode, EnforcementMode,
32 InstanceRules, SubnetFirewall,
33};
34
/// Name of the environment variable that overrides the base image an
/// instance's container runs (for example, to use a lighter image in CI).
const DEFAULT_IMAGE_ENV: &str = "FAKECLOUD_EC2_DEFAULT_IMAGE";
/// Base image booted when [`DEFAULT_IMAGE_ENV`] is not set. AMIs don't map to
/// a concrete OS image, so a real Amazon Linux container is used by default.
///
/// Whatever image is chosen, the container is kept alive with
/// `tail -f /dev/null`: EC2 instances are long-running hosts, not one-shot
/// tasks. `tail` is preferred over `sleep infinity` so that any base image
/// works (busybox `sleep` rejects `infinity`).
const DEFAULT_IMAGE: &str = "amazonlinux:2023";
44
45#[derive(Debug, thiserror::Error)]
46pub enum RuntimeError {
47 #[error("container failed to start: {0}")]
48 ContainerStartFailed(String),
49}
50
51/// Error initializing the Kubernetes backend at startup. Surfaced to the
52/// operator so a misconfigured cluster fails fast rather than silently
53/// falling back to Docker.
54#[derive(Debug, thiserror::Error)]
55pub enum BackendInitError {
56 #[error(transparent)]
57 Env(#[from] fakecloud_k8s::K8sEnvError),
58 #[error(transparent)]
59 PodConfig(#[from] fakecloud_k8s::K8sPodConfigError),
60 #[error("failed to connect to the Kubernetes cluster: {0}")]
61 Connect(String),
62}
63
/// Description of the container currently backing a running instance.
#[derive(Debug, Clone)]
pub struct RunningInstance {
    /// Handle the backend uses for the container: a Docker container id, or
    /// a Pod name on Kubernetes.
    pub container_id: String,
    /// Private IP reported for the instance: the container's address on the
    /// daemon network (Docker) or the Pod IP (k8s).
    pub private_ip: String,
    /// The backing daemon network the container was attached to
    /// (`fakecloud-subnet-<id>`). `None` means it ran on the default bridge,
    /// either because no network spec was given or because network creation
    /// failed and the runtime fell back. Exposed for introspection
    /// (#1745 phase 5).
    pub network: Option<String>,
}
78
/// Where an instance's backing container sits at L3: the subnet it joins and
/// whether that subnet is private.
///
/// One daemon network per subnet provides the isolation #1745 asks for
/// without extra machinery: instances in the same subnet share a bridge and
/// can reach each other, while instances in different subnets / VPCs end up
/// on different bridges and cannot route to one another.
#[derive(Debug, Clone)]
pub struct InstanceNetwork {
    /// EC2 subnet id the instance was launched into.
    pub subnet_id: String,
    /// `true` for a private subnet (no `0.0.0.0/0 -> igw` route); the backing
    /// network is then created `--internal`, i.e. without NAT to the host or
    /// the internet.
    pub internal: bool,
}
93
/// Name of the daemon network that backs the given EC2 subnet.
///
/// The name depends only on `subnet_id`, so every instance in a subnet
/// resolves to (and attaches to) the same bridge.
pub fn subnet_network_name(subnet_id: &str) -> String {
    const PREFIX: &str = "fakecloud-subnet-";
    let mut name = String::with_capacity(PREFIX.len() + subnet_id.len());
    name.push_str(PREFIX);
    name.push_str(subnet_id);
    name
}
99
/// Snapshot of how the runtime isolates instance traffic.
///
/// Served by the `/_fakecloud/ec2/instance-networks` introspection endpoint
/// so a user asking "why can't X reach Y" can see which backend is in use,
/// which security-group mechanism it relies on, and whether that mechanism is
/// really active or has degraded to metadata-only.
#[derive(Debug, Clone)]
pub struct NetworkIsolationSummary {
    /// One of `docker`, `podman` or `kubernetes`.
    pub backend: &'static str,
    /// One of `nftables` (Docker host firewall), `networkpolicy` (k8s) or
    /// `disabled`.
    pub sg_enforcement: &'static str,
    /// `true` when security-group rules are actually applied. `false` means
    /// they are tracked but not applied (no `CAP_NET_ADMIN`, or a CNI that
    /// ignores NetworkPolicy); phase-2 L3 isolation still holds in that case.
    pub enforced: bool,
}
115
116/// What the runtime remembers per instance so it can drive the backing
117/// container's lifecycle and recreate it (k8s `Start`/`Reboot`).
118#[derive(Debug, Clone)]
119struct InstanceRecord {
120 /// Docker container id, or Pod name.
121 handle: String,
122 /// The owning account id, captured at `RunInstances`. Keys the durable
123 /// data volume (see [`data_volume_name`]) so `TerminateInstances` can
124 /// remove the right volume without re-consulting the control plane.
125 account_id: String,
126 /// Resolved base image, captured at `RunInstances` so a recreate is
127 /// identical even if `FAKECLOUD_EC2_DEFAULT_IMAGE` later changes.
128 image: String,
129 /// Base64 user-data to re-run on recreate, if any.
130 user_data: Option<String>,
131 /// The instance's tags, captured at `RunInstances`. Reserved
132 /// `fakecloud-k8s/*` entries drive per-instance Pod scheduling and must
133 /// survive a k8s `Start`/`Reboot` recreate, so they're stored here
134 /// rather than re-read from the control plane.
135 tags: BTreeMap<String, String>,
136 /// The instance's subnet placement, captured at `RunInstances` so a k8s
137 /// `Start`/`Reboot` recreate re-applies the same network and phase-5
138 /// introspection can report the backing network. `None` in metadata-only
139 /// network mode.
140 network: Option<InstanceNetwork>,
141}
142
143/// The selected backing-container backend.
144#[derive(Debug, Clone)]
145enum InstanceBackend {
146 Docker(DockerInstances),
147 K8s(k8s::K8sInstances),
148}
149
150/// Host firewall enforcement for security groups + NACLs (#1745 phase 3).
151///
152/// The network-driver abstraction the issue asks for: today there is one real
153/// driver (nftables) plus the degraded no-op, selected once at construction.
154/// Branching on podman vs docker isn't needed explicitly — rootless podman
155/// can't touch the host firewall, so the `nft list ruleset` capability probe
156/// already degrades it; rootful podman with netavark passes the same probe.
157#[derive(Debug, Clone)]
158pub struct FirewallEnforcer {
159 mode: EnforcementMode,
160}
161
162impl FirewallEnforcer {
163 /// Resolve the enforcement mode from `FAKECLOUD_EC2_SG_ENFORCEMENT` and an
164 /// `nft` capability probe, warning once when enforcement was requested but
165 /// can't be backed (so the operator knows it degraded, not silently).
166 fn detect() -> Self {
167 let requested = std::env::var("FAKECLOUD_EC2_SG_ENFORCEMENT").ok();
168 let mode = resolve_enforcement_mode(
169 requested.as_deref(),
170 firewall::host_shares_daemon_netns(),
171 firewall::nft_available,
172 );
173 if requested.is_some() && mode == EnforcementMode::Disabled {
174 tracing::warn!(
175 "EC2 security-group enforcement was requested but it can't take effect here \
176 (needs nftables + CAP_NET_ADMIN on a native-Linux host whose daemon shares this \
177 network namespace — Docker Desktop / podman-machine run the daemon in a VM); \
178 falling back to metadata-only (phase-2 L3 isolation stays active, security-group \
179 rules are tracked but not enforced)"
180 );
181 } else if mode == EnforcementMode::Nftables {
182 tracing::info!("EC2 security-group enforcement active via nftables");
183 }
184 Self { mode }
185 }
186
187 /// Disabled enforcer (k8s backend, or no container runtime).
188 fn disabled() -> Self {
189 Self {
190 mode: EnforcementMode::Disabled,
191 }
192 }
193
194 pub fn mode(&self) -> EnforcementMode {
195 self.mode
196 }
197
198 pub fn enabled(&self) -> bool {
199 self.mode != EnforcementMode::Disabled
200 }
201
202 /// Atomically swap in the rendered ruleset via `nft -f -`. No-op when
203 /// disabled. Best-effort: a failed apply logs and leaves the previous
204 /// ruleset in place rather than erroring the originating API call.
205 async fn reconcile(&self, subnets: &[SubnetFirewall]) {
206 if self.mode == EnforcementMode::Disabled {
207 return;
208 }
209 // Instances in the same subnet share one Linux bridge; their traffic is
210 // L2-switched and only traverses the `forward` chain (where our SG rules
211 // live) when bridge netfilter is enabled. Without this, same-subnet SG
212 // rules silently filter nothing — exactly what the real-packet E2E
213 // caught. Needs CAP_NET_ADMIN (which the enforcer holds) and the
214 // `modprobe`/`sysctl` binaries (shipped via kmod/procps in the image).
215 // Warn rather than swallow the error: a missing binary or a failed call
216 // means enforcement degrades to filtering nothing, and the operator who
217 // opted in deserves to know (bug-audit 2026-06-20, 0.B1).
218 match tokio::process::Command::new("modprobe")
219 .arg("br_netfilter")
220 .output()
221 .await
222 {
223 Ok(o) if o.status.success() => {}
224 Ok(o) => tracing::warn!(
225 stderr = %String::from_utf8_lossy(&o.stderr).trim(),
226 "modprobe br_netfilter failed; same-subnet security-group enforcement may filter nothing"
227 ),
228 Err(e) => tracing::warn!(
229 error = %e,
230 "could not run modprobe (is kmod installed?); same-subnet security-group enforcement may filter nothing"
231 ),
232 }
233 match tokio::process::Command::new("sysctl")
234 .args(["-w", "net.bridge.bridge-nf-call-iptables=1"])
235 .output()
236 .await
237 {
238 Ok(o) if o.status.success() => {}
239 Ok(o) => tracing::warn!(
240 stderr = %String::from_utf8_lossy(&o.stderr).trim(),
241 "sysctl bridge-nf-call-iptables=1 failed; same-subnet security-group enforcement may filter nothing"
242 ),
243 Err(e) => tracing::warn!(
244 error = %e,
245 "could not run sysctl (is procps installed?); same-subnet security-group enforcement may filter nothing"
246 ),
247 }
248 use tokio::io::AsyncWriteExt;
249 // Load a rendered ruleset via `nft -f -`. `required=false` marks the
250 // best-effort same-subnet bridge table: a kernel without
251 // `nf_conntrack_bridge` rejects its `ct state` line, and since the inet
252 // table is applied independently first, that rejection is logged at
253 // debug (degraded same-subnet enforcement) rather than warn.
254 async fn load_nft(label: &str, ruleset: &str, subnets: usize, required: bool) {
255 let mut child = match tokio::process::Command::new("nft")
256 .args(["-f", "-"])
257 .stdin(std::process::Stdio::piped())
258 .stdout(std::process::Stdio::null())
259 .stderr(std::process::Stdio::piped())
260 .spawn()
261 {
262 Ok(c) => c,
263 Err(e) => {
264 tracing::warn!(error = %e, table = label, "failed to spawn nft; security-group ruleset not applied");
265 return;
266 }
267 };
268 if let Some(mut stdin) = child.stdin.take() {
269 let _ = stdin.write_all(ruleset.as_bytes()).await;
270 let _ = stdin.shutdown().await;
271 }
272 match child.wait_with_output().await {
273 Ok(out) if out.status.success() => {
274 tracing::debug!(
275 subnets,
276 table = label,
277 "applied EC2 security-group nft ruleset"
278 );
279 }
280 Ok(out) => {
281 let stderr = String::from_utf8_lossy(&out.stderr);
282 let stderr = stderr.trim();
283 if required {
284 tracing::warn!(table = label, stderr = %stderr, "nft rejected the security-group ruleset; leaving the previous ruleset in place");
285 } else {
286 tracing::debug!(table = label, stderr = %stderr, "bridge-family SG ruleset not applied (kernel may lack nf_conntrack_bridge); same-subnet enforcement degraded to inet table only");
287 }
288 }
289 Err(e) => tracing::warn!(error = %e, table = label, "nft apply failed"),
290 }
291 }
292 let n = subnets.len();
293 // inet: cross-subnet routed enforcement (required). bridge: same-subnet
294 // L2 enforcement that the inet forward hook misses for bridged frames.
295 load_nft("inet fakecloud_ec2", &render_ruleset(subnets), n, true).await;
296 load_nft(
297 "bridge fakecloud_ec2_l2",
298 &render_bridge_ruleset(subnets),
299 n,
300 false,
301 )
302 .await;
303 }
304}
305
306#[derive(Debug, Clone)]
307pub struct Ec2Runtime {
308 backend: InstanceBackend,
309 /// Per-instance backing records, keyed by EC2 instance id, so the
310 /// lifecycle operations and reset/shutdown teardown work without
311 /// consulting service state.
312 instances: Arc<RwLock<HashMap<String, InstanceRecord>>>,
313 /// Host firewall enforcer for security groups + NACLs.
314 firewall: FirewallEnforcer,
315 /// Serializes firewall reconciles. Reconcile is fired from many concurrent
316 /// background tasks (per SG/NACL/lifecycle event); without this, two
317 /// reconciles built from divergent state could interleave so the k8s
318 /// apply+prune of one deletes a policy the other just applied (bug-hunt
319 /// 2026-06-18 finding 4.3). Holding it across the whole reconcile makes the
320 /// last-started reconcile the last-applied for both backends.
321 reconcile_lock: Arc<tokio::sync::Mutex<()>>,
322}
323
324impl Ec2Runtime {
325 /// Construct the Docker/Podman backend. Returns `None` when no container
326 /// CLI is available — callers then run in metadata-only mode.
327 pub fn new() -> Option<Self> {
328 let cli = fakecloud_core::container_net::detect_container_cli()?;
329 Some(Self {
330 backend: InstanceBackend::Docker(DockerInstances {
331 cli,
332 instance_id: format!("fakecloud-{}", std::process::id()),
333 }),
334 instances: Arc::new(RwLock::new(HashMap::new())),
335 firewall: FirewallEnforcer::detect(),
336 reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
337 })
338 }
339
340 /// Construct the Kubernetes backend. `server_port` is fakecloud's bound
341 /// port (used when `FAKECLOUD_K8S_SELF_URL` omits one). Fails fast on
342 /// misconfiguration — never silently degrades to Docker.
343 pub async fn new_k8s(server_port: u16) -> Result<Self, BackendInitError> {
344 let backend = k8s::K8sInstances::from_env(server_port).await?;
345 Ok(Self {
346 backend: InstanceBackend::K8s(backend),
347 instances: Arc::new(RwLock::new(HashMap::new())),
348 // k8s isolation is a NetworkPolicy concern (phase 4), not host nft.
349 firewall: FirewallEnforcer::disabled(),
350 reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
351 })
352 }
353
354 /// The firewall enforcer, so the control plane can skip building the model
355 /// when enforcement is disabled and report the mode for introspection.
356 pub fn firewall(&self) -> &FirewallEnforcer {
357 &self.firewall
358 }
359
360 /// Re-render and atomically apply the security-group/NACL ruleset for the
361 /// given per-subnet model. No-op (cheap) when enforcement is disabled.
362 /// Serialized against other reconciles (finding 4.3).
363 pub async fn reconcile_firewall(&self, subnets: Vec<SubnetFirewall>) {
364 let _guard = self.reconcile_lock.lock().await;
365 self.firewall.reconcile(&subnets).await;
366 }
367
368 /// Whether this runtime backs network isolation with real enforcement —
369 /// host nftables (Docker, opt-in) or k8s NetworkPolicy. Lets the control
370 /// plane skip building the firewall model entirely when neither applies.
371 pub fn network_isolation_enforced(&self) -> bool {
372 self.firewall.enabled() || self.is_k8s()
373 }
374
375 /// True for the Kubernetes backend (isolation via NetworkPolicy).
376 pub fn is_k8s(&self) -> bool {
377 matches!(self.backend, InstanceBackend::K8s(_))
378 }
379
380 /// Apply one NetworkPolicy per instance for the k8s backend. No-op on the
381 /// Docker backend (which uses nftables instead). Serialized against other
382 /// reconciles so a concurrent apply+prune can't delete a just-applied
383 /// policy (finding 4.3).
384 pub async fn reconcile_network_policies(&self, rules: Vec<InstanceRules>) {
385 if let InstanceBackend::K8s(k) = &self.backend {
386 let _guard = self.reconcile_lock.lock().await;
387 k.reconcile_network_policies(&rules).await;
388 }
389 }
390
391 /// A snapshot of how this runtime isolates instance traffic, for the
392 /// `/_fakecloud/ec2/instance-networks` introspection endpoint (#1745 ph5).
393 pub fn network_isolation_summary(&self) -> NetworkIsolationSummary {
394 match &self.backend {
395 InstanceBackend::Docker(d) => NetworkIsolationSummary {
396 backend: if fakecloud_core::container_net::is_podman_binary(&d.cli) {
397 "podman"
398 } else {
399 "docker"
400 },
401 sg_enforcement: match self.firewall.mode() {
402 EnforcementMode::Nftables => "nftables",
403 EnforcementMode::Disabled => "disabled",
404 },
405 enforced: self.firewall.enabled(),
406 },
407 InstanceBackend::K8s(k) => NetworkIsolationSummary {
408 backend: "kubernetes",
409 sg_enforcement: "networkpolicy",
410 // NetworkPolicies are always created; "enforced" reflects
411 // whether the detected CNI actually applies them.
412 enforced: k.cni_enforces(),
413 },
414 }
415 }
416
417 /// Name of the active backend, for logging.
418 pub fn cli_name(&self) -> &str {
419 match &self.backend {
420 InstanceBackend::Docker(d) => &d.cli,
421 InstanceBackend::K8s(_) => "kubernetes",
422 }
423 }
424
425 /// Boot a container for an instance. `user_data` is the base64-encoded
426 /// user-data as received on the wire (RunInstances `UserData`), run at
427 /// boot the way cloud-init would, if present.
428 pub async fn run_instance(
429 &self,
430 account_id: &str,
431 instance_id: &str,
432 user_data: Option<&str>,
433 tags: &BTreeMap<String, String>,
434 network: Option<&InstanceNetwork>,
435 ) -> Result<RunningInstance, RuntimeError> {
436 let image = default_image();
437 let running = match &self.backend {
438 // Docker attaches the container to the subnet's per-VPC bridge for
439 // L3 isolation. k8s pods share a flat network; isolation there is a
440 // NetworkPolicy concern handled separately (#1745 phase 4).
441 InstanceBackend::Docker(d) => {
442 d.run_instance(account_id, instance_id, &image, user_data, network)
443 .await?
444 }
445 InstanceBackend::K8s(k) => k.spawn_pod(instance_id, &image, user_data, tags).await?,
446 };
447 self.instances.write().insert(
448 instance_id.to_string(),
449 InstanceRecord {
450 handle: running.container_id.clone(),
451 account_id: account_id.to_string(),
452 image,
453 user_data: user_data.map(str::to_string),
454 tags: tags.clone(),
455 network: network.cloned(),
456 },
457 );
458 Ok(running)
459 }
460
461 /// Stop an instance's backing container (maps to `StopInstances`).
462 /// Docker stops the container in place; k8s deletes the Pod (recreated
463 /// on the next `Start`).
464 pub async fn stop_instance(&self, instance_id: &str) {
465 let Some(handle) = self.handle_of(instance_id) else {
466 return;
467 };
468 match &self.backend {
469 InstanceBackend::Docker(d) => d.stop(&handle).await,
470 InstanceBackend::K8s(k) => k.delete_pod(&handle).await,
471 }
472 }
473
474 /// Start a previously-stopped instance (maps to `StartInstances`).
475 /// Returns the running container's (possibly new) handle and private IP.
476 /// Docker starts the existing container; k8s recreates the Pod under a new
477 /// unique name, so the handle changes — callers should persist it.
478 pub async fn start_instance(&self, instance_id: &str) -> Option<RunningInstance> {
479 let record = self.instances.read().get(instance_id)?.clone();
480 match &self.backend {
481 InstanceBackend::Docker(d) => {
482 // Same container; only the IP may change. The subnet network the
483 // container was created on persists across stop/start.
484 let private_ip = d.start(&record.handle).await?;
485 Some(RunningInstance {
486 container_id: record.handle,
487 private_ip,
488 network: record
489 .network
490 .as_ref()
491 .map(|n| subnet_network_name(&n.subnet_id)),
492 })
493 }
494 InstanceBackend::K8s(k) => {
495 let running = k
496 .spawn_pod(
497 instance_id,
498 &record.image,
499 record.user_data.as_deref(),
500 &record.tags,
501 )
502 .await
503 .ok()?;
504 self.update_handle(instance_id, &running.container_id);
505 Some(running)
506 }
507 }
508 }
509
510 /// Restart an instance's backing container (maps to `RebootInstances`).
511 /// Docker restarts in place; k8s deletes and recreates the Pod under a new
512 /// name. Returns the running container's handle + IP when it changed (k8s),
513 /// so callers can persist the new handle; `None` when nothing to update.
514 pub async fn reboot_instance(&self, instance_id: &str) -> Option<RunningInstance> {
515 let record = self.instances.read().get(instance_id).cloned()?;
516 match &self.backend {
517 InstanceBackend::Docker(d) => {
518 d.reboot(&record.handle).await;
519 None
520 }
521 InstanceBackend::K8s(k) => {
522 k.delete_pod(&record.handle).await;
523 let running = k
524 .spawn_pod(
525 instance_id,
526 &record.image,
527 record.user_data.as_deref(),
528 &record.tags,
529 )
530 .await
531 .ok()?;
532 self.update_handle(instance_id, &running.container_id);
533 Some(running)
534 }
535 }
536 }
537
538 /// Remove an instance's backing container (maps to `TerminateInstances`).
539 pub async fn terminate_instance(&self, instance_id: &str) {
540 let record = self.instances.write().remove(instance_id);
541 if let Some(record) = record {
542 match &self.backend {
543 InstanceBackend::Docker(d) => {
544 d.remove(&record.handle).await;
545 // Drop the durable root-disk volume so a later instance
546 // reusing this id starts clean (terminate = volume gone,
547 // matching a deleted EBS root volume). No-op when volumes
548 // are disabled.
549 d.remove_data_volume(&record.account_id, instance_id).await;
550 }
551 InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
552 }
553 }
554 }
555
556 /// Tear down every container this runtime spawned (used on reset and
557 /// shutdown). The Docker backend leans on the shared reaper for any
558 /// container it loses track of.
559 pub async fn stop_all(&self) {
560 let records: Vec<InstanceRecord> = {
561 let mut instances = self.instances.write();
562 instances.drain().map(|(_, r)| r).collect()
563 };
564 for record in records {
565 match &self.backend {
566 InstanceBackend::Docker(d) => d.remove(&record.handle).await,
567 InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
568 }
569 }
570 }
571
572 /// Sweep instance Pods orphaned by a previous fakecloud process (k8s
573 /// only; the Docker backend relies on the shared reaper).
574 pub async fn reap_stale(&self) {
575 if let InstanceBackend::K8s(k) = &self.backend {
576 k.reap_stale().await;
577 }
578 }
579
580 /// The backing container's console log — its combined stdout/stderr, which
581 /// includes anything user-data printed at boot (maps to `GetConsoleOutput`).
582 /// `None` for an unbacked instance or when logs can't be read.
583 pub async fn console_output(&self, instance_id: &str) -> Option<Vec<u8>> {
584 let handle = self.handle_of(instance_id)?;
585 match &self.backend {
586 InstanceBackend::Docker(d) => d.logs(&handle).await,
587 InstanceBackend::K8s(k) => k.logs(&handle).await,
588 }
589 }
590
591 fn handle_of(&self, instance_id: &str) -> Option<String> {
592 self.instances
593 .read()
594 .get(instance_id)
595 .map(|r| r.handle.clone())
596 }
597
598 fn update_handle(&self, instance_id: &str, handle: &str) {
599 if let Some(record) = self.instances.write().get_mut(instance_id) {
600 record.handle = handle.to_string();
601 }
602 }
603}
604
605fn default_image() -> String {
606 std::env::var(DEFAULT_IMAGE_ENV).unwrap_or_else(|_| DEFAULT_IMAGE.to_string())
607}
608
/// The in-instance directory backed by the durable data volume.
///
/// Defaults to `/var/lib/fakecloud/ec2`; override it with
/// `FAKECLOUD_EC2_INSTANCE_DATA_DIR` to capture whichever path the instance's
/// workload writes its long-lived state to. This is fakecloud's
/// persistent-instance-data convention rather than a full root-filesystem
/// snapshot: data written here survives restart and stop/start; the rest of
/// the container's ephemeral filesystem does not.
///
/// Surrounding whitespace is trimmed, and an unset, non-Unicode, empty or
/// whitespace-only value falls back to the default. A blank override would
/// otherwise produce a `-v <volume>:` mount with no target.
fn instance_data_dir() -> String {
    std::env::var("FAKECLOUD_EC2_INSTANCE_DATA_DIR")
        .ok()
        .map(|value| value.trim().to_string())
        .filter(|value| !value.is_empty())
        .unwrap_or_else(|| "/var/lib/fakecloud/ec2".to_string())
}
619
/// Whether EC2 instance data should survive a fakecloud restart via a durable
/// named volume.
///
/// OFF by default (ephemeral, keeping test/CI runs clean and avoiding a stale
/// volume bleeding into a later instance that reuses an id); opt in with
/// `FAKECLOUD_PERSIST_EC2_VOLUMES=1`. A follow-up flips this default to on in
/// persistent storage mode by exporting the same env var.
///
/// Accepted truthy values are `1`, `true` and `yes`, compared without regard
/// to ASCII case and ignoring surrounding whitespace. Any other value, or an
/// unset or non-Unicode variable, means disabled.
fn ec2_volumes_enabled() -> bool {
    const TRUTHY: [&str; 3] = ["1", "true", "yes"];
    std::env::var("FAKECLOUD_PERSIST_EC2_VOLUMES")
        .map(|value| {
            let value = value.trim();
            TRUTHY.iter().any(|t| value.eq_ignore_ascii_case(t))
        })
        .unwrap_or(false)
}
630
631/// Deterministic Docker volume name for an instance's data dir. Keyed only on
632/// account + instance id (NOT the per-process fakecloud instance id) so the
633/// same volume reattaches after a fakecloud restart recreates the container.
634/// Characters outside Docker's `[a-zA-Z0-9_.-]` volume-name set become `-`.
635fn data_volume_name(account_id: &str, instance_id: &str) -> String {
636 let sanitize = |s: &str| -> String {
637 s.chars()
638 .map(|c| {
639 if c.is_ascii_alphanumeric() || c == '_' || c == '.' || c == '-' {
640 c
641 } else {
642 '-'
643 }
644 })
645 .collect()
646 };
647 format!(
648 "fakecloud-ec2-data-{}-{}",
649 sanitize(account_id),
650 sanitize(instance_id)
651 )
652}
653
/// Keep-alive command + user-data wrapper for a base image. Shared by both
/// backends so they boot identical containers.
///
/// With no user-data (or an empty string) the command is simply
/// `tail -f /dev/null`. Otherwise `user_data` (base64) is decoded and run as
/// a shell script, backgrounded so a slow script never blocks readiness, and
/// then the container tails forever.
///
/// The user-data is embedded in the script as a single-quoted shell word.
/// Any `'` in it is escaped as `'\''`, so a malformed (non-base64) value
/// cannot break out of the quoting and prevent the keep-alive `exec tail`
/// from running. Valid base64 never contains `'`, so the script for
/// well-formed input is unchanged.
fn boot_command(user_data: Option<&str>) -> Vec<String> {
    match user_data.filter(|s| !s.is_empty()) {
        Some(b64) => {
            // POSIX single-quote escaping: close the quote, emit an escaped
            // quote, reopen the quote.
            let quoted = b64.replace('\'', r"'\''");
            let script =
                format!("printf %s '{quoted}' | base64 -d | sh & exec tail -f /dev/null");
            vec!["sh".to_string(), "-c".to_string(), script]
        }
        None => vec![
            "tail".to_string(),
            "-f".to_string(),
            "/dev/null".to_string(),
        ],
    }
}
671
/// The Docker/Podman backend. Every operation shells out to the container
/// CLI.
#[derive(Debug, Clone)]
struct DockerInstances {
    /// The container CLI program to invoke.
    cli: String,
    /// Value of the `fakecloud-instance` ownership label put on everything
    /// this backend creates (containers and subnet networks).
    instance_id: String,
}
678
679impl DockerInstances {
680 async fn run_instance(
681 &self,
682 account_id: &str,
683 instance_id: &str,
684 image: &str,
685 user_data: Option<&str>,
686 network: Option<&InstanceNetwork>,
687 ) -> Result<RunningInstance, RuntimeError> {
688 // Ensure the subnet's bridge exists and attach to it for L3 isolation.
689 // Network creation is best-effort: on failure we fall back to the
690 // default bridge so the instance still boots (no regression vs today).
691 let attached_network = match network {
692 Some(net) => self.ensure_subnet_network(net).await,
693 None => None,
694 };
695
696 let mut args: Vec<String> = vec![
697 "run".to_string(),
698 "-d".to_string(),
699 "--label".to_string(),
700 format!("fakecloud-ec2={instance_id}"),
701 "--label".to_string(),
702 format!("fakecloud-instance={}", self.instance_id),
703 ];
704 // Optionally back the instance's writable data directory with a durable
705 // named volume keyed on account + instance id, so the filesystem state
706 // an instance writes there survives a fakecloud restart (the recovery
707 // path recreates the container, which reattaches the same volume) and a
708 // stop/start (Docker reuses the same container, so the volume persists
709 // regardless). OFF by default to keep test/CI runs ephemeral and avoid
710 // a stale volume bleeding into a later instance that reuses an id;
711 // enabled by `FAKECLOUD_PERSIST_EC2_VOLUMES=1` (and, in a follow-up,
712 // default-on in persistent storage mode). The volume is dropped on
713 // TerminateInstances. See [`data_volume_name`] / [`instance_data_dir`].
714 if ec2_volumes_enabled() {
715 args.push("-v".to_string());
716 args.push(format!(
717 "{}:{}",
718 data_volume_name(account_id, instance_id),
719 instance_data_dir()
720 ));
721 }
722 if let Some(name) = &attached_network {
723 args.push("--network".to_string());
724 args.push(name.clone());
725 }
726 args.push(image.to_string());
727 args.extend(boot_command(user_data));
728
729 let output = tokio::process::Command::new(&self.cli)
730 .args(&args)
731 .output()
732 .await
733 .map_err(|e| RuntimeError::ContainerStartFailed(e.to_string()))?;
734
735 if !output.status.success() {
736 return Err(RuntimeError::ContainerStartFailed(
737 String::from_utf8_lossy(&output.stderr).trim().to_string(),
738 ));
739 }
740
741 let container_id = String::from_utf8_lossy(&output.stdout).trim().to_string();
742 let private_ip = self
743 .inspect_ip(&container_id)
744 .await
745 .unwrap_or_else(|| "10.0.0.1".to_string());
746
747 Ok(RunningInstance {
748 container_id,
749 private_ip,
750 network: attached_network,
751 })
752 }
753
754 /// Create (idempotently) the daemon network backing a subnet and return its
755 /// name, or `None` if creation failed (caller falls back to the default
756 /// bridge). The network carries the shared `fakecloud-instance` ownership
757 /// label so the startup reaper prunes it after an ungraceful restart, plus
758 /// a `fakecloud-subnet=<id>` label for introspection. Private subnets get
759 /// an `--internal` network (no NAT to the host/internet).
760 async fn ensure_subnet_network(&self, net: &InstanceNetwork) -> Option<String> {
761 let name = subnet_network_name(&net.subnet_id);
762 let mut args = vec!["network".to_string(), "create".to_string()];
763 if net.internal {
764 args.push("--internal".to_string());
765 }
766 args.push("--label".to_string());
767 args.push(format!("fakecloud-subnet={}", net.subnet_id));
768 args.push("--label".to_string());
769 args.push(format!("fakecloud-instance={}", self.instance_id));
770 args.push(name.clone());
771
772 let output = tokio::process::Command::new(&self.cli)
773 .args(&args)
774 .output()
775 .await;
776 match output {
777 // Created fresh.
778 Ok(out) if out.status.success() => Some(name),
779 // Already exists (another instance in the same subnet created it):
780 // a benign race — the network is there, so attach to it.
781 Ok(out) => {
782 let err = String::from_utf8_lossy(&out.stderr);
783 if err.contains("already exists") || err.contains("exists") {
784 Some(name)
785 } else {
786 tracing::warn!(
787 subnet = %net.subnet_id,
788 network = %name,
789 error = %err.trim(),
790 "subnet network creation failed; falling back to default bridge"
791 );
792 None
793 }
794 }
795 Err(e) => {
796 tracing::warn!(
797 subnet = %net.subnet_id,
798 network = %name,
799 error = %e,
800 "subnet network creation failed; falling back to default bridge"
801 );
802 None
803 }
804 }
805 }
806
807 /// Read the container's private IP from `inspect`. Returns `None` if the
808 /// container has no address (e.g. host networking) — the caller falls
809 /// back to a synthesized IP.
810 async fn inspect_ip(&self, container_id: &str) -> Option<String> {
811 let output = tokio::process::Command::new(&self.cli)
812 .args([
813 "inspect",
814 "-f",
815 "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
816 container_id,
817 ])
818 .output()
819 .await
820 .ok()?;
821 if !output.status.success() {
822 return None;
823 }
824 let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
825 if ip.is_empty() {
826 None
827 } else {
828 Some(ip)
829 }
830 }
831
832 async fn stop(&self, container_id: &str) {
833 let _ = tokio::process::Command::new(&self.cli)
834 .args(["stop", container_id])
835 .output()
836 .await;
837 }
838
839 async fn start(&self, container_id: &str) -> Option<String> {
840 let started = tokio::process::Command::new(&self.cli)
841 .args(["start", container_id])
842 .output()
843 .await
844 .map(|o| o.status.success())
845 .unwrap_or(false);
846 if !started {
847 return None;
848 }
849 self.inspect_ip(container_id).await
850 }
851
852 async fn reboot(&self, container_id: &str) {
853 let _ = tokio::process::Command::new(&self.cli)
854 .args(["restart", container_id])
855 .output()
856 .await;
857 }
858
859 async fn remove(&self, container_id: &str) {
860 let _ = tokio::process::Command::new(&self.cli)
861 .args(["rm", "-f", container_id])
862 .output()
863 .await;
864 }
865
866 /// Remove the durable data volume for an instance (called on
867 /// `TerminateInstances`) so a later instance reusing the same id starts
868 /// clean rather than inheriting the terminated instance's filesystem. A
869 /// no-op when volume persistence is disabled (no such volume was created).
870 async fn remove_data_volume(&self, account_id: &str, instance_id: &str) {
871 if !ec2_volumes_enabled() {
872 return;
873 }
874 let _ = tokio::process::Command::new(&self.cli)
875 .args([
876 "volume",
877 "rm",
878 "-f",
879 &data_volume_name(account_id, instance_id),
880 ])
881 .output()
882 .await;
883 }
884
885 /// The container's combined stdout+stderr (`docker logs`). `None` if the
886 /// command fails; an empty log is `Some(vec![])`.
887 async fn logs(&self, container_id: &str) -> Option<Vec<u8>> {
888 let output = tokio::process::Command::new(&self.cli)
889 .args(["logs", container_id])
890 .output()
891 .await
892 .ok()?;
893 if !output.status.success() {
894 return None;
895 }
896 // `docker logs` writes the container's stdout to ours and its stderr to
897 // ours; concatenate so the console output carries both streams.
898 let mut buf = output.stdout;
899 buf.extend_from_slice(&output.stderr);
900 Some(buf)
901 }
902}
903
#[cfg(test)]
mod volume_tests {
    use super::*;

    /// The volume name depends only on account + instance id (not on the
    /// per-process instance id), and characters outside Docker's volume-name
    /// set are replaced by '-'.
    #[test]
    fn data_volume_name_is_stable_and_sanitized() {
        let cases = [
            (
                ("123456789012", "i-0abc123"),
                "fakecloud-ec2-data-123456789012-i-0abc123",
            ),
            (
                ("1234/5678", "i-0abc:1"),
                "fakecloud-ec2-data-1234-5678-i-0abc-1",
            ),
        ];
        for ((account, instance), expected) in cases {
            assert_eq!(data_volume_name(account, instance), expected);
        }
    }

    /// Two instances of one account must never map to the same data volume;
    /// otherwise terminating one could wipe the other's filesystem.
    #[test]
    fn distinct_instances_get_distinct_volumes() {
        let first = data_volume_name("123456789012", "i-aaaa");
        let second = data_volume_name("123456789012", "i-bbbb");
        assert_ne!(first, second);
    }
}