fakecloud_ec2/runtime/mod.rs
1//! Backing-container runtime for EC2 instances.
2//!
3//! `RunInstances` spins a real container per instance; the instance
4//! lifecycle (`Start`/`Stop`/`Reboot`/`Terminate`) maps onto the container
5//! lifecycle, and `DescribeInstances` reports the container's real private
6//! IP. The container can run either as a local Docker/Podman container (the
7//! default) or as a native Kubernetes Pod (`FAKECLOUD_EC2_BACKEND=k8s` or the
8//! global `FAKECLOUD_CONTAINER_BACKEND=k8s`).
9//!
10//! Operations are keyed by **instance id**, not the backend handle: a
11//! Kubernetes Pod can't be stopped and restarted in place, so `Stop` deletes
12//! the Pod and `Start`/`Reboot` recreate it. The runtime therefore keeps,
13//! per instance, the handle plus enough of the original request (image,
14//! user-data) to recreate the backing container deterministically.
15//!
16//! The runtime is strictly additive: when no container backend is available
17//! the control plane keeps its metadata-faithful behaviour (synthesized IPs,
18//! state transitions) so every API call still succeeds. Real container
19//! backing is best-effort fidelity layered on top.
20
21pub mod firewall;
22mod k8s;
23pub mod netpolicy;
24
25use std::collections::{BTreeMap, HashMap};
26use std::sync::Arc;
27
28use parking_lot::RwLock;
29
30use firewall::{
31 render_ruleset, resolve_enforcement_mode, EnforcementMode, InstanceRules, SubnetFirewall,
32};
33
/// Environment variable that overrides the base image an instance's container
/// runs (e.g. to a lighter image in CI). Read by `default_image`, which falls
/// back to [`DEFAULT_IMAGE`] when the variable cannot be read.
const DEFAULT_IMAGE_ENV: &str = "FAKECLOUD_EC2_DEFAULT_IMAGE";
/// Default base image an instance's container runs. AMIs don't map to a
/// concrete OS image, so we boot a real Amazon Linux container by default.
/// The container is kept alive with `tail -f /dev/null` — EC2 instances are
/// long-running hosts, not one-shot tasks. `tail` is used rather than
/// `sleep infinity` so any base image works (busybox `sleep` rejects
/// `infinity`).
const DEFAULT_IMAGE: &str = "amazonlinux:2023";
43
/// Errors surfaced while booting an instance's backing container.
#[derive(Debug, thiserror::Error)]
pub enum RuntimeError {
    /// The backing container could not be started. On the Docker backend the
    /// payload is either the spawn error or the CLI's trimmed stderr.
    #[error("container failed to start: {0}")]
    ContainerStartFailed(String),
}
49
/// Error initializing the Kubernetes backend at startup. Surfaced to the
/// operator so a misconfigured cluster fails fast rather than silently
/// falling back to Docker.
#[derive(Debug, thiserror::Error)]
pub enum BackendInitError {
    /// Wraps a `fakecloud_k8s::K8sEnvError`; displayed transparently.
    #[error(transparent)]
    Env(#[from] fakecloud_k8s::K8sEnvError),
    /// Wraps a `fakecloud_k8s::K8sPodConfigError`; displayed transparently.
    #[error(transparent)]
    PodConfig(#[from] fakecloud_k8s::K8sPodConfigError),
    /// The cluster could not be reached; the payload is the failure text.
    #[error("failed to connect to the Kubernetes cluster: {0}")]
    Connect(String),
}
62
/// A running instance's backing container, as reported back to the control
/// plane after a launch, start, or (k8s) reboot.
#[derive(Debug, Clone)]
pub struct RunningInstance {
    /// Backend-specific handle: a Docker container id, or a Pod name.
    pub container_id: String,
    /// The instance's private IP — the container's address on the daemon
    /// network (Docker) or the Pod IP (k8s).
    pub private_ip: String,
    /// Name of the backing daemon network the container was attached to
    /// (`fakecloud-subnet-<id>`), or `None` when it ran on the default bridge
    /// (no network spec, or creation failed and we fell back). Surfaced for
    /// introspection (#1745 phase 5).
    pub network: Option<String>,
}
77
/// The L3 placement of an instance's backing container: which subnet it lands
/// in and whether that subnet is private.
///
/// Per-subnet networks give the isolation #1745 wants for free: two instances
/// in the same subnet share a bridge and can talk; instances in different
/// subnets / VPCs land on different bridges and cannot route to each other.
#[derive(Debug, Clone)]
pub struct InstanceNetwork {
    /// The EC2 subnet id the instance launched into. Also determines the
    /// backing network's name (see [`subnet_network_name`]).
    pub subnet_id: String,
    /// True when the subnet has no `0.0.0.0/0 -> igw` route (private): the
    /// backing network is created `--internal` (no NAT to host/internet).
    pub internal: bool,
}
92
/// Name of the daemon network that backs an EC2 subnet: the fixed
/// `fakecloud-subnet-` prefix followed by the subnet id. The name depends only
/// on the subnet id, so every instance in a subnet resolves to the same
/// network.
pub fn subnet_network_name(subnet_id: &str) -> String {
    const PREFIX: &str = "fakecloud-subnet-";
    let mut name = String::with_capacity(PREFIX.len() + subnet_id.len());
    name.push_str(PREFIX);
    name.push_str(subnet_id);
    name
}
98
/// How this runtime isolates instance traffic, surfaced by the
/// `/_fakecloud/ec2/instance-networks` introspection endpoint so users can
/// answer "why can't X reach Y" — which backend, which SG-enforcement
/// mechanism, and whether it's actually active vs degraded to metadata-only.
///
/// Built by `Ec2Runtime::network_isolation_summary`.
#[derive(Debug, Clone)]
pub struct NetworkIsolationSummary {
    /// `docker` | `podman` | `kubernetes`.
    pub backend: &'static str,
    /// `nftables` (Docker host firewall) | `networkpolicy` (k8s) | `disabled`.
    pub sg_enforcement: &'static str,
    /// Whether security-group rules are actually enforced. False means rules
    /// are tracked but not applied (no `CAP_NET_ADMIN`, or a CNI that ignores
    /// NetworkPolicy) — phase-2 L3 isolation still holds.
    pub enforced: bool,
}
114
/// What the runtime remembers per instance so it can drive the backing
/// container's lifecycle and recreate it (k8s `Start`/`Reboot`).
///
/// Records are inserted by `Ec2Runtime::run_instance`, removed by
/// `terminate_instance`, and drained by `stop_all`.
#[derive(Debug, Clone)]
struct InstanceRecord {
    /// Docker container id, or Pod name. Rewritten when the k8s backend
    /// recreates the Pod under a new name.
    handle: String,
    /// The owning account id, captured at `RunInstances`. Keys the durable
    /// data volume (see [`data_volume_name`]) so `TerminateInstances` can
    /// remove the right volume without re-consulting the control plane.
    account_id: String,
    /// Resolved base image, captured at `RunInstances` so a recreate is
    /// identical even if `FAKECLOUD_EC2_DEFAULT_IMAGE` later changes.
    image: String,
    /// Base64 user-data to re-run on recreate, if any.
    user_data: Option<String>,
    /// The instance's tags, captured at `RunInstances`. Reserved
    /// `fakecloud-k8s/*` entries drive per-instance Pod scheduling and must
    /// survive a k8s `Start`/`Reboot` recreate, so they're stored here
    /// rather than re-read from the control plane.
    tags: BTreeMap<String, String>,
    /// The instance's subnet placement, captured at `RunInstances` so a k8s
    /// `Start`/`Reboot` recreate re-applies the same network and phase-5
    /// introspection can report the backing network. `None` in metadata-only
    /// network mode.
    network: Option<InstanceNetwork>,
}
141
/// The selected backing-container backend. Chosen once when the runtime is
/// constructed (`Ec2Runtime::new` / `Ec2Runtime::new_k8s`).
#[derive(Debug, Clone)]
enum InstanceBackend {
    /// Local Docker/Podman containers, driven through the container CLI.
    Docker(DockerInstances),
    /// Kubernetes Pods, driven through the `k8s` submodule.
    K8s(k8s::K8sInstances),
}
148
/// Host firewall enforcement for security groups + NACLs (#1745 phase 3).
///
/// The network-driver abstraction the issue asks for: today there is one real
/// driver (nftables) plus the degraded no-op, selected once at construction.
/// Branching on podman vs docker isn't needed explicitly — rootless podman
/// can't touch the host firewall, so the `nft list ruleset` capability probe
/// already degrades it; rootful podman with netavark passes the same probe.
#[derive(Debug, Clone)]
pub struct FirewallEnforcer {
    /// The enforcement mechanism resolved at construction; never changes
    /// afterwards.
    mode: EnforcementMode,
}
160
161impl FirewallEnforcer {
162 /// Resolve the enforcement mode from `FAKECLOUD_EC2_SG_ENFORCEMENT` and an
163 /// `nft` capability probe, warning once when enforcement was requested but
164 /// can't be backed (so the operator knows it degraded, not silently).
165 fn detect() -> Self {
166 let requested = std::env::var("FAKECLOUD_EC2_SG_ENFORCEMENT").ok();
167 let mode = resolve_enforcement_mode(
168 requested.as_deref(),
169 firewall::host_shares_daemon_netns(),
170 firewall::nft_available,
171 );
172 if requested.is_some() && mode == EnforcementMode::Disabled {
173 tracing::warn!(
174 "EC2 security-group enforcement was requested but it can't take effect here \
175 (needs nftables + CAP_NET_ADMIN on a native-Linux host whose daemon shares this \
176 network namespace — Docker Desktop / podman-machine run the daemon in a VM); \
177 falling back to metadata-only (phase-2 L3 isolation stays active, security-group \
178 rules are tracked but not enforced)"
179 );
180 } else if mode == EnforcementMode::Nftables {
181 tracing::info!("EC2 security-group enforcement active via nftables");
182 }
183 Self { mode }
184 }
185
186 /// Disabled enforcer (k8s backend, or no container runtime).
187 fn disabled() -> Self {
188 Self {
189 mode: EnforcementMode::Disabled,
190 }
191 }
192
    /// The enforcement mode this enforcer was constructed with.
    pub fn mode(&self) -> EnforcementMode {
        self.mode
    }
196
197 pub fn enabled(&self) -> bool {
198 self.mode != EnforcementMode::Disabled
199 }
200
201 /// Atomically swap in the rendered ruleset via `nft -f -`. No-op when
202 /// disabled. Best-effort: a failed apply logs and leaves the previous
203 /// ruleset in place rather than erroring the originating API call.
204 async fn reconcile(&self, subnets: &[SubnetFirewall]) {
205 if self.mode == EnforcementMode::Disabled {
206 return;
207 }
208 // Instances in the same subnet share one Linux bridge; their traffic is
209 // L2-switched and only traverses the `forward` chain (where our SG rules
210 // live) when bridge netfilter is enabled. Without this, same-subnet SG
211 // rules silently filter nothing — exactly what the real-packet E2E
212 // caught. Needs CAP_NET_ADMIN (which the enforcer holds) and the
213 // `modprobe`/`sysctl` binaries (shipped via kmod/procps in the image).
214 // Warn rather than swallow the error: a missing binary or a failed call
215 // means enforcement degrades to filtering nothing, and the operator who
216 // opted in deserves to know (bug-audit 2026-06-20, 0.B1).
217 match tokio::process::Command::new("modprobe")
218 .arg("br_netfilter")
219 .output()
220 .await
221 {
222 Ok(o) if o.status.success() => {}
223 Ok(o) => tracing::warn!(
224 stderr = %String::from_utf8_lossy(&o.stderr).trim(),
225 "modprobe br_netfilter failed; same-subnet security-group enforcement may filter nothing"
226 ),
227 Err(e) => tracing::warn!(
228 error = %e,
229 "could not run modprobe (is kmod installed?); same-subnet security-group enforcement may filter nothing"
230 ),
231 }
232 match tokio::process::Command::new("sysctl")
233 .args(["-w", "net.bridge.bridge-nf-call-iptables=1"])
234 .output()
235 .await
236 {
237 Ok(o) if o.status.success() => {}
238 Ok(o) => tracing::warn!(
239 stderr = %String::from_utf8_lossy(&o.stderr).trim(),
240 "sysctl bridge-nf-call-iptables=1 failed; same-subnet security-group enforcement may filter nothing"
241 ),
242 Err(e) => tracing::warn!(
243 error = %e,
244 "could not run sysctl (is procps installed?); same-subnet security-group enforcement may filter nothing"
245 ),
246 }
247 let ruleset = render_ruleset(subnets);
248 use tokio::io::AsyncWriteExt;
249 let mut child = match tokio::process::Command::new("nft")
250 .args(["-f", "-"])
251 .stdin(std::process::Stdio::piped())
252 .stdout(std::process::Stdio::null())
253 .stderr(std::process::Stdio::piped())
254 .spawn()
255 {
256 Ok(c) => c,
257 Err(e) => {
258 tracing::warn!(error = %e, "failed to spawn nft; security-group ruleset not applied");
259 return;
260 }
261 };
262 if let Some(mut stdin) = child.stdin.take() {
263 let _ = stdin.write_all(ruleset.as_bytes()).await;
264 let _ = stdin.shutdown().await;
265 }
266 match child.wait_with_output().await {
267 Ok(out) if out.status.success() => {
268 tracing::debug!(
269 subnets = subnets.len(),
270 "applied EC2 security-group nft ruleset"
271 );
272 }
273 Ok(out) => {
274 tracing::warn!(
275 stderr = %String::from_utf8_lossy(&out.stderr).trim(),
276 "nft rejected the security-group ruleset; leaving the previous ruleset in place"
277 );
278 }
279 Err(e) => tracing::warn!(error = %e, "nft apply failed"),
280 }
281 }
282}
283
/// The EC2 backing-container runtime: the selected backend plus the
/// per-instance records and firewall state needed to drive it. All shared
/// state sits behind `Arc`s, so clones operate on the same instances.
#[derive(Debug, Clone)]
pub struct Ec2Runtime {
    /// The backend (Docker/Podman or Kubernetes) chosen at construction.
    backend: InstanceBackend,
    /// Per-instance backing records, keyed by EC2 instance id, so the
    /// lifecycle operations and reset/shutdown teardown work without
    /// consulting service state.
    instances: Arc<RwLock<HashMap<String, InstanceRecord>>>,
    /// Host firewall enforcer for security groups + NACLs.
    firewall: FirewallEnforcer,
    /// Serializes firewall reconciles. Reconcile is fired from many concurrent
    /// background tasks (per SG/NACL/lifecycle event); without this, two
    /// reconciles built from divergent state could interleave so the k8s
    /// apply+prune of one deletes a policy the other just applied (bug-hunt
    /// 2026-06-18 finding 4.3). Holding it across the whole reconcile makes the
    /// last-started reconcile the last-applied for both backends.
    reconcile_lock: Arc<tokio::sync::Mutex<()>>,
}
301
302impl Ec2Runtime {
303 /// Construct the Docker/Podman backend. Returns `None` when no container
304 /// CLI is available — callers then run in metadata-only mode.
305 pub fn new() -> Option<Self> {
306 let cli = fakecloud_core::container_net::detect_container_cli()?;
307 Some(Self {
308 backend: InstanceBackend::Docker(DockerInstances {
309 cli,
310 instance_id: format!("fakecloud-{}", std::process::id()),
311 }),
312 instances: Arc::new(RwLock::new(HashMap::new())),
313 firewall: FirewallEnforcer::detect(),
314 reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
315 })
316 }
317
318 /// Construct the Kubernetes backend. `server_port` is fakecloud's bound
319 /// port (used when `FAKECLOUD_K8S_SELF_URL` omits one). Fails fast on
320 /// misconfiguration — never silently degrades to Docker.
321 pub async fn new_k8s(server_port: u16) -> Result<Self, BackendInitError> {
322 let backend = k8s::K8sInstances::from_env(server_port).await?;
323 Ok(Self {
324 backend: InstanceBackend::K8s(backend),
325 instances: Arc::new(RwLock::new(HashMap::new())),
326 // k8s isolation is a NetworkPolicy concern (phase 4), not host nft.
327 firewall: FirewallEnforcer::disabled(),
328 reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
329 })
330 }
331
    /// Borrow the firewall enforcer, so the control plane can skip building
    /// the model when enforcement is disabled and report the mode for
    /// introspection.
    pub fn firewall(&self) -> &FirewallEnforcer {
        &self.firewall
    }
337
338 /// Re-render and atomically apply the security-group/NACL ruleset for the
339 /// given per-subnet model. No-op (cheap) when enforcement is disabled.
340 /// Serialized against other reconciles (finding 4.3).
341 pub async fn reconcile_firewall(&self, subnets: Vec<SubnetFirewall>) {
342 let _guard = self.reconcile_lock.lock().await;
343 self.firewall.reconcile(&subnets).await;
344 }
345
346 /// Whether this runtime backs network isolation with real enforcement —
347 /// host nftables (Docker, opt-in) or k8s NetworkPolicy. Lets the control
348 /// plane skip building the firewall model entirely when neither applies.
349 pub fn network_isolation_enforced(&self) -> bool {
350 self.firewall.enabled() || self.is_k8s()
351 }
352
353 /// True for the Kubernetes backend (isolation via NetworkPolicy).
354 pub fn is_k8s(&self) -> bool {
355 matches!(self.backend, InstanceBackend::K8s(_))
356 }
357
358 /// Apply one NetworkPolicy per instance for the k8s backend. No-op on the
359 /// Docker backend (which uses nftables instead). Serialized against other
360 /// reconciles so a concurrent apply+prune can't delete a just-applied
361 /// policy (finding 4.3).
362 pub async fn reconcile_network_policies(&self, rules: Vec<InstanceRules>) {
363 if let InstanceBackend::K8s(k) = &self.backend {
364 let _guard = self.reconcile_lock.lock().await;
365 k.reconcile_network_policies(&rules).await;
366 }
367 }
368
369 /// A snapshot of how this runtime isolates instance traffic, for the
370 /// `/_fakecloud/ec2/instance-networks` introspection endpoint (#1745 ph5).
371 pub fn network_isolation_summary(&self) -> NetworkIsolationSummary {
372 match &self.backend {
373 InstanceBackend::Docker(d) => NetworkIsolationSummary {
374 backend: if fakecloud_core::container_net::is_podman_binary(&d.cli) {
375 "podman"
376 } else {
377 "docker"
378 },
379 sg_enforcement: match self.firewall.mode() {
380 EnforcementMode::Nftables => "nftables",
381 EnforcementMode::Disabled => "disabled",
382 },
383 enforced: self.firewall.enabled(),
384 },
385 InstanceBackend::K8s(k) => NetworkIsolationSummary {
386 backend: "kubernetes",
387 sg_enforcement: "networkpolicy",
388 // NetworkPolicies are always created; "enforced" reflects
389 // whether the detected CNI actually applies them.
390 enforced: k.cni_enforces(),
391 },
392 }
393 }
394
395 /// Name of the active backend, for logging.
396 pub fn cli_name(&self) -> &str {
397 match &self.backend {
398 InstanceBackend::Docker(d) => &d.cli,
399 InstanceBackend::K8s(_) => "kubernetes",
400 }
401 }
402
403 /// Boot a container for an instance. `user_data` is the base64-encoded
404 /// user-data as received on the wire (RunInstances `UserData`), run at
405 /// boot the way cloud-init would, if present.
406 pub async fn run_instance(
407 &self,
408 account_id: &str,
409 instance_id: &str,
410 user_data: Option<&str>,
411 tags: &BTreeMap<String, String>,
412 network: Option<&InstanceNetwork>,
413 ) -> Result<RunningInstance, RuntimeError> {
414 let image = default_image();
415 let running = match &self.backend {
416 // Docker attaches the container to the subnet's per-VPC bridge for
417 // L3 isolation. k8s pods share a flat network; isolation there is a
418 // NetworkPolicy concern handled separately (#1745 phase 4).
419 InstanceBackend::Docker(d) => {
420 d.run_instance(account_id, instance_id, &image, user_data, network)
421 .await?
422 }
423 InstanceBackend::K8s(k) => k.spawn_pod(instance_id, &image, user_data, tags).await?,
424 };
425 self.instances.write().insert(
426 instance_id.to_string(),
427 InstanceRecord {
428 handle: running.container_id.clone(),
429 account_id: account_id.to_string(),
430 image,
431 user_data: user_data.map(str::to_string),
432 tags: tags.clone(),
433 network: network.cloned(),
434 },
435 );
436 Ok(running)
437 }
438
439 /// Stop an instance's backing container (maps to `StopInstances`).
440 /// Docker stops the container in place; k8s deletes the Pod (recreated
441 /// on the next `Start`).
442 pub async fn stop_instance(&self, instance_id: &str) {
443 let Some(handle) = self.handle_of(instance_id) else {
444 return;
445 };
446 match &self.backend {
447 InstanceBackend::Docker(d) => d.stop(&handle).await,
448 InstanceBackend::K8s(k) => k.delete_pod(&handle).await,
449 }
450 }
451
452 /// Start a previously-stopped instance (maps to `StartInstances`).
453 /// Returns the running container's (possibly new) handle and private IP.
454 /// Docker starts the existing container; k8s recreates the Pod under a new
455 /// unique name, so the handle changes — callers should persist it.
456 pub async fn start_instance(&self, instance_id: &str) -> Option<RunningInstance> {
457 let record = self.instances.read().get(instance_id)?.clone();
458 match &self.backend {
459 InstanceBackend::Docker(d) => {
460 // Same container; only the IP may change. The subnet network the
461 // container was created on persists across stop/start.
462 let private_ip = d.start(&record.handle).await?;
463 Some(RunningInstance {
464 container_id: record.handle,
465 private_ip,
466 network: record
467 .network
468 .as_ref()
469 .map(|n| subnet_network_name(&n.subnet_id)),
470 })
471 }
472 InstanceBackend::K8s(k) => {
473 let running = k
474 .spawn_pod(
475 instance_id,
476 &record.image,
477 record.user_data.as_deref(),
478 &record.tags,
479 )
480 .await
481 .ok()?;
482 self.update_handle(instance_id, &running.container_id);
483 Some(running)
484 }
485 }
486 }
487
488 /// Restart an instance's backing container (maps to `RebootInstances`).
489 /// Docker restarts in place; k8s deletes and recreates the Pod under a new
490 /// name. Returns the running container's handle + IP when it changed (k8s),
491 /// so callers can persist the new handle; `None` when nothing to update.
492 pub async fn reboot_instance(&self, instance_id: &str) -> Option<RunningInstance> {
493 let record = self.instances.read().get(instance_id).cloned()?;
494 match &self.backend {
495 InstanceBackend::Docker(d) => {
496 d.reboot(&record.handle).await;
497 None
498 }
499 InstanceBackend::K8s(k) => {
500 k.delete_pod(&record.handle).await;
501 let running = k
502 .spawn_pod(
503 instance_id,
504 &record.image,
505 record.user_data.as_deref(),
506 &record.tags,
507 )
508 .await
509 .ok()?;
510 self.update_handle(instance_id, &running.container_id);
511 Some(running)
512 }
513 }
514 }
515
516 /// Remove an instance's backing container (maps to `TerminateInstances`).
517 pub async fn terminate_instance(&self, instance_id: &str) {
518 let record = self.instances.write().remove(instance_id);
519 if let Some(record) = record {
520 match &self.backend {
521 InstanceBackend::Docker(d) => {
522 d.remove(&record.handle).await;
523 // Drop the durable root-disk volume so a later instance
524 // reusing this id starts clean (terminate = volume gone,
525 // matching a deleted EBS root volume). No-op when volumes
526 // are disabled.
527 d.remove_data_volume(&record.account_id, instance_id).await;
528 }
529 InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
530 }
531 }
532 }
533
534 /// Tear down every container this runtime spawned (used on reset and
535 /// shutdown). The Docker backend leans on the shared reaper for any
536 /// container it loses track of.
537 pub async fn stop_all(&self) {
538 let records: Vec<InstanceRecord> = {
539 let mut instances = self.instances.write();
540 instances.drain().map(|(_, r)| r).collect()
541 };
542 for record in records {
543 match &self.backend {
544 InstanceBackend::Docker(d) => d.remove(&record.handle).await,
545 InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
546 }
547 }
548 }
549
550 /// Sweep instance Pods orphaned by a previous fakecloud process (k8s
551 /// only; the Docker backend relies on the shared reaper).
552 pub async fn reap_stale(&self) {
553 if let InstanceBackend::K8s(k) = &self.backend {
554 k.reap_stale().await;
555 }
556 }
557
558 /// The backing container's console log — its combined stdout/stderr, which
559 /// includes anything user-data printed at boot (maps to `GetConsoleOutput`).
560 /// `None` for an unbacked instance or when logs can't be read.
561 pub async fn console_output(&self, instance_id: &str) -> Option<Vec<u8>> {
562 let handle = self.handle_of(instance_id)?;
563 match &self.backend {
564 InstanceBackend::Docker(d) => d.logs(&handle).await,
565 InstanceBackend::K8s(k) => k.logs(&handle).await,
566 }
567 }
568
569 fn handle_of(&self, instance_id: &str) -> Option<String> {
570 self.instances
571 .read()
572 .get(instance_id)
573 .map(|r| r.handle.clone())
574 }
575
576 fn update_handle(&self, instance_id: &str, handle: &str) {
577 if let Some(record) = self.instances.write().get_mut(instance_id) {
578 record.handle = handle.to_string();
579 }
580 }
581}
582
583fn default_image() -> String {
584 std::env::var(DEFAULT_IMAGE_ENV).unwrap_or_else(|_| DEFAULT_IMAGE.to_string())
585}
586
/// Path inside the instance that the durable data volume is mounted at.
/// `FAKECLOUD_EC2_INSTANCE_DATA_DIR` overrides the default of
/// `/var/lib/fakecloud/ec2`, so the volume can capture whichever path the
/// workload keeps its long-lived state in. This is fakecloud's
/// persistent-instance-data convention, not a full root-filesystem snapshot:
/// data written here survives restart and stop/start; the rest of the
/// container's ephemeral filesystem does not.
fn instance_data_dir() -> String {
    match std::env::var("FAKECLOUD_EC2_INSTANCE_DATA_DIR") {
        Ok(dir) => dir,
        Err(_) => String::from("/var/lib/fakecloud/ec2"),
    }
}
597
/// Whether EC2 instance data should survive a fakecloud restart via a durable
/// named volume. OFF by default (ephemeral, keeping test/CI runs clean and
/// avoiding a stale volume bleeding into a later instance that reuses an id);
/// opt in with `FAKECLOUD_PERSIST_EC2_VOLUMES=1`. A follow-up flips this
/// default to on in persistent storage mode by exporting the same env var.
///
/// Accepted truthy values are `1`, `true` and `yes`, compared
/// case-insensitively and ignoring surrounding whitespace (so `True`, `YES`
/// and `TRUE` all enable persistence). Anything else — including an unset or
/// unreadable variable — means disabled.
fn ec2_volumes_enabled() -> bool {
    const TRUTHY: [&str; 3] = ["1", "true", "yes"];
    std::env::var("FAKECLOUD_PERSIST_EC2_VOLUMES")
        .map(|raw| {
            let value = raw.trim();
            TRUTHY.iter().any(|on| value.eq_ignore_ascii_case(on))
        })
        .unwrap_or(false)
}
608
/// Build the Docker volume name for an instance's data directory:
/// `fakecloud-ec2-data-<account>-<instance>`. Only the account and instance
/// ids feed the name (NOT the per-process fakecloud instance id), so the same
/// volume reattaches after a fakecloud restart recreates the container.
/// Every character outside Docker's `[a-zA-Z0-9_.-]` volume-name set is
/// replaced by `-`, one replacement per character.
fn data_volume_name(account_id: &str, instance_id: &str) -> String {
    /// Replace each character Docker would reject in a volume name with `-`.
    fn sanitized(raw: &str) -> String {
        let mut out = String::with_capacity(raw.len());
        for ch in raw.chars() {
            let allowed = ch.is_ascii_alphanumeric() || matches!(ch, '_' | '.' | '-');
            out.push(if allowed { ch } else { '-' });
        }
        out
    }

    let account = sanitized(account_id);
    let instance = sanitized(instance_id);
    format!("fakecloud-ec2-data-{account}-{instance}")
}
631
/// Keep-alive command + user-data wrapper for a base image. Shared by both
/// backends so they boot identical containers. When `user_data` (base64) is
/// present it is decoded and run as a root shell script, backgrounded so a
/// slow script never blocks readiness, then the container tails forever.
///
/// `None` and the empty string both yield the bare `tail -f /dev/null`
/// keep-alive.
///
/// The user-data is spliced into a single-quoted shell word, so any `'` it
/// contains is escaped as `'\''`. The value comes straight off the wire and is
/// not guaranteed to be valid base64; without the escape a stray quote would
/// corrupt the wrapper script (and could stop the keep-alive `tail` from ever
/// running). Valid base64 never contains `'`, so its output is unchanged.
fn boot_command(user_data: Option<&str>) -> Vec<String> {
    match user_data.filter(|s| !s.is_empty()) {
        Some(b64) => {
            // POSIX single-quote escape: close the quote, emit an escaped
            // quote, reopen the quote.
            let quoted = b64.replace('\'', r"'\''");
            let script =
                format!("printf %s '{quoted}' | base64 -d | sh & exec tail -f /dev/null");
            vec!["sh".to_string(), "-c".to_string(), script]
        }
        None => vec![
            "tail".to_string(),
            "-f".to_string(),
            "/dev/null".to_string(),
        ],
    }
}
649
/// Docker/Podman backend: shells out to the container CLI.
#[derive(Debug, Clone)]
struct DockerInstances {
    /// The container CLI binary every command is run through.
    cli: String,
    /// This fakecloud process's ownership id (`fakecloud-<pid>` when built by
    /// `Ec2Runtime::new`), stamped as the `fakecloud-instance` label on the
    /// containers and networks it creates.
    instance_id: String,
}
656
657impl DockerInstances {
658 async fn run_instance(
659 &self,
660 account_id: &str,
661 instance_id: &str,
662 image: &str,
663 user_data: Option<&str>,
664 network: Option<&InstanceNetwork>,
665 ) -> Result<RunningInstance, RuntimeError> {
666 // Ensure the subnet's bridge exists and attach to it for L3 isolation.
667 // Network creation is best-effort: on failure we fall back to the
668 // default bridge so the instance still boots (no regression vs today).
669 let attached_network = match network {
670 Some(net) => self.ensure_subnet_network(net).await,
671 None => None,
672 };
673
674 let mut args: Vec<String> = vec![
675 "run".to_string(),
676 "-d".to_string(),
677 "--label".to_string(),
678 format!("fakecloud-ec2={instance_id}"),
679 "--label".to_string(),
680 format!("fakecloud-instance={}", self.instance_id),
681 ];
682 // Optionally back the instance's writable data directory with a durable
683 // named volume keyed on account + instance id, so the filesystem state
684 // an instance writes there survives a fakecloud restart (the recovery
685 // path recreates the container, which reattaches the same volume) and a
686 // stop/start (Docker reuses the same container, so the volume persists
687 // regardless). OFF by default to keep test/CI runs ephemeral and avoid
688 // a stale volume bleeding into a later instance that reuses an id;
689 // enabled by `FAKECLOUD_PERSIST_EC2_VOLUMES=1` (and, in a follow-up,
690 // default-on in persistent storage mode). The volume is dropped on
691 // TerminateInstances. See [`data_volume_name`] / [`instance_data_dir`].
692 if ec2_volumes_enabled() {
693 args.push("-v".to_string());
694 args.push(format!(
695 "{}:{}",
696 data_volume_name(account_id, instance_id),
697 instance_data_dir()
698 ));
699 }
700 if let Some(name) = &attached_network {
701 args.push("--network".to_string());
702 args.push(name.clone());
703 }
704 args.push(image.to_string());
705 args.extend(boot_command(user_data));
706
707 let output = tokio::process::Command::new(&self.cli)
708 .args(&args)
709 .output()
710 .await
711 .map_err(|e| RuntimeError::ContainerStartFailed(e.to_string()))?;
712
713 if !output.status.success() {
714 return Err(RuntimeError::ContainerStartFailed(
715 String::from_utf8_lossy(&output.stderr).trim().to_string(),
716 ));
717 }
718
719 let container_id = String::from_utf8_lossy(&output.stdout).trim().to_string();
720 let private_ip = self
721 .inspect_ip(&container_id)
722 .await
723 .unwrap_or_else(|| "10.0.0.1".to_string());
724
725 Ok(RunningInstance {
726 container_id,
727 private_ip,
728 network: attached_network,
729 })
730 }
731
732 /// Create (idempotently) the daemon network backing a subnet and return its
733 /// name, or `None` if creation failed (caller falls back to the default
734 /// bridge). The network carries the shared `fakecloud-instance` ownership
735 /// label so the startup reaper prunes it after an ungraceful restart, plus
736 /// a `fakecloud-subnet=<id>` label for introspection. Private subnets get
737 /// an `--internal` network (no NAT to the host/internet).
738 async fn ensure_subnet_network(&self, net: &InstanceNetwork) -> Option<String> {
739 let name = subnet_network_name(&net.subnet_id);
740 let mut args = vec!["network".to_string(), "create".to_string()];
741 if net.internal {
742 args.push("--internal".to_string());
743 }
744 args.push("--label".to_string());
745 args.push(format!("fakecloud-subnet={}", net.subnet_id));
746 args.push("--label".to_string());
747 args.push(format!("fakecloud-instance={}", self.instance_id));
748 args.push(name.clone());
749
750 let output = tokio::process::Command::new(&self.cli)
751 .args(&args)
752 .output()
753 .await;
754 match output {
755 // Created fresh.
756 Ok(out) if out.status.success() => Some(name),
757 // Already exists (another instance in the same subnet created it):
758 // a benign race — the network is there, so attach to it.
759 Ok(out) => {
760 let err = String::from_utf8_lossy(&out.stderr);
761 if err.contains("already exists") || err.contains("exists") {
762 Some(name)
763 } else {
764 tracing::warn!(
765 subnet = %net.subnet_id,
766 network = %name,
767 error = %err.trim(),
768 "subnet network creation failed; falling back to default bridge"
769 );
770 None
771 }
772 }
773 Err(e) => {
774 tracing::warn!(
775 subnet = %net.subnet_id,
776 network = %name,
777 error = %e,
778 "subnet network creation failed; falling back to default bridge"
779 );
780 None
781 }
782 }
783 }
784
785 /// Read the container's private IP from `inspect`. Returns `None` if the
786 /// container has no address (e.g. host networking) — the caller falls
787 /// back to a synthesized IP.
788 async fn inspect_ip(&self, container_id: &str) -> Option<String> {
789 let output = tokio::process::Command::new(&self.cli)
790 .args([
791 "inspect",
792 "-f",
793 "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
794 container_id,
795 ])
796 .output()
797 .await
798 .ok()?;
799 if !output.status.success() {
800 return None;
801 }
802 let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
803 if ip.is_empty() {
804 None
805 } else {
806 Some(ip)
807 }
808 }
809
810 async fn stop(&self, container_id: &str) {
811 let _ = tokio::process::Command::new(&self.cli)
812 .args(["stop", container_id])
813 .output()
814 .await;
815 }
816
817 async fn start(&self, container_id: &str) -> Option<String> {
818 let started = tokio::process::Command::new(&self.cli)
819 .args(["start", container_id])
820 .output()
821 .await
822 .map(|o| o.status.success())
823 .unwrap_or(false);
824 if !started {
825 return None;
826 }
827 self.inspect_ip(container_id).await
828 }
829
830 async fn reboot(&self, container_id: &str) {
831 let _ = tokio::process::Command::new(&self.cli)
832 .args(["restart", container_id])
833 .output()
834 .await;
835 }
836
837 async fn remove(&self, container_id: &str) {
838 let _ = tokio::process::Command::new(&self.cli)
839 .args(["rm", "-f", container_id])
840 .output()
841 .await;
842 }
843
844 /// Remove the durable data volume for an instance (called on
845 /// `TerminateInstances`) so a later instance reusing the same id starts
846 /// clean rather than inheriting the terminated instance's filesystem. A
847 /// no-op when volume persistence is disabled (no such volume was created).
848 async fn remove_data_volume(&self, account_id: &str, instance_id: &str) {
849 if !ec2_volumes_enabled() {
850 return;
851 }
852 let _ = tokio::process::Command::new(&self.cli)
853 .args([
854 "volume",
855 "rm",
856 "-f",
857 &data_volume_name(account_id, instance_id),
858 ])
859 .output()
860 .await;
861 }
862
863 /// The container's combined stdout+stderr (`docker logs`). `None` if the
864 /// command fails; an empty log is `Some(vec![])`.
865 async fn logs(&self, container_id: &str) -> Option<Vec<u8>> {
866 let output = tokio::process::Command::new(&self.cli)
867 .args(["logs", container_id])
868 .output()
869 .await
870 .ok()?;
871 if !output.status.success() {
872 return None;
873 }
874 // `docker logs` writes the container's stdout to ours and its stderr to
875 // ours; concatenate so the console output carries both streams.
876 let mut buf = output.stdout;
877 buf.extend_from_slice(&output.stderr);
878 Some(buf)
879 }
880}
881
/// Unit tests for the durable data-volume naming scheme
/// ([`data_volume_name`]).
#[cfg(test)]
mod volume_tests {
    use super::*;

    /// The name is a pure function of account + instance id, and characters
    /// outside Docker's volume-name set are replaced with `-`.
    #[test]
    fn data_volume_name_is_stable_and_sanitized() {
        // Stable across calls (so recovery reattaches the same volume) and
        // keyed on account + instance id, not the per-process instance id.
        assert_eq!(
            data_volume_name("123456789012", "i-0abc123"),
            "fakecloud-ec2-data-123456789012-i-0abc123"
        );
        // Characters outside Docker's volume-name set become '-'.
        assert_eq!(
            data_volume_name("1234/5678", "i-0abc:1"),
            "fakecloud-ec2-data-1234-5678-i-0abc-1"
        );
    }

    /// Different instance ids in one account map to different volume names.
    #[test]
    fn distinct_instances_get_distinct_volumes() {
        // Two instances in the same account never share a data volume, so
        // terminating one cannot wipe another's filesystem.
        assert_ne!(
            data_volume_name("123456789012", "i-aaaa"),
            data_volume_name("123456789012", "i-bbbb")
        );
    }
}