fakecloud_ec2/runtime/mod.rs
1//! Backing-container runtime for EC2 instances.
2//!
3//! `RunInstances` spins a real container per instance; the instance
4//! lifecycle (`Start`/`Stop`/`Reboot`/`Terminate`) maps onto the container
5//! lifecycle, and `DescribeInstances` reports the container's real private
6//! IP. The container can run either as a local Docker/Podman container (the
7//! default) or as a native Kubernetes Pod (`FAKECLOUD_EC2_BACKEND=k8s` or the
8//! global `FAKECLOUD_CONTAINER_BACKEND=k8s`).
9//!
10//! Operations are keyed by **instance id**, not the backend handle: a
11//! Kubernetes Pod can't be stopped and restarted in place, so `Stop` deletes
12//! the Pod and `Start`/`Reboot` recreate it. The runtime therefore keeps,
13//! per instance, the handle plus enough of the original request (image,
14//! user-data) to recreate the backing container deterministically.
15//!
16//! The runtime is strictly additive: when no container backend is available
17//! the control plane keeps its metadata-faithful behaviour (synthesized IPs,
18//! state transitions) so every API call still succeeds. Real container
19//! backing is best-effort fidelity layered on top.
20
21pub mod firewall;
22mod k8s;
23pub mod netpolicy;
24
25use std::collections::{BTreeMap, HashMap};
26use std::sync::Arc;
27
28use parking_lot::RwLock;
29
30use firewall::{
31 render_ruleset, resolve_enforcement_mode, EnforcementMode, InstanceRules, SubnetFirewall,
32};
33
/// Name of the environment variable that overrides [`DEFAULT_IMAGE`] (e.g. to
/// point CI at a lighter image).
const DEFAULT_IMAGE_ENV: &str = "FAKECLOUD_EC2_DEFAULT_IMAGE";
/// Default base image an instance's container runs. AMIs don't map to a
/// concrete OS image, so we boot a real Amazon Linux container by default.
/// The container is kept alive with `tail -f /dev/null` — EC2 instances are
/// long-running hosts, not one-shot tasks. `tail` is used rather than
/// `sleep infinity` so any base image works (busybox `sleep` rejects
/// `infinity`).
const DEFAULT_IMAGE: &str = "amazonlinux:2023";
43
/// Failure to back an instance with a real container.
#[derive(Debug, thiserror::Error)]
pub enum RuntimeError {
    /// The backing container could not be started. For the Docker/Podman
    /// backend the payload is either the error from spawning the CLI or the
    /// CLI's trimmed stderr.
    #[error("container failed to start: {0}")]
    ContainerStartFailed(String),
}
49
/// Error initializing the Kubernetes backend at startup. Surfaced to the
/// operator so a misconfigured cluster fails fast rather than silently
/// falling back to Docker.
#[derive(Debug, thiserror::Error)]
pub enum BackendInitError {
    /// Forwarded unchanged from `fakecloud_k8s::K8sEnvError`.
    #[error(transparent)]
    Env(#[from] fakecloud_k8s::K8sEnvError),
    /// Forwarded unchanged from `fakecloud_k8s::K8sPodConfigError`.
    #[error(transparent)]
    PodConfig(#[from] fakecloud_k8s::K8sPodConfigError),
    /// Connecting to the cluster failed; the payload is the rendered cause.
    #[error("failed to connect to the Kubernetes cluster: {0}")]
    Connect(String),
}
62
/// A running instance's backing container, as reported back to the control
/// plane after a launch or (re)start.
#[derive(Debug, Clone)]
pub struct RunningInstance {
    /// Backend-specific handle: a Docker container id, or a Pod name.
    pub container_id: String,
    /// The instance's private IP — the container's address on the daemon
    /// network (Docker) or the Pod IP (k8s). On a fresh Docker launch this
    /// falls back to a synthesized `10.0.0.1` when `inspect` yields no
    /// address.
    pub private_ip: String,
    /// Name of the backing daemon network the container was attached to
    /// (`fakecloud-subnet-<id>`), or `None` when it ran on the default bridge
    /// (no network spec, or creation failed and we fell back). Surfaced for
    /// introspection (#1745 phase 5).
    pub network: Option<String>,
}
77
/// The L3 placement of an instance's backing container: which subnet it lands
/// in and whether that subnet is private.
///
/// Per-subnet networks give the isolation #1745 wants for free: two instances
/// in the same subnet share a bridge and can talk; instances in different
/// subnets / VPCs land on different bridges and cannot route to each other.
#[derive(Debug, Clone)]
pub struct InstanceNetwork {
    /// The EC2 subnet id the instance launched into. Also determines the
    /// backing daemon network's name (see `subnet_network_name`).
    pub subnet_id: String,
    /// True when the subnet has no `0.0.0.0/0 -> igw` route (private): the
    /// backing network is created `--internal` (no NAT to host/internet).
    pub internal: bool,
}
92
/// Derive the daemon network name that backs an EC2 subnet.
///
/// The name is a pure function of `subnet_id` (`fakecloud-subnet-<id>`), so
/// every instance launched into the same subnet resolves to the same bridge.
pub fn subnet_network_name(subnet_id: &str) -> String {
    const PREFIX: &str = "fakecloud-subnet-";
    let mut name = String::with_capacity(PREFIX.len() + subnet_id.len());
    name.push_str(PREFIX);
    name.push_str(subnet_id);
    name
}
98
/// How this runtime isolates instance traffic, surfaced by the
/// `/_fakecloud/ec2/instance-networks` introspection endpoint so users can
/// answer "why can't X reach Y" — which backend, which SG-enforcement
/// mechanism, and whether it's actually active vs degraded to metadata-only.
#[derive(Debug, Clone)]
pub struct NetworkIsolationSummary {
    /// `docker` | `podman` | `kubernetes`.
    pub backend: &'static str,
    /// `nftables` (Docker host firewall) | `networkpolicy` (k8s) | `disabled`.
    pub sg_enforcement: &'static str,
    /// Whether security-group rules are actually enforced. False means rules
    /// are tracked but not applied (no `CAP_NET_ADMIN`, or a CNI that ignores
    /// NetworkPolicy) — phase-2 L3 isolation still holds.
    pub enforced: bool,
}
114
/// What the runtime remembers per instance so it can drive the backing
/// container's lifecycle and recreate it (k8s `Start`/`Reboot`). Stored in
/// the runtime's instance map, keyed by EC2 instance id.
#[derive(Debug, Clone)]
struct InstanceRecord {
    /// Docker container id, or Pod name. Replaced whenever a k8s recreate
    /// yields a new Pod.
    handle: String,
    /// Resolved base image, captured at `RunInstances` so a recreate is
    /// identical even if `FAKECLOUD_EC2_DEFAULT_IMAGE` later changes.
    image: String,
    /// Base64 user-data to re-run on recreate, if any.
    user_data: Option<String>,
    /// The instance's tags, captured at `RunInstances`. Reserved
    /// `fakecloud-k8s/*` entries drive per-instance Pod scheduling and must
    /// survive a k8s `Start`/`Reboot` recreate, so they're stored here
    /// rather than re-read from the control plane.
    tags: BTreeMap<String, String>,
    /// The instance's subnet placement, captured at `RunInstances` so a k8s
    /// `Start`/`Reboot` recreate re-applies the same network and phase-5
    /// introspection can report the backing network. `None` in metadata-only
    /// network mode.
    network: Option<InstanceNetwork>,
}
137
/// The selected backing-container backend. Chosen once when the runtime is
/// constructed; every lifecycle operation dispatches on it.
#[derive(Debug, Clone)]
enum InstanceBackend {
    /// Local Docker/Podman containers, driven by shelling out to the CLI.
    Docker(DockerInstances),
    /// Native Kubernetes Pods.
    K8s(k8s::K8sInstances),
}
144
/// Host firewall enforcement for security groups + NACLs (#1745 phase 3).
///
/// The network-driver abstraction the issue asks for: today there is one real
/// driver (nftables) plus the degraded no-op, selected once at construction.
/// Branching on podman vs docker isn't needed explicitly — rootless podman
/// can't touch the host firewall, so the `nft list ruleset` capability probe
/// already degrades it; rootful podman with netavark passes the same probe.
#[derive(Debug, Clone)]
pub struct FirewallEnforcer {
    /// The enforcement mechanism fixed at construction; never changes
    /// afterwards.
    mode: EnforcementMode,
}
156
157impl FirewallEnforcer {
158 /// Resolve the enforcement mode from `FAKECLOUD_EC2_SG_ENFORCEMENT` and an
159 /// `nft` capability probe, warning once when enforcement was requested but
160 /// can't be backed (so the operator knows it degraded, not silently).
161 fn detect() -> Self {
162 let requested = std::env::var("FAKECLOUD_EC2_SG_ENFORCEMENT").ok();
163 let mode = resolve_enforcement_mode(
164 requested.as_deref(),
165 firewall::host_shares_daemon_netns(),
166 firewall::nft_available,
167 );
168 if requested.is_some() && mode == EnforcementMode::Disabled {
169 tracing::warn!(
170 "EC2 security-group enforcement was requested but it can't take effect here \
171 (needs nftables + CAP_NET_ADMIN on a native-Linux host whose daemon shares this \
172 network namespace — Docker Desktop / podman-machine run the daemon in a VM); \
173 falling back to metadata-only (phase-2 L3 isolation stays active, security-group \
174 rules are tracked but not enforced)"
175 );
176 } else if mode == EnforcementMode::Nftables {
177 tracing::info!("EC2 security-group enforcement active via nftables");
178 }
179 Self { mode }
180 }
181
182 /// Disabled enforcer (k8s backend, or no container runtime).
183 fn disabled() -> Self {
184 Self {
185 mode: EnforcementMode::Disabled,
186 }
187 }
188
189 pub fn mode(&self) -> EnforcementMode {
190 self.mode
191 }
192
193 pub fn enabled(&self) -> bool {
194 self.mode != EnforcementMode::Disabled
195 }
196
197 /// Atomically swap in the rendered ruleset via `nft -f -`. No-op when
198 /// disabled. Best-effort: a failed apply logs and leaves the previous
199 /// ruleset in place rather than erroring the originating API call.
200 async fn reconcile(&self, subnets: &[SubnetFirewall]) {
201 if self.mode == EnforcementMode::Disabled {
202 return;
203 }
204 // Instances in the same subnet share one Linux bridge; their traffic is
205 // L2-switched and only traverses the `forward` chain (where our SG rules
206 // live) when bridge netfilter is enabled. Without this, same-subnet SG
207 // rules silently filter nothing — exactly what the real-packet E2E
208 // caught. Best-effort (needs CAP_NET_ADMIN, which the enforcer holds).
209 let _ = tokio::process::Command::new("modprobe")
210 .arg("br_netfilter")
211 .output()
212 .await;
213 let _ = tokio::process::Command::new("sysctl")
214 .args(["-w", "net.bridge.bridge-nf-call-iptables=1"])
215 .output()
216 .await;
217 let ruleset = render_ruleset(subnets);
218 use tokio::io::AsyncWriteExt;
219 let mut child = match tokio::process::Command::new("nft")
220 .args(["-f", "-"])
221 .stdin(std::process::Stdio::piped())
222 .stdout(std::process::Stdio::null())
223 .stderr(std::process::Stdio::piped())
224 .spawn()
225 {
226 Ok(c) => c,
227 Err(e) => {
228 tracing::warn!(error = %e, "failed to spawn nft; security-group ruleset not applied");
229 return;
230 }
231 };
232 if let Some(mut stdin) = child.stdin.take() {
233 let _ = stdin.write_all(ruleset.as_bytes()).await;
234 let _ = stdin.shutdown().await;
235 }
236 match child.wait_with_output().await {
237 Ok(out) if out.status.success() => {
238 tracing::debug!(
239 subnets = subnets.len(),
240 "applied EC2 security-group nft ruleset"
241 );
242 }
243 Ok(out) => {
244 tracing::warn!(
245 stderr = %String::from_utf8_lossy(&out.stderr).trim(),
246 "nft rejected the security-group ruleset; leaving the previous ruleset in place"
247 );
248 }
249 Err(e) => tracing::warn!(error = %e, "nft apply failed"),
250 }
251 }
252}
253
/// Drives the backing containers for EC2 instances on the selected backend
/// and owns the firewall enforcer. Shared state is behind `Arc`s, so clones
/// operate on the same instance map and reconcile lock.
#[derive(Debug, Clone)]
pub struct Ec2Runtime {
    /// The backend chosen at construction (Docker/Podman or Kubernetes).
    backend: InstanceBackend,
    /// Per-instance backing records, keyed by EC2 instance id, so the
    /// lifecycle operations and reset/shutdown teardown work without
    /// consulting service state.
    instances: Arc<RwLock<HashMap<String, InstanceRecord>>>,
    /// Host firewall enforcer for security groups + NACLs.
    firewall: FirewallEnforcer,
    /// Serializes firewall reconciles. Reconcile is fired from many concurrent
    /// background tasks (per SG/NACL/lifecycle event); without this, two
    /// reconciles built from divergent state could interleave so the k8s
    /// apply+prune of one deletes a policy the other just applied (bug-hunt
    /// 2026-06-18 finding 4.3). Holding it across the whole reconcile makes the
    /// last-started reconcile the last-applied for both backends.
    reconcile_lock: Arc<tokio::sync::Mutex<()>>,
}
271
272impl Ec2Runtime {
273 /// Construct the Docker/Podman backend. Returns `None` when no container
274 /// CLI is available — callers then run in metadata-only mode.
275 pub fn new() -> Option<Self> {
276 let cli = fakecloud_core::container_net::detect_container_cli()?;
277 Some(Self {
278 backend: InstanceBackend::Docker(DockerInstances {
279 cli,
280 instance_id: format!("fakecloud-{}", std::process::id()),
281 }),
282 instances: Arc::new(RwLock::new(HashMap::new())),
283 firewall: FirewallEnforcer::detect(),
284 reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
285 })
286 }
287
288 /// Construct the Kubernetes backend. `server_port` is fakecloud's bound
289 /// port (used when `FAKECLOUD_K8S_SELF_URL` omits one). Fails fast on
290 /// misconfiguration — never silently degrades to Docker.
291 pub async fn new_k8s(server_port: u16) -> Result<Self, BackendInitError> {
292 let backend = k8s::K8sInstances::from_env(server_port).await?;
293 Ok(Self {
294 backend: InstanceBackend::K8s(backend),
295 instances: Arc::new(RwLock::new(HashMap::new())),
296 // k8s isolation is a NetworkPolicy concern (phase 4), not host nft.
297 firewall: FirewallEnforcer::disabled(),
298 reconcile_lock: Arc::new(tokio::sync::Mutex::new(())),
299 })
300 }
301
302 /// The firewall enforcer, so the control plane can skip building the model
303 /// when enforcement is disabled and report the mode for introspection.
304 pub fn firewall(&self) -> &FirewallEnforcer {
305 &self.firewall
306 }
307
308 /// Re-render and atomically apply the security-group/NACL ruleset for the
309 /// given per-subnet model. No-op (cheap) when enforcement is disabled.
310 /// Serialized against other reconciles (finding 4.3).
311 pub async fn reconcile_firewall(&self, subnets: Vec<SubnetFirewall>) {
312 let _guard = self.reconcile_lock.lock().await;
313 self.firewall.reconcile(&subnets).await;
314 }
315
316 /// Whether this runtime backs network isolation with real enforcement —
317 /// host nftables (Docker, opt-in) or k8s NetworkPolicy. Lets the control
318 /// plane skip building the firewall model entirely when neither applies.
319 pub fn network_isolation_enforced(&self) -> bool {
320 self.firewall.enabled() || self.is_k8s()
321 }
322
323 /// True for the Kubernetes backend (isolation via NetworkPolicy).
324 pub fn is_k8s(&self) -> bool {
325 matches!(self.backend, InstanceBackend::K8s(_))
326 }
327
328 /// Apply one NetworkPolicy per instance for the k8s backend. No-op on the
329 /// Docker backend (which uses nftables instead). Serialized against other
330 /// reconciles so a concurrent apply+prune can't delete a just-applied
331 /// policy (finding 4.3).
332 pub async fn reconcile_network_policies(&self, rules: Vec<InstanceRules>) {
333 if let InstanceBackend::K8s(k) = &self.backend {
334 let _guard = self.reconcile_lock.lock().await;
335 k.reconcile_network_policies(&rules).await;
336 }
337 }
338
339 /// A snapshot of how this runtime isolates instance traffic, for the
340 /// `/_fakecloud/ec2/instance-networks` introspection endpoint (#1745 ph5).
341 pub fn network_isolation_summary(&self) -> NetworkIsolationSummary {
342 match &self.backend {
343 InstanceBackend::Docker(d) => NetworkIsolationSummary {
344 backend: if fakecloud_core::container_net::is_podman_binary(&d.cli) {
345 "podman"
346 } else {
347 "docker"
348 },
349 sg_enforcement: match self.firewall.mode() {
350 EnforcementMode::Nftables => "nftables",
351 EnforcementMode::Disabled => "disabled",
352 },
353 enforced: self.firewall.enabled(),
354 },
355 InstanceBackend::K8s(k) => NetworkIsolationSummary {
356 backend: "kubernetes",
357 sg_enforcement: "networkpolicy",
358 // NetworkPolicies are always created; "enforced" reflects
359 // whether the detected CNI actually applies them.
360 enforced: k.cni_enforces(),
361 },
362 }
363 }
364
365 /// Name of the active backend, for logging.
366 pub fn cli_name(&self) -> &str {
367 match &self.backend {
368 InstanceBackend::Docker(d) => &d.cli,
369 InstanceBackend::K8s(_) => "kubernetes",
370 }
371 }
372
373 /// Boot a container for an instance. `user_data` is the base64-encoded
374 /// user-data as received on the wire (RunInstances `UserData`), run at
375 /// boot the way cloud-init would, if present.
376 pub async fn run_instance(
377 &self,
378 instance_id: &str,
379 user_data: Option<&str>,
380 tags: &BTreeMap<String, String>,
381 network: Option<&InstanceNetwork>,
382 ) -> Result<RunningInstance, RuntimeError> {
383 let image = default_image();
384 let running = match &self.backend {
385 // Docker attaches the container to the subnet's per-VPC bridge for
386 // L3 isolation. k8s pods share a flat network; isolation there is a
387 // NetworkPolicy concern handled separately (#1745 phase 4).
388 InstanceBackend::Docker(d) => {
389 d.run_instance(instance_id, &image, user_data, network)
390 .await?
391 }
392 InstanceBackend::K8s(k) => k.spawn_pod(instance_id, &image, user_data, tags).await?,
393 };
394 self.instances.write().insert(
395 instance_id.to_string(),
396 InstanceRecord {
397 handle: running.container_id.clone(),
398 image,
399 user_data: user_data.map(str::to_string),
400 tags: tags.clone(),
401 network: network.cloned(),
402 },
403 );
404 Ok(running)
405 }
406
407 /// Stop an instance's backing container (maps to `StopInstances`).
408 /// Docker stops the container in place; k8s deletes the Pod (recreated
409 /// on the next `Start`).
410 pub async fn stop_instance(&self, instance_id: &str) {
411 let Some(handle) = self.handle_of(instance_id) else {
412 return;
413 };
414 match &self.backend {
415 InstanceBackend::Docker(d) => d.stop(&handle).await,
416 InstanceBackend::K8s(k) => k.delete_pod(&handle).await,
417 }
418 }
419
420 /// Start a previously-stopped instance (maps to `StartInstances`).
421 /// Returns the running container's (possibly new) handle and private IP.
422 /// Docker starts the existing container; k8s recreates the Pod under a new
423 /// unique name, so the handle changes — callers should persist it.
424 pub async fn start_instance(&self, instance_id: &str) -> Option<RunningInstance> {
425 let record = self.instances.read().get(instance_id)?.clone();
426 match &self.backend {
427 InstanceBackend::Docker(d) => {
428 // Same container; only the IP may change. The subnet network the
429 // container was created on persists across stop/start.
430 let private_ip = d.start(&record.handle).await?;
431 Some(RunningInstance {
432 container_id: record.handle,
433 private_ip,
434 network: record
435 .network
436 .as_ref()
437 .map(|n| subnet_network_name(&n.subnet_id)),
438 })
439 }
440 InstanceBackend::K8s(k) => {
441 let running = k
442 .spawn_pod(
443 instance_id,
444 &record.image,
445 record.user_data.as_deref(),
446 &record.tags,
447 )
448 .await
449 .ok()?;
450 self.update_handle(instance_id, &running.container_id);
451 Some(running)
452 }
453 }
454 }
455
456 /// Restart an instance's backing container (maps to `RebootInstances`).
457 /// Docker restarts in place; k8s deletes and recreates the Pod under a new
458 /// name. Returns the running container's handle + IP when it changed (k8s),
459 /// so callers can persist the new handle; `None` when nothing to update.
460 pub async fn reboot_instance(&self, instance_id: &str) -> Option<RunningInstance> {
461 let record = self.instances.read().get(instance_id).cloned()?;
462 match &self.backend {
463 InstanceBackend::Docker(d) => {
464 d.reboot(&record.handle).await;
465 None
466 }
467 InstanceBackend::K8s(k) => {
468 k.delete_pod(&record.handle).await;
469 let running = k
470 .spawn_pod(
471 instance_id,
472 &record.image,
473 record.user_data.as_deref(),
474 &record.tags,
475 )
476 .await
477 .ok()?;
478 self.update_handle(instance_id, &running.container_id);
479 Some(running)
480 }
481 }
482 }
483
484 /// Remove an instance's backing container (maps to `TerminateInstances`).
485 pub async fn terminate_instance(&self, instance_id: &str) {
486 let record = self.instances.write().remove(instance_id);
487 if let Some(record) = record {
488 match &self.backend {
489 InstanceBackend::Docker(d) => d.remove(&record.handle).await,
490 InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
491 }
492 }
493 }
494
495 /// Tear down every container this runtime spawned (used on reset and
496 /// shutdown). The Docker backend leans on the shared reaper for any
497 /// container it loses track of.
498 pub async fn stop_all(&self) {
499 let records: Vec<InstanceRecord> = {
500 let mut instances = self.instances.write();
501 instances.drain().map(|(_, r)| r).collect()
502 };
503 for record in records {
504 match &self.backend {
505 InstanceBackend::Docker(d) => d.remove(&record.handle).await,
506 InstanceBackend::K8s(k) => k.delete_pod(&record.handle).await,
507 }
508 }
509 }
510
511 /// Sweep instance Pods orphaned by a previous fakecloud process (k8s
512 /// only; the Docker backend relies on the shared reaper).
513 pub async fn reap_stale(&self) {
514 if let InstanceBackend::K8s(k) = &self.backend {
515 k.reap_stale().await;
516 }
517 }
518
519 /// The backing container's console log — its combined stdout/stderr, which
520 /// includes anything user-data printed at boot (maps to `GetConsoleOutput`).
521 /// `None` for an unbacked instance or when logs can't be read.
522 pub async fn console_output(&self, instance_id: &str) -> Option<Vec<u8>> {
523 let handle = self.handle_of(instance_id)?;
524 match &self.backend {
525 InstanceBackend::Docker(d) => d.logs(&handle).await,
526 InstanceBackend::K8s(k) => k.logs(&handle).await,
527 }
528 }
529
530 fn handle_of(&self, instance_id: &str) -> Option<String> {
531 self.instances
532 .read()
533 .get(instance_id)
534 .map(|r| r.handle.clone())
535 }
536
537 fn update_handle(&self, instance_id: &str, handle: &str) {
538 if let Some(record) = self.instances.write().get_mut(instance_id) {
539 record.handle = handle.to_string();
540 }
541 }
542}
543
544fn default_image() -> String {
545 std::env::var(DEFAULT_IMAGE_ENV).unwrap_or_else(|_| DEFAULT_IMAGE.to_string())
546}
547
/// Keep-alive command + user-data wrapper for a base image. Shared by both
/// backends so they boot identical containers. When `user_data` (base64) is
/// present it is decoded and run as a root shell script, backgrounded so a
/// slow script never blocks readiness, then the container tails forever.
///
/// `None` and the empty string both mean "no user-data" and yield a plain
/// `tail -f /dev/null`.
///
/// The payload is embedded in a single-quoted shell word. Well-formed base64
/// never contains `'`, but the value arrives unvalidated from the wire, so
/// any single quote is escaped (`'` -> `'\''`) to keep the payload from
/// terminating the quoted word and injecting shell syntax. Malformed input
/// then simply fails in `base64 -d`.
fn boot_command(user_data: Option<&str>) -> Vec<String> {
    match user_data.filter(|s| !s.is_empty()) {
        Some(b64) => {
            let quoted = b64.replace('\'', r"'\''");
            let script =
                format!("printf %s '{quoted}' | base64 -d | sh & exec tail -f /dev/null");
            vec!["sh".to_string(), "-c".to_string(), script]
        }
        None => vec![
            "tail".to_string(),
            "-f".to_string(),
            "/dev/null".to_string(),
        ],
    }
}
565
/// Docker/Podman backend: shells out to the container CLI.
#[derive(Debug, Clone)]
struct DockerInstances {
    /// The container CLI binary to invoke (as detected at construction).
    cli: String,
    /// Identity of this fakecloud process (`fakecloud-<pid>`), NOT an EC2
    /// instance id. Written as the `fakecloud-instance` ownership label on
    /// every container and network this backend creates.
    instance_id: String,
}
572
573impl DockerInstances {
574 async fn run_instance(
575 &self,
576 instance_id: &str,
577 image: &str,
578 user_data: Option<&str>,
579 network: Option<&InstanceNetwork>,
580 ) -> Result<RunningInstance, RuntimeError> {
581 // Ensure the subnet's bridge exists and attach to it for L3 isolation.
582 // Network creation is best-effort: on failure we fall back to the
583 // default bridge so the instance still boots (no regression vs today).
584 let attached_network = match network {
585 Some(net) => self.ensure_subnet_network(net).await,
586 None => None,
587 };
588
589 let mut args: Vec<String> = vec![
590 "run".to_string(),
591 "-d".to_string(),
592 "--label".to_string(),
593 format!("fakecloud-ec2={instance_id}"),
594 "--label".to_string(),
595 format!("fakecloud-instance={}", self.instance_id),
596 ];
597 if let Some(name) = &attached_network {
598 args.push("--network".to_string());
599 args.push(name.clone());
600 }
601 args.push(image.to_string());
602 args.extend(boot_command(user_data));
603
604 let output = tokio::process::Command::new(&self.cli)
605 .args(&args)
606 .output()
607 .await
608 .map_err(|e| RuntimeError::ContainerStartFailed(e.to_string()))?;
609
610 if !output.status.success() {
611 return Err(RuntimeError::ContainerStartFailed(
612 String::from_utf8_lossy(&output.stderr).trim().to_string(),
613 ));
614 }
615
616 let container_id = String::from_utf8_lossy(&output.stdout).trim().to_string();
617 let private_ip = self
618 .inspect_ip(&container_id)
619 .await
620 .unwrap_or_else(|| "10.0.0.1".to_string());
621
622 Ok(RunningInstance {
623 container_id,
624 private_ip,
625 network: attached_network,
626 })
627 }
628
629 /// Create (idempotently) the daemon network backing a subnet and return its
630 /// name, or `None` if creation failed (caller falls back to the default
631 /// bridge). The network carries the shared `fakecloud-instance` ownership
632 /// label so the startup reaper prunes it after an ungraceful restart, plus
633 /// a `fakecloud-subnet=<id>` label for introspection. Private subnets get
634 /// an `--internal` network (no NAT to the host/internet).
635 async fn ensure_subnet_network(&self, net: &InstanceNetwork) -> Option<String> {
636 let name = subnet_network_name(&net.subnet_id);
637 let mut args = vec!["network".to_string(), "create".to_string()];
638 if net.internal {
639 args.push("--internal".to_string());
640 }
641 args.push("--label".to_string());
642 args.push(format!("fakecloud-subnet={}", net.subnet_id));
643 args.push("--label".to_string());
644 args.push(format!("fakecloud-instance={}", self.instance_id));
645 args.push(name.clone());
646
647 let output = tokio::process::Command::new(&self.cli)
648 .args(&args)
649 .output()
650 .await;
651 match output {
652 // Created fresh.
653 Ok(out) if out.status.success() => Some(name),
654 // Already exists (another instance in the same subnet created it):
655 // a benign race — the network is there, so attach to it.
656 Ok(out) => {
657 let err = String::from_utf8_lossy(&out.stderr);
658 if err.contains("already exists") || err.contains("exists") {
659 Some(name)
660 } else {
661 tracing::warn!(
662 subnet = %net.subnet_id,
663 network = %name,
664 error = %err.trim(),
665 "subnet network creation failed; falling back to default bridge"
666 );
667 None
668 }
669 }
670 Err(e) => {
671 tracing::warn!(
672 subnet = %net.subnet_id,
673 network = %name,
674 error = %e,
675 "subnet network creation failed; falling back to default bridge"
676 );
677 None
678 }
679 }
680 }
681
682 /// Read the container's private IP from `inspect`. Returns `None` if the
683 /// container has no address (e.g. host networking) — the caller falls
684 /// back to a synthesized IP.
685 async fn inspect_ip(&self, container_id: &str) -> Option<String> {
686 let output = tokio::process::Command::new(&self.cli)
687 .args([
688 "inspect",
689 "-f",
690 "{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}",
691 container_id,
692 ])
693 .output()
694 .await
695 .ok()?;
696 if !output.status.success() {
697 return None;
698 }
699 let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
700 if ip.is_empty() {
701 None
702 } else {
703 Some(ip)
704 }
705 }
706
707 async fn stop(&self, container_id: &str) {
708 let _ = tokio::process::Command::new(&self.cli)
709 .args(["stop", container_id])
710 .output()
711 .await;
712 }
713
714 async fn start(&self, container_id: &str) -> Option<String> {
715 let started = tokio::process::Command::new(&self.cli)
716 .args(["start", container_id])
717 .output()
718 .await
719 .map(|o| o.status.success())
720 .unwrap_or(false);
721 if !started {
722 return None;
723 }
724 self.inspect_ip(container_id).await
725 }
726
727 async fn reboot(&self, container_id: &str) {
728 let _ = tokio::process::Command::new(&self.cli)
729 .args(["restart", container_id])
730 .output()
731 .await;
732 }
733
734 async fn remove(&self, container_id: &str) {
735 let _ = tokio::process::Command::new(&self.cli)
736 .args(["rm", "-f", container_id])
737 .output()
738 .await;
739 }
740
741 /// The container's combined stdout+stderr (`docker logs`). `None` if the
742 /// command fails; an empty log is `Some(vec![])`.
743 async fn logs(&self, container_id: &str) -> Option<Vec<u8>> {
744 let output = tokio::process::Command::new(&self.cli)
745 .args(["logs", container_id])
746 .output()
747 .await
748 .ok()?;
749 if !output.status.success() {
750 return None;
751 }
752 // `docker logs` writes the container's stdout to ours and its stderr to
753 // ours; concatenate so the console output carries both streams.
754 let mut buf = output.stdout;
755 buf.extend_from_slice(&output.stderr);
756 Some(buf)
757 }
758}