// cellos-supervisor 0.5.1

//! Linux isolation helpers split out from `supervisor.rs` (P0-2).
//!
//! Hosts:
//! - `linux_subprocess_unshare_flags` — parse `CELLOS_SUBPROCESS_UNSHARE` env var.
//! - `linux_subprocess_cgroup_parent` — parse `CELLOS_CGROUP_PARENT` env var.
//! - `linux_cgroup_write_optional_controller_files` — write `memory.max` /
//!   `cpu.max` to the leaf cgroup before spawn.
//! - `LinuxCgroupAttach` + `linux_cgroup_attach_for_run` — decide which cgroup
//!   leaf to attach the workload to.
//! - `linux_run_cell_command_isolated` — the `spawn_blocking` body that owns
//!   the workload child PID: pre_exec unshare/seccomp, nft apply, optional
//!   in-netns dns_proxy / sni_proxy / per-flow listener spawn, wait/timeout,
//!   post-run counter scrape.
//! - `maybe_warn_linux_isolation_env_on_non_linux` — one-shot warn so
//!   operators on non-Linux supervisors see why CELLOS_SUBPROCESS_UNSHARE
//!   / CELLOS_CGROUP_PARENT are no-ops.
//!
//! Pure code-move from `supervisor.rs`. No logic changes.

#[cfg(target_os = "linux")]
use std::path::PathBuf;
#[cfg(target_os = "linux")]
use std::time::{Duration, Instant};

#[cfg(target_os = "linux")]
use cellos_core::ports::CellHandle;
#[cfg(target_os = "linux")]
use cellos_core::{EgressRule, RunSpec, SecretView};

#[cfg(target_os = "linux")]
use crate::runtime_secret::RuntimeSecretSession;
#[cfg(target_os = "linux")]
use crate::supervisor::{DnsProxyActivation, SniProxyActivation};
#[cfg(target_os = "linux")]
use crate::supervisor_helpers::{run_timeout_message, RunTimeoutSource};

/// Parse `CELLOS_SUBPROCESS_UNSHARE` into a `CLONE_NEW*` bitmask for `unshare(2)`.
///
/// Accepted values (surrounding whitespace is trimmed, matching is ASCII
/// case-insensitive):
/// - `1` or `default` → `CLONE_NEWPID | CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWNET`
///   (private PID/IPC/mount/net namespaces).
/// - A comma-separated list of `pid`, `ipc`, `net`, `uts`, and `mnt`
///   (aliases `mount` / `ns`, all mapping to `CLONE_NEWNS`).
///
/// Unknown non-empty tokens are logged at `warn` and skipped; empty tokens
/// (e.g. from a trailing comma) are skipped silently.
///
/// Returns `None` when the variable is unset, not valid Unicode, blank, or
/// contains no recognised token (the last case also logs a warning).
#[cfg(target_os = "linux")]
pub(crate) fn linux_subprocess_unshare_flags() -> Option<libc::c_int> {
    let value = std::env::var("CELLOS_SUBPROCESS_UNSHARE").ok()?;
    let spec = value.trim();
    if spec.is_empty() {
        return None;
    }

    // Shorthand for the full default namespace set.
    let wants_default = spec == "1" || spec.eq_ignore_ascii_case("default");
    if wants_default {
        return Some(
            libc::CLONE_NEWPID | libc::CLONE_NEWIPC | libc::CLONE_NEWNS | libc::CLONE_NEWNET,
        );
    }

    // OR together the bit for every recognised token, in input order so that
    // warnings for unknown tokens appear in the order the operator wrote them.
    let mask = spec
        .split(',')
        .map(|piece| piece.trim().to_ascii_lowercase())
        .filter(|piece| !piece.is_empty())
        .fold(0, |acc: libc::c_int, piece| {
            let bit = match piece.as_str() {
                "pid" => libc::CLONE_NEWPID,
                "ipc" => libc::CLONE_NEWIPC,
                "net" => libc::CLONE_NEWNET,
                "uts" => libc::CLONE_NEWUTS,
                "mnt" | "mount" | "ns" => libc::CLONE_NEWNS,
                unknown => {
                    tracing::warn!(
                        target: "cellos.supervisor.linux_isolation",
                        token = unknown,
                        "CELLOS_SUBPROCESS_UNSHARE: unknown flag ignored"
                    );
                    0
                }
            };
            acc | bit
        });

    if mask != 0 {
        return Some(mask);
    }

    tracing::warn!(
        target: "cellos.supervisor.linux_isolation",
        "CELLOS_SUBPROCESS_UNSHARE set but no valid flags; ignoring"
    );
    None
}

/// Emit a single process-wide warning when Linux-only isolation env vars are
/// set on a non-Linux Unix host.
///
/// Checks `CELLOS_SUBPROCESS_UNSHARE` and `CELLOS_CGROUP_PARENT`; if either is
/// set to a non-blank value, logs one `warn` explaining that the settings are
/// ignored on this OS. Subsequent calls are silent.
#[cfg(all(unix, not(target_os = "linux")))]
pub(crate) fn maybe_warn_linux_isolation_env_on_non_linux() {
    use std::sync::atomic::{AtomicBool, Ordering};

    // Latched on the first call that actually has something to report.
    static ALREADY_WARNED: AtomicBool = AtomicBool::new(false);

    let is_configured =
        |name: &str| std::env::var(name).is_ok_and(|value| !value.trim().is_empty());

    let any_configured =
        is_configured("CELLOS_SUBPROCESS_UNSHARE") || is_configured("CELLOS_CGROUP_PARENT");

    // `swap` is only reached when something is configured, so an unconfigured
    // call never consumes the one-shot latch.
    if any_configured && !ALREADY_WARNED.swap(true, Ordering::Relaxed) {
        tracing::warn!(
            target: "cellos.supervisor.linux_isolation",
            "CELLOS_SUBPROCESS_UNSHARE / CELLOS_CGROUP_PARENT are set but Linux-only isolation is not supported on this OS; ignoring"
        );
    }
}

/// Read the `CELLOS_CGROUP_PARENT` env var as a cgroup v2 parent directory.
///
/// When this is set **and** `CellHandle.cgroup_path` is unset (e.g. stub backend),
/// the supervisor creates a leaf cgroup under this path; a handle supplied by
/// [`ProprietaryCellBackend`] is preferred when present. The directory must be a
/// **writable** unified-hierarchy path. The feature is off unless the variable is set.
///
/// Returns `None` when the variable is unset, not valid Unicode, or blank after
/// trimming; otherwise the trimmed value as a `PathBuf`.
#[cfg(target_os = "linux")]
pub(crate) fn linux_subprocess_cgroup_parent() -> Option<PathBuf> {
    let value = std::env::var("CELLOS_CGROUP_PARENT").ok()?;
    match value.trim() {
        "" => None,
        parent => Some(PathBuf::from(parent)),
    }
}

/// Write the optional cgroup v2 limit files into `leaf`. Intended to run
/// **after** the leaf directory exists and **before** the workload is spawned.
///
/// **`memory.max`:** taken from `CELLOS_CGROUP_MEMORY_MAX` (decimal bytes or
/// `max`). The trimmed value is written followed by a newline; a blank or unset
/// variable writes nothing. The write is best-effort: a failure is logged at
/// `warn` (the parent slice may not delegate `memory`) and otherwise ignored.
///
/// **`cpu.max` (P0-3 / I3, doctrine D1):** delegated to
/// `crate::linux_cgroup::apply_cpu_max_to_leaf`, which receives both
/// `spec_limits` and the raw `CELLOS_CGROUP_CPU_MAX` value. Per that helper's
/// contract, `spec.run.limits.cpuMax` is the **primary** input and the env var
/// is an explicit operator **override** consulted only when the spec is silent.
/// The format mirrors cgroup v2 `cpu.max`: `<quota> <period>` (microseconds) or
/// `max <period>`. Write failures and malformed env overrides are logged at
/// `warn` and otherwise ignored (the parent slice may not delegate `cpu`).
///
/// # Parameters
/// - `leaf`: cgroup leaf directory the controller files are written into.
/// - `spec_limits`: the run spec's limits, if any.
#[cfg(target_os = "linux")]
pub(crate) fn linux_cgroup_write_optional_controller_files(
    leaf: &std::path::Path,
    spec_limits: Option<&cellos_core::types::RunLimits>,
) {
    // --- memory.max: env-only, skipped when unset or blank. ---
    let memory_env = std::env::var("CELLOS_CGROUP_MEMORY_MAX").ok();
    let memory_limit = memory_env
        .as_deref()
        .map(str::trim)
        .filter(|limit| !limit.is_empty());
    if let Some(limit) = memory_limit {
        let memory_file = leaf.join("memory.max");
        if let Err(write_err) = std::fs::write(&memory_file, format!("{limit}\n")) {
            tracing::warn!(
                target: "cellos.supervisor.linux_isolation",
                path = %memory_file.display(),
                error = %write_err,
                "CELLOS_CGROUP_MEMORY_MAX: write failed (memory controller may be unavailable)"
            );
        }
    }

    // --- cpu.max: P0-3 / I3 — the spec value is primary; the env var is an
    // explicit override applied only when the spec is silent. ---
    let cpu_env = std::env::var("CELLOS_CGROUP_CPU_MAX").ok();
    let cpu_override = cpu_env.as_deref();
    let outcome = crate::linux_cgroup::apply_cpu_max_to_leaf(leaf, spec_limits, cpu_override);
    match outcome {
        // Nothing to report: either the limit was written or none was requested.
        crate::linux_cgroup::CpuMaxApplyOutcome::Wrote { .. } => {}
        crate::linux_cgroup::CpuMaxApplyOutcome::Skipped => {}
        crate::linux_cgroup::CpuMaxApplyOutcome::WriteError(failure) => {
            // Attribute the failed write to whichever input supplied the value.
            let spec_has_cpu_max = spec_limits.is_some_and(|limits| limits.cpu_max.is_some());
            let source = match spec_has_cpu_max {
                true => "spec.run.limits.cpuMax",
                false => "CELLOS_CGROUP_CPU_MAX",
            };
            tracing::warn!(
                target: "cellos.supervisor.linux_isolation",
                detail = %failure,
                source = %source,
                "cpu.max: write failed (cpu controller may be unavailable)"
            );
        }
        crate::linux_cgroup::CpuMaxApplyOutcome::InvalidEnvOverride => {
            // The spec was silent and the env override was set but malformed.
            let raw_value = cpu_override.unwrap_or("");
            let diag = crate::linux_cgroup::cpu_max_env_validation_error(cpu_override)
                .unwrap_or("invalid value");
            tracing::warn!(
                target: "cellos.supervisor.linux_isolation",
                value = %raw_value,
                "CELLOS_CGROUP_CPU_MAX: ignoring malformed value ({diag})"
            );
        }
    }
}

/// Which cgroup leaf (if any) the workload child is attached to for a run.
///
/// Produced by `linux_cgroup_attach_for_run` and consumed by
/// `linux_run_cell_command_isolated`, which resolves it to an optional leaf path.
#[cfg(target_os = "linux")]
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum LinuxCgroupAttach {
    /// No cgroup leaf: the handle carries no `cgroup_path` and
    /// `CELLOS_CGROUP_PARENT` is unset or blank.
    None,
    /// Leaf created here; supervisor removes after run (stub backend + env-only path).
    SupervisorEnvLeaf {
        /// Parent directory (from `CELLOS_CGROUP_PARENT`) under which the
        /// per-run leaf directory is created.
        parent: PathBuf,
    },
    /// Leaf created in `ProprietaryCellBackend::create`; backend removes on destroy.
    /// The wrapped path is the leaf itself, taken from `CellHandle.cgroup_path`.
    BackendLeaf(PathBuf),
}

#[cfg(target_os = "linux")]
pub(crate) fn linux_cgroup_attach_for_run(handle: &CellHandle) -> LinuxCgroupAttach {
    if let Some(ref p) = handle.cgroup_path {
        return LinuxCgroupAttach::BackendLeaf(p.clone());
    }
    if let Some(parent) = linux_subprocess_cgroup_parent() {
        LinuxCgroupAttach::SupervisorEnvLeaf { parent }
    } else {
        LinuxCgroupAttach::None
    }
}

/// Run via `std::process::Command`: optional `pre_exec` for `unshare(2)` (+ loopback ioctl when `net`,
/// private workspace when `mnt`), optional seccomp after namespace setup (so baseline can still use `mount`
/// during setup), then `spawn` → write `cgroup.procs` → best-effort nftables policy via nsenter → `wait`.
#[cfg(target_os = "linux")]
#[allow(clippy::too_many_arguments)]
pub(crate) async fn linux_run_cell_command_isolated(
    run: &RunSpec,
    cell_id: &str,
    attach: LinuxCgroupAttach,
    unshare_flags: Option<libc::c_int>,
    egress_rules: &[EgressRule],
    dns_authority: Option<cellos_core::DnsAuthority>,
    secrets: &[SecretView],
    runtime_secret_session: Option<&RuntimeSecretSession>,
    run_timeout: Option<(Duration, RunTimeoutSource)>,
    seccomp_program: Option<Vec<u8>>,
    dns_proxy_activation: Option<DnsProxyActivation>,
    dns_proxy_emitter: Option<std::sync::Arc<dyn crate::dns_proxy::DnsQueryEmitter>>,
    sni_proxy_activation: Option<SniProxyActivation>,
    sni_proxy_emitter: Option<std::sync::Arc<dyn crate::sni_proxy::L7DecisionEmitter>>,
    // T3.C / E7 — per-flow real-time listener context: (sink, run_id).
    per_flow_realtime: Option<(std::sync::Arc<dyn cellos_core::ports::EventSink>, String)>,
    // L5-15 — shared FlowAccumulator the nflog listener records into when
    // `CELLOS_PER_FLOW_REALTIME=1` is on. Cloned Arc; the supervisor keeps
    // its own clone to read `unique_flow_count()` at homeostasis emit time.
    flow_accumulator: Option<
        std::sync::Arc<std::sync::Mutex<crate::ebpf_flow::connection_tracking::FlowAccumulator>>,
    >,
) -> (
    i32,
    u64,
    Option<String>,
    Option<bool>,
    Option<String>,
    Option<String>,
    Vec<crate::nft_counters::NftCounterRow>,
) {
    let start = Instant::now();
    let run = run.clone();
    let cell_id = cell_id.to_string();
    let egress_rules = egress_rules.to_vec();
    let dns_authority_for_ns = dns_authority;
    let per_flow_realtime_for_closure = per_flow_realtime;
    let flow_accumulator_for_closure = flow_accumulator;
    let secrets: Vec<(String, String)> = secrets
        .iter()
        .map(|s| (s.key.clone(), s.value.as_str().to_string()))
        .collect();
    let runtime_secret_env = runtime_secret_session.map(|session| session.env_pairs().to_vec());
    let has_net = unshare_flags.is_some_and(|f| f & libc::CLONE_NEWNET != 0);
    // FC-38 Phase 1 — gated post-run nft counter scrape. Read once at the top
    // so the env-var check doesn't sit on the hot path inside the closure.
    // Default OFF: operators opt in by setting `CELLOS_PER_FLOW_ENFORCEMENT_EVENTS=1`.
    let per_flow_enforcement_enabled =
        std::env::var("CELLOS_PER_FLOW_ENFORCEMENT_EVENTS").as_deref() == Ok("1");
    // BlockingOk: (exit_code, nft_signal, dns_proxy_spawn_err, per_flow_rows,
    //              sni_proxy_spawn_err)
    type BlockingOk = (
        i32,
        Option<bool>,
        Option<String>,
        Vec<crate::nft_counters::NftCounterRow>,
        Option<String>,
    );
    type BlockingErr = (String, Option<bool>, Option<String>, Option<String>);
    let join = tokio::task::spawn_blocking(move || -> Result<BlockingOk, BlockingErr> {
        let mut nft_rules_signal: Option<bool> = if has_net { Some(false) } else { None };
        let mut supervisor_cleanup_leaf: Option<PathBuf> = None;
        // SEAM-1 Phase 2b: track any proxy spawn error so it surfaces as a
        // single `dns_query` upstream_failure event after the cell exits.
        let mut dns_proxy_spawn_err: Option<String> = None;
        // SEC-22 Phase 2: same shape, separate channel.
        let mut sni_proxy_spawn_err: Option<String> = None;

        let cgroup_leaf: Option<PathBuf> = match &attach {
            LinuxCgroupAttach::None => None,
            LinuxCgroupAttach::BackendLeaf(p) => Some(p.clone()),
            LinuxCgroupAttach::SupervisorEnvLeaf { parent } => {
                let leaf = parent.join(format!(
                    "cellos_{}_{}",
                    cellos_core::sanitize_cgroup_leaf_segment(&cell_id),
                    uuid::Uuid::new_v4()
                ));
                if let Err(e) = std::fs::create_dir(&leaf) {
                    return Err((
                        format!(
                            "CELLOS_CGROUP_PARENT: create cgroup dir {}: {e}",
                            leaf.display()
                        ),
                        nft_rules_signal,
                        dns_proxy_spawn_err,
                        sni_proxy_spawn_err,
                    ));
                }
                supervisor_cleanup_leaf = Some(leaf.clone());
                Some(leaf)
            }
        };

        if let Some(ref leaf) = cgroup_leaf {
            linux_cgroup_write_optional_controller_files(leaf, run.limits.as_ref());
        }

        let mut cmd = std::process::Command::new(&run.argv[0]);
        if run.argv.len() > 1 {
            cmd.args(&run.argv[1..]);
        }
        cmd.env_clear();
        cmd.env("PATH", "/usr/bin:/bin:/usr/local/bin");
        if let Some(runtime_secret_env) = &runtime_secret_env {
            for (key, value) in runtime_secret_env {
                cmd.env(key, value);
            }
        } else {
            for (key, value) in &secrets {
                cmd.env(key, value);
            }
        }
        if let Some(wd) = &run.working_directory {
            cmd.current_dir(wd);
        }

        // E1-04: Close all FDs > 2 (stdin/stdout/stderr) before exec so the child
        // process does not inherit supervisor file descriptors (sockets, NATS
        // connections, secret broker handles, etc.).  We iterate /proc/self/fd and
        // set FD_CLOEXEC on every descriptor with fd > 2.  Setting CLOEXEC rather
        // than calling close() directly avoids races with concurrent threads that may
        // have just opened a new FD; the kernel closes them atomically on execve.
        // SAFETY: runs in the forked child only; single-threaded at this point.
        unsafe {
            use std::os::unix::process::CommandExt;
            cmd.pre_exec(|| {
                if let Ok(dir) = std::fs::read_dir("/proc/self/fd") {
                    for entry in dir.flatten() {
                        if let Ok(name) = entry.file_name().into_string() {
                            if let Ok(fd) = name.parse::<i32>() {
                                if fd > 2 {
                                    // FD_CLOEXEC: closed on execve, not on fork.
                                    libc::fcntl(fd, libc::F_SETFD, libc::FD_CLOEXEC);
                                }
                            }
                        }
                    }
                }
                Ok(())
            });
        }

        // Seccomp runs after unshare/mount setup: baseline profile blocks `mount`/`unshare` syscalls.
        let needs_pre_exec = unshare_flags.is_some() || seccomp_program.is_some();
        if needs_pre_exec {
            let workspace_for_ns = run.working_directory.clone();
            let flags = unshare_flags;
            let seccomp_for_child = seccomp_program;
            // SAFETY: `pre_exec` runs in the forked child only; libc calls are the intended Linux APIs.
            unsafe {
                use std::os::unix::process::CommandExt;
                cmd.pre_exec(move || {
                    if let Some(f) = flags {
                        if libc::unshare(f) != 0 {
                            return Err(std::io::Error::last_os_error());
                        }
                        if (f & libc::CLONE_NEWNET) != 0 {
                            crate::linux_net::loopback_up_after_newnet().map_err(|e| {
                                std::io::Error::other(format!("loopback up in new netns: {e}"))
                            })?;
                        }
                        // Private tmpfs workspace (L2-03): when CLONE_NEWNS is active and a working
                        // directory is set, remount the tree MS_PRIVATE then mount a fresh tmpfs over
                        // the workspace. The cell gets a clean, empty writable dir; host sees nothing
                        // after the process exits.
                        if (f & libc::CLONE_NEWNS) != 0 {
                            if let Some(ref ws) = workspace_for_ns {
                                crate::linux_mount::make_private_workspace(ws).map_err(|e| {
                                    std::io::Error::other(format!(
                                        "private workspace mount at {ws}: {e}"
                                    ))
                                })?;
                            }
                        }
                    }
                    if let Some(ref prog) = seccomp_for_child {
                        crate::linux_seccomp::apply_seccomp_filter(prog)?;
                    }
                    Ok(())
                });
            }
        }

        let mut child = match cmd.spawn() {
            Ok(c) => c,
            Err(e) => {
                return Err((
                    format!("spec.run spawn failed: {e}"),
                    nft_rules_signal,
                    dns_proxy_spawn_err,
                    sni_proxy_spawn_err,
                ));
            }
        };

        let child_pid = child.id();

        // Apply nftables policy inside the child's network namespace (best-effort).
        // Primary isolation: CLONE_NEWNET (child has no routing); nft is a supplementary audit layer.
        //
        // FC-38 Phase 1: when per-flow enforcement events are enabled AND we
        // have a private netns AND nft applied successfully, open a long-lived
        // file handle on `/proc/<child_pid>/ns/net` so Linux pins the netns
        // alive even after `child.wait()` reaps the zombie. We then run
        // `nsenter --net=/proc/self/fd/<N> nft -j list ruleset` after wait to
        // scrape the final counter values. Without this fd, /proc/<pid>/ns/net
        // would disappear at reap and post-run scraping would be impossible.
        let mut per_flow_netns_fd: Option<std::fs::File> = None;
        // T3.C / E7 — when the per-flow eBPF/nflog gate is on, decide the
        // nflog group up-front so both the ruleset augmentation pass and
        // the listener thread agree on the same value. The activation
        // carries cell_id (already in scope here) and run_id (threaded in
        // through `per_flow_realtime_for_closure`). When the env var is
        // off, `build_activation_from_env` returns `None` and the
        // augmentation + spawn are both skipped — production behaviour is
        // byte-identical to the legacy ruleset path.
        let per_flow_run_id = per_flow_realtime_for_closure
            .as_ref()
            .map(|(_, rid)| rid.clone())
            .unwrap_or_default();
        let per_flow_realtime_activation =
            crate::per_flow::build_activation_from_env(&cell_id, &per_flow_run_id, None, None, None);
        if has_net {
            let mut ruleset = crate::network_policy::generate_nft_ruleset(
                &cell_id,
                &egress_rules,
                dns_authority_for_ns.as_ref(),
            );
            // T3.C / E7 — augment ruleset with `log group N prefix
            // "cellos-flow ..."` actions on every accept/drop verdict so
            // the in-netns listener can attribute real-time per-flow
            // events. Idempotent — safe to re-run on already-augmented
            // input. Only applied when the env gate is on, so the default
            // path is byte-identical to the legacy ruleset.
            if let Some(ref act) = per_flow_realtime_activation {
                ruleset =
                    crate::per_flow::augment_ruleset_with_log_actions(&ruleset, act.nflog_group);
            }
            let nft_enforcement_applied =
                crate::network_policy::apply_nft_in_ns(child_pid, &ruleset);
            nft_rules_signal = Some(nft_enforcement_applied);
            tracing::info!(
                target: "cellos.supervisor.linux_isolation",
                nft_enforcement_applied,
                egress_rules_count = egress_rules.len(),
                child_pid = child_pid,
                "nftables policy application complete"
            );
            if per_flow_enforcement_enabled && nft_enforcement_applied {
                let netns = format!("/proc/{child_pid}/ns/net");
                match std::fs::OpenOptions::new().read(true).open(&netns) {
                    Ok(f) => per_flow_netns_fd = Some(f),
                    Err(e) => {
                        tracing::warn!(
                            target: "cellos.supervisor.per_flow_enforcement",
                            error = %e,
                            child_pid = child_pid,
                            "open /proc/<pid>/ns/net failed — per-flow events skipped"
                        );
                    }
                }
            }
        }

        // SEAM-1 / L2-04 Phase 2b: spawn the in-netns DNS proxy here, after
        // `cmd.spawn()` returned (so /proc/<child_pid>/ns/net exists) and
        // before `child.wait()` (so the proxy listener is bound for the
        // entire workload lifetime). The activation predicate was evaluated
        // earlier in `Supervisor::run`; this slot only does the actual
        // setns + bind.
        let mut dns_proxy_handle: Option<crate::dns_proxy::spawn::DnsProxyHandle> = None;
        if let (Some(activation), Some(emitter)) =
            (dns_proxy_activation.as_ref(), dns_proxy_emitter.as_ref())
        {
            let shutdown = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
            match crate::dns_proxy::spawn::spawn_proxy_in_netns(
                child_pid,
                activation.cfg.clone(),
                activation.listen_addr,
                activation.upstream_addr,
                emitter.clone(),
                shutdown.clone(),
            ) {
                Ok(handle) => {
                    tracing::info!(
                        target: "cellos.supervisor.dns_proxy",
                        cell_id = %cell_id,
                        child_pid = child_pid,
                        listen = %handle.listen_addr,
                        "SEAM-1 DNS proxy spawned in cell netns"
                    );
                    dns_proxy_handle = Some(handle);
                }
                Err(e) => {
                    // Spawn failed (setns / bind / etc). The cell will run
                    // without proxy enforcement; the supervisor surfaces this
                    // as an `upstream_failure` `dns_query` event after wait.
                    // We do NOT kill the child — under SEC-22 the kernel-level
                    // direct-DNS block is still in place; the proxy is the
                    // L7 audit layer, not the only line of defence.
                    tracing::warn!(
                        target: "cellos.supervisor.dns_proxy",
                        cell_id = %cell_id,
                        child_pid = child_pid,
                        error = %e,
                        "SEAM-1 DNS proxy spawn failed — cell will run without proxy"
                    );
                    dns_proxy_spawn_err = Some(e.to_string());
                }
            }
        }

        // SEC-22 Phase 2: spawn the in-netns SNI proxy. Same structural slot
        // as the DNS proxy spawn — after `cmd.spawn()` so /proc/<child_pid>/ns/net
        // exists, before `child.wait()` so the listener is bound for the
        // entire workload lifetime. Independent failure path: a SNI proxy
        // spawn failure does NOT kill the cell or affect DNS proxy state;
        // it produces a single l7_egress_decision audit event.
        let mut sni_proxy_handle: Option<crate::sni_proxy::spawn::SniProxyHandle> = None;
        if let (Some(activation), Some(emitter)) =
            (sni_proxy_activation.as_ref(), sni_proxy_emitter.as_ref())
        {
            let shutdown = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
            match crate::sni_proxy::spawn::spawn_sni_proxy_in_netns(
                child_pid,
                activation.cfg.clone(),
                emitter.clone(),
                shutdown.clone(),
            ) {
                Ok(handle) => {
                    tracing::info!(
                        target: "cellos.supervisor.sni_proxy",
                        cell_id = %cell_id,
                        child_pid = child_pid,
                        listen = %handle.listen_addr,
                        "SEC-22 Phase 2 SNI proxy spawned in cell netns"
                    );
                    sni_proxy_handle = Some(handle);
                }
                Err(e) => {
                    tracing::warn!(
                        target: "cellos.supervisor.sni_proxy",
                        cell_id = %cell_id,
                        child_pid = child_pid,
                        error = %e,
                        "SEC-22 Phase 2 SNI proxy spawn failed — cell will run without L7 enforcement"
                    );
                    sni_proxy_spawn_err = Some(e.to_string());
                }
            }
        }

        // T3.C / E7 — spawn the in-netns per-flow nflog listener alongside
        // dns_proxy / sni_proxy. Mirrors their structural slot exactly:
        // setns + bind happen on a dedicated OS thread, the activation
        // predicate was resolved earlier in this fn (env gate + run_id +
        // sink presence), and shutdown is signalled at teardown.
        // We additionally require `has_net` (CLONE_NEWNET) so the listener
        // binds the netlink socket inside the cell's namespace rather than
        // the host's — without netns isolation the augmented ruleset
        // wouldn't have applied either.
        let mut per_flow_listener_handle: Option<crate::per_flow::PerFlowListenerHandle> = None;
        if has_net {
            if let (Some(activation), Some((sink, _rid))) = (
                per_flow_realtime_activation,
                per_flow_realtime_for_closure.as_ref(),
            ) {
                let shutdown = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
                match crate::per_flow::spawn_per_flow_listener_in_netns(
                    child_pid,
                    activation,
                    sink.clone(),
                    shutdown.clone(),
                    flow_accumulator_for_closure.clone(),
                ) {
                    Ok(handle) => {
                        tracing::info!(
                            target: "cellos.supervisor.per_flow",
                            cell_id = %cell_id,
                            child_pid = child_pid,
                            "T3.C / E7 per-flow listener spawned in cell netns"
                        );
                        per_flow_listener_handle = Some(handle);
                    }
                    Err(e) => {
                        tracing::warn!(
                            target: "cellos.supervisor.per_flow",
                            cell_id = %cell_id,
                            child_pid = child_pid,
                            error = %e,
                            "per-flow listener spawn failed — cell will run without realtime per-flow events"
                        );
                    }
                }
            }
        }

        if let Some(ref leaf) = cgroup_leaf {
            let procs = leaf.join("cgroup.procs");
            if let Err(e) = std::fs::write(&procs, format!("{child_pid}\n")) {
                let _ = child.kill();
                // Tear down any spawned listener / proxy thread before
                // bailing — the child will exit but threads would
                // otherwise outlive it.
                if let Some(mut handle) = per_flow_listener_handle.take() {
                    handle
                        .shutdown
                        .store(true, std::sync::atomic::Ordering::SeqCst);
                    let _ = handle.join();
                }
                if let Some(mut handle) = dns_proxy_handle.take() {
                    handle
                        .shutdown
                        .store(true, std::sync::atomic::Ordering::SeqCst);
                    crate::dns_proxy::spawn::signal_proxy_shutdown(handle.listen_addr);
                    let _ = handle.join();
                }
                if let Some(mut handle) = sni_proxy_handle.take() {
                    handle
                        .shutdown
                        .store(true, std::sync::atomic::Ordering::SeqCst);
                    crate::sni_proxy::spawn::signal_sni_proxy_shutdown(handle.listen_addr);
                    let _ = handle.join();
                }
                return Err((
                    format!("cgroup.procs {}: {e}", procs.display()),
                    nft_rules_signal,
                    dns_proxy_spawn_err,
                    sni_proxy_spawn_err,
                ));
            }
        }

        let status_result: Result<std::process::ExitStatus, String> = if let Some((
            timeout,
            timeout_source,
        )) = run_timeout
        {
            let deadline = Instant::now() + timeout;
            loop {
                match child.try_wait() {
                    Ok(Some(status)) => break Ok(status),
                    Ok(None) => {
                        if Instant::now() >= deadline {
                            let _ = child.kill();
                            let _ = child.wait();
                            // Tear down proxy / per-flow listener on
                            // timeout path so threads do not outlive the cell.
                            if let Some(mut handle) = per_flow_listener_handle.take() {
                                handle
                                    .shutdown
                                    .store(true, std::sync::atomic::Ordering::SeqCst);
                                let _ = handle.join();
                            }
                            if let Some(mut handle) = dns_proxy_handle.take() {
                                handle
                                    .shutdown
                                    .store(true, std::sync::atomic::Ordering::SeqCst);
                                crate::dns_proxy::spawn::signal_proxy_shutdown(handle.listen_addr);
                                let _ = handle.join();
                            }
                            if let Some(mut handle) = sni_proxy_handle.take() {
                                handle
                                    .shutdown
                                    .store(true, std::sync::atomic::Ordering::SeqCst);
                                crate::sni_proxy::spawn::signal_sni_proxy_shutdown(
                                    handle.listen_addr,
                                );
                                let _ = handle.join();
                            }
                            return Err((
                                run_timeout_message(timeout, timeout_source),
                                nft_rules_signal,
                                dns_proxy_spawn_err,
                                sni_proxy_spawn_err,
                            ));
                        }
                        std::thread::sleep(Duration::from_millis(10));
                    }
                    Err(e) => {
                        if let Some(mut handle) = per_flow_listener_handle.take() {
                            handle
                                .shutdown
                                .store(true, std::sync::atomic::Ordering::SeqCst);
                            let _ = handle.join();
                        }
                        if let Some(mut handle) = dns_proxy_handle.take() {
                            handle
                                .shutdown
                                .store(true, std::sync::atomic::Ordering::SeqCst);
                            crate::dns_proxy::spawn::signal_proxy_shutdown(handle.listen_addr);
                            let _ = handle.join();
                        }
                        if let Some(mut handle) = sni_proxy_handle.take() {
                            handle
                                .shutdown
                                .store(true, std::sync::atomic::Ordering::SeqCst);
                            crate::sni_proxy::spawn::signal_sni_proxy_shutdown(handle.listen_addr);
                            let _ = handle.join();
                        }
                        return Err((
                            format!("spec.run wait failed: {e}"),
                            nft_rules_signal,
                            dns_proxy_spawn_err,
                            sni_proxy_spawn_err,
                        ));
                    }
                }
            }
        } else {
            match child.wait() {
                Ok(s) => Ok(s),
                Err(e) => Err(format!("spec.run wait failed: {e}")),
            }
        };

        // T3.C / E7 — tear down the per-flow listener on the normal exit
        // path. The 100ms `SO_RCVTIMEO` bounds shutdown latency.
        if let Some(mut handle) = per_flow_listener_handle.take() {
            handle
                .shutdown
                .store(true, std::sync::atomic::Ordering::SeqCst);
            match handle.join() {
                Some(stats) => {
                    tracing::info!(
                        target: "cellos.supervisor.per_flow",
                        cell_id = %cell_id,
                        datagrams_total = stats.datagrams_total,
                        datagrams_matched = stats.datagrams_matched,
                        datagrams_decode_failed = stats.datagrams_decode_failed,
                        events_emitted = stats.events_emitted,
                        "T3.C / E7 per-flow listener thread joined"
                    );
                }
                None => {
                    tracing::debug!(
                        target: "cellos.supervisor.per_flow",
                        cell_id = %cell_id,
                        "per-flow listener thread join returned no stats"
                    );
                }
            }
        }

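        // SEAM-1 / L2-04 Phase 2b: now that the workload has exited, tear
        // down the DNS proxy thread cleanly. The shutdown flag + wake packet
        // collapse the recv loop's worst-case shutdown latency from
        // `LISTENER_READ_TIMEOUT` to ~milliseconds.
        // NOTE(review): the timeout and wait-error paths above also shut down
        // and join `sni_proxy_handle`; no matching SNI teardown is visible on
        // this exit path. Confirm it is released elsewhere (e.g. on drop).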
        if let Some(mut handle) = dns_proxy_handle.take() {
            handle
                .shutdown
                .store(true, std::sync::atomic::Ordering::SeqCst);
            crate::dns_proxy::spawn::signal_proxy_shutdown(handle.listen_addr);
            match handle.join() {
                Some(stats) => {
                    tracing::info!(
                        target: "cellos.supervisor.dns_proxy",
                        cell_id = %cell_id,
                        queries_total = stats.queries_total,
                        queries_allowed = stats.queries_allowed,
                        queries_denied = stats.queries_denied,
                        queries_malformed = stats.queries_malformed,
                        upstream_failures = stats.upstream_failures,
                        "SEAM-1 DNS proxy thread joined"
                    );
                }
                None => {
                    tracing::debug!(
                        target: "cellos.supervisor.dns_proxy",
                        cell_id = %cell_id,
                        "DNS proxy thread join returned no stats (already taken or panic)"
                    );
                }
            }
        }

        if let Some(dir) = supervisor_cleanup_leaf {
            if let Err(e) = std::fs::remove_dir(&dir) {
                tracing::debug!(
                    target: "cellos.supervisor.linux_isolation",
                    path = %dir.display(),
                    error = %e,
                    "cgroup leaf cleanup failed (non-fatal)"
                );
            }
        }

        let status = match status_result {
            Ok(s) => s,
            Err(msg) => {
                return Err((
                    msg,
                    nft_rules_signal,
                    dns_proxy_spawn_err,
                    sni_proxy_spawn_err,
                ));
            }
        };

        // FC-38 Phase 1: scrape the now-final counter state via `nft -j list
        // ruleset` through the held netns fd. The fd is dropped on closure
        // exit, releasing the namespace inode. Empty `Vec` when scraping is
        // disabled or fails — caller treats this as "no per-flow events".
        let per_flow_rows: Vec<crate::nft_counters::NftCounterRow> =
            if let Some(ref fd) = per_flow_netns_fd {
                use std::os::unix::io::AsRawFd;
                let netns_path = format!("/proc/self/fd/{}", fd.as_raw_fd());
                crate::network_policy::scan_nft_counters_in_ns(&netns_path)
            } else {
                Vec::new()
            };
        drop(per_flow_netns_fd);

        Ok((
            status.code().unwrap_or(-1),
            nft_rules_signal,
            dns_proxy_spawn_err,
            per_flow_rows,
            sni_proxy_spawn_err,
        ))
    })
    .await;

    let duration_ms = start.elapsed().as_millis() as u64;
    match join {
        Ok(Ok((code, nft, dns_err, rows, sni_err))) => {
            (code, duration_ms, None, nft, dns_err, sni_err, rows)
        }
        Ok(Err((msg, nft, dns_err, sni_err))) => (
            -1,
            duration_ms,
            Some(msg),
            nft,
            dns_err,
            sni_err,
            Vec::new(),
        ),
        Err(e) => (
            -1,
            duration_ms,
            Some(format!("spawn_blocking failed: {e}")),
            None,
            None,
            None,
            Vec::new(),
        ),
    }
}