zlayer-agent 0.11.1

Container runtime agent using libcontainer/youki
//! Youki/libcontainer runtime implementation
//!
//! Implements the Runtime trait using libcontainer (youki's container library)
//! for direct OCI container management without a daemon.
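//!
//! A minimal usage sketch (illustrative only: the re-export paths and the
//! construction of `ContainerId`/`ServiceSpec` are assumptions here and
//! depend on the wider crate):
//!
//! ```no_run
//! # use std::time::Duration;
//! # use zlayer_agent::runtime::{ContainerId, Runtime};
//! # use zlayer_agent::youki::YoukiRuntime;
//! # async fn example(id: ContainerId, spec: zlayer_spec::ServiceSpec) {
//! // Build the runtime with environment-derived defaults (see `YoukiConfig`).
//! let runtime = YoukiRuntime::with_defaults().await.expect("init runtime");
//!
//! // Pull the image, then create and start one replica.
//! runtime.pull_image(&spec.image.name).await.expect("pull");
//! runtime.create_container(&id, &spec).await.expect("create");
//! runtime.start_container(&id).await.expect("start");
//!
//! // Later: stop with a grace period, then remove.
//! runtime.stop_container(&id, Duration::from_secs(10)).await.expect("stop");
//! runtime.remove_container(&id).await.expect("remove");
//! # }
//! ```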

use crate::cgroups_stats::{self, ContainerStats};
use crate::error::{AgentError, Result};
use crate::runtime::{ContainerId, ContainerState, ImageInfo, PruneResult, Runtime};
use crate::storage_manager::StorageManager;
use libcontainer::container::builder::ContainerBuilder;
use libcontainer::container::{Container, ContainerStatus};
use libcontainer::signal::Signal;
use libcontainer::syscall::syscall::SyscallType;
use oci_client::manifest::OciImageManifest;
use std::collections::HashMap;
use std::os::fd::{FromRawFd, OwnedFd};
use std::path::PathBuf;
use std::time::Duration;
use tokio::fs;
use tokio::sync::RwLock;
use tracing::instrument;
use zlayer_observability::logs::{LogEntry, LogSource, LogStream};
use zlayer_spec::{RegistryAuth, ServiceSpec};

/// Configuration for `YoukiRuntime`
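///
/// Every directory defaults to the system layout from `zlayer_paths`, with a
/// per-field `ZLAYER_*` environment override (see the `Default` impl below).
/// A minimal sketch of overriding individual fields while keeping the rest of
/// the defaults (paths and the re-export path shown are illustrative):
///
/// ```no_run
/// # use std::path::PathBuf;
/// # use zlayer_agent::youki::YoukiConfig;
/// let config = YoukiConfig {
///     log_base_dir: Some(PathBuf::from("/var/log/zlayer")),
///     deployment_name: Some("prod".to_string()),
///     ..YoukiConfig::default()
/// };
/// ```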
#[derive(Debug, Clone)]
pub struct YoukiConfig {
    /// State directory for libcontainer container state
    pub state_dir: PathBuf,
    /// Directory for unpacked image rootfs
    pub rootfs_dir: PathBuf,
    /// Directory for OCI bundles
    pub bundle_dir: PathBuf,
    /// Cache directory for image blobs
    pub cache_dir: PathBuf,
    /// Directory for persistent volumes
    pub volume_dir: PathBuf,
    /// Use systemd cgroups
    pub use_systemd: bool,
    /// Cache type configuration (if None, determined from environment)
    pub cache_type: Option<zlayer_registry::CacheType>,
    /// Base directory for structured container logs.
    ///
    /// When set, container logs are written to
    /// `{log_base_dir}/{deployment_name}/{service}/{container_id}.stdout.log`
    /// (and the matching `.stderr.log`) instead of the bundle directory.
    /// The bundle `logs/` directory will
    /// contain symlinks back to the structured location so that existing
    /// code paths that read from the bundle still work.
    pub log_base_dir: Option<PathBuf>,
    /// Deployment name used in log directory hierarchy.
    ///
    /// Only meaningful when `log_base_dir` is also set.  Defaults to
    /// `"default"` if unset.
    pub deployment_name: Option<String>,
}

impl Default for YoukiConfig {
    fn default() -> Self {
        let dirs = zlayer_paths::ZLayerDirs::system_default();
        Self {
            state_dir: std::env::var("ZLAYER_STATE_DIR")
                .map_or_else(|_| dirs.containers(), PathBuf::from),
            rootfs_dir: std::env::var("ZLAYER_ROOTFS_DIR")
                .map_or_else(|_| dirs.rootfs(), PathBuf::from),
            bundle_dir: std::env::var("ZLAYER_BUNDLE_DIR")
                .map_or_else(|_| dirs.bundles(), PathBuf::from),
            cache_dir: std::env::var("ZLAYER_CACHE_DIR")
                .map_or_else(|_| dirs.cache(), PathBuf::from),
            volume_dir: std::env::var("ZLAYER_VOLUME_DIR")
                .map_or_else(|_| dirs.volumes(), PathBuf::from),
            use_systemd: std::env::var("ZLAYER_USE_SYSTEMD")
                .map(|v| v == "1" || v.to_lowercase() == "true")
                .unwrap_or(false),
            cache_type: None,
            log_base_dir: None,
            deployment_name: None,
        }
    }
}

/// Process-global mutex to serialize libcontainer operations.
/// libcontainer uses `chdir()` internally for notify socket operations
/// (to work around the 108-character Unix socket path limit), which affects the
/// entire process. Concurrent container operations race on the CWD.
/// This must be process-global, not per-runtime, since chdir affects all threads.
static LIBCONTAINER_LOCK: std::sync::LazyLock<std::sync::Mutex<()>> =
    std::sync::LazyLock::new(|| std::sync::Mutex::new(()));

/// Container tracking information
#[derive(Debug)]
struct ContainerInfo {
    /// Image reference
    #[allow(dead_code)]
    image: String,
    /// Bundle path
    #[allow(dead_code)]
    bundle_path: PathBuf,
    /// Rootfs path
    #[allow(dead_code)]
    rootfs_path: PathBuf,
    /// Stdout log file path
    stdout_path: PathBuf,
    /// Stderr log file path
    stderr_path: PathBuf,
    /// Process ID (once running)
    #[allow(dead_code)]
    pid: Option<u32>,
}

/// Youki/libcontainer-based container runtime
///
/// This runtime uses libcontainer directly to create and manage OCI containers
/// without requiring a daemon like containerd.
pub struct YoukiRuntime {
    /// Configuration
    config: YoukiConfig,
    /// Local container state tracking
    containers: RwLock<HashMap<String, ContainerInfo>>,
    /// Authentication resolver for registry pulls
    auth_resolver: zlayer_core::AuthResolver,
    /// Storage volume manager
    storage_manager: std::sync::Arc<tokio::sync::RwLock<StorageManager>>,
    /// Shared blob cache for image layers (avoids repeated opens and ensures cache persistence)
    blob_cache: std::sync::Arc<Box<dyn zlayer_registry::BlobCacheBackend>>,
    /// Local OCI registry for resolving locally-built images
    local_registry: Option<std::sync::Arc<zlayer_registry::LocalRegistry>>,
    /// Cached image configs (entrypoint, cmd, env, etc.) keyed by image reference
    image_configs: RwLock<HashMap<String, zlayer_registry::ImageConfig>>,
    /// Auth context for container-to-host API authentication.
    auth_context: Option<crate::runtime::ContainerAuthContext>,
}

impl std::fmt::Debug for YoukiRuntime {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("YoukiRuntime")
            .field("config", &self.config)
            .finish_non_exhaustive()
    }
}

impl YoukiRuntime {
    /// Create a new `YoukiRuntime` with the given configuration
    ///
    /// # Errors
    /// Returns an error if the required directories cannot be created.
    pub async fn new(
        config: YoukiConfig,
        auth_context: Option<crate::runtime::ContainerAuthContext>,
    ) -> Result<Self> {
        // Ensure directories exist
        for dir in [
            &config.state_dir,
            &config.rootfs_dir,
            &config.bundle_dir,
            &config.cache_dir,
        ] {
            fs::create_dir_all(dir)
                .await
                .map_err(|e| AgentError::CreateFailed {
                    id: "runtime".to_string(),
                    reason: format!("failed to create directory {}: {}", dir.display(), e),
                })?;
        }

        // Initialize storage manager
        let storage_manager =
            StorageManager::new(&config.volume_dir).map_err(|e| AgentError::CreateFailed {
                id: "runtime".to_string(),
                reason: format!("failed to create storage manager: {e}"),
            })?;

        // Initialize shared blob cache using CacheType configuration
        // If cache_type is provided, use it directly; otherwise use environment-based config
        // but override the path for Persistent variant to use config.cache_dir
        let blob_cache = if let Some(cache_type) = &config.cache_type {
            cache_type
                .build()
                .await
                .map_err(|e| AgentError::CreateFailed {
                    id: "runtime".to_string(),
                    reason: format!("failed to build blob cache: {e}"),
                })?
        } else {
            let cache_type =
                zlayer_registry::CacheType::from_env().map_err(|e| AgentError::CreateFailed {
                    id: "runtime".to_string(),
                    reason: format!("failed to read cache config from env: {e}"),
                })?;
            // Override persistent path to use config.cache_dir
            #[allow(clippy::match_wildcard_for_single_variants)]
            let cache_type = match cache_type {
                zlayer_registry::CacheType::Persistent { .. } => {
                    zlayer_registry::CacheType::persistent_at(config.cache_dir.join("blobs.redb"))
                }
                other => other,
            };
            cache_type
                .build()
                .await
                .map_err(|e| AgentError::CreateFailed {
                    id: "runtime".to_string(),
                    reason: format!("failed to build blob cache: {e}"),
                })?
        };

        let local_registry = {
            let registry_path = config.cache_dir.parent().map_or_else(
                || config.cache_dir.join("registry"),
                |data_dir| data_dir.join("registry"),
            );
            match zlayer_registry::LocalRegistry::new(registry_path).await {
                Ok(reg) => Some(std::sync::Arc::new(reg)),
                Err(e) => {
                    tracing::warn!("Failed to open local registry: {e}");
                    None
                }
            }
        };

        Ok(Self {
            config,
            containers: RwLock::new(HashMap::new()),
            auth_resolver: zlayer_core::AuthResolver::new(zlayer_core::AuthConfig::default()),
            storage_manager: std::sync::Arc::new(tokio::sync::RwLock::new(storage_manager)),
            blob_cache,
            local_registry,
            image_configs: RwLock::new(HashMap::new()),
            auth_context,
        })
    }

    /// Create a new `YoukiRuntime` with default configuration
    ///
    /// # Errors
    /// Returns an error if the runtime cannot be initialized.
    pub async fn with_defaults() -> Result<Self> {
        Self::new(YoukiConfig::default(), None).await
    }

    /// Create a new `YoukiRuntime` with custom auth configuration
    ///
    /// # Errors
    /// Returns an error if the runtime cannot be initialized.
    pub async fn with_auth(
        config: YoukiConfig,
        auth_config: zlayer_core::AuthConfig,
    ) -> Result<Self> {
        // Ensure directories exist
        for dir in [
            &config.state_dir,
            &config.rootfs_dir,
            &config.bundle_dir,
            &config.cache_dir,
        ] {
            fs::create_dir_all(dir)
                .await
                .map_err(|e| AgentError::CreateFailed {
                    id: "runtime".to_string(),
                    reason: format!("failed to create directory {}: {}", dir.display(), e),
                })?;
        }

        // Initialize storage manager
        let storage_manager =
            StorageManager::new(&config.volume_dir).map_err(|e| AgentError::CreateFailed {
                id: "runtime".to_string(),
                reason: format!("failed to create storage manager: {e}"),
            })?;

        // Initialize shared blob cache using CacheType configuration
        // If cache_type is provided, use it directly; otherwise use environment-based config
        // but override the path for Persistent variant to use config.cache_dir
        let blob_cache = if let Some(cache_type) = &config.cache_type {
            cache_type
                .build()
                .await
                .map_err(|e| AgentError::CreateFailed {
                    id: "runtime".to_string(),
                    reason: format!("failed to build blob cache: {e}"),
                })?
        } else {
            let cache_type =
                zlayer_registry::CacheType::from_env().map_err(|e| AgentError::CreateFailed {
                    id: "runtime".to_string(),
                    reason: format!("failed to read cache config from env: {e}"),
                })?;
            // Override persistent path to use config.cache_dir
            #[allow(clippy::match_wildcard_for_single_variants)]
            let cache_type = match cache_type {
                zlayer_registry::CacheType::Persistent { .. } => {
                    zlayer_registry::CacheType::persistent_at(config.cache_dir.join("blobs.redb"))
                }
                other => other,
            };
            cache_type
                .build()
                .await
                .map_err(|e| AgentError::CreateFailed {
                    id: "runtime".to_string(),
                    reason: format!("failed to build blob cache: {e}"),
                })?
        };

        let local_registry = {
            let registry_path = config.cache_dir.parent().map_or_else(
                || config.cache_dir.join("registry"),
                |data_dir| data_dir.join("registry"),
            );
            match zlayer_registry::LocalRegistry::new(registry_path).await {
                Ok(reg) => Some(std::sync::Arc::new(reg)),
                Err(e) => {
                    tracing::warn!("Failed to open local registry: {e}");
                    None
                }
            }
        };

        Ok(Self {
            config,
            containers: RwLock::new(HashMap::new()),
            auth_resolver: zlayer_core::AuthResolver::new(auth_config),
            storage_manager: std::sync::Arc::new(tokio::sync::RwLock::new(storage_manager)),
            blob_cache,
            local_registry,
            image_configs: RwLock::new(HashMap::new()),
            auth_context: None,
        })
    }

    /// Configure S3-backed volume sync on the internal storage manager.
    ///
    /// When set, named volumes will be automatically registered with the
    /// `LayerSyncManager` on creation and restored from S3 if a backup
    /// exists. Volumes are synced to S3 when containers are stopped via
    /// the `sync_container_volumes` trait method.
    #[cfg(feature = "s3")]
    pub async fn set_layer_sync(
        &self,
        sync: std::sync::Arc<zlayer_storage::sync::LayerSyncManager>,
        service_name: impl Into<String>,
    ) {
        let mut storage_manager = self.storage_manager.write().await;
        storage_manager.set_layer_sync(sync, service_name);
    }

    /// Get the container ID string
    #[allow(clippy::unused_self)]
    fn container_id_str(&self, id: &ContainerId) -> String {
        format!("{}-{}", id.service, id.replica)
    }

    /// Get the root path for a container's state
    fn container_root(&self, id: &ContainerId) -> PathBuf {
        self.config.state_dir.join(self.container_id_str(id))
    }

    /// Get the bundle path for a container
    fn bundle_path(&self, id: &ContainerId) -> PathBuf {
        self.config.bundle_dir.join(self.container_id_str(id))
    }

    /// Get log directory for a container.
    ///
    /// When `log_base_dir` is configured, returns a structured path:
    ///   `{log_base_dir}/{deployment}/{service}/`
    /// Otherwise falls back to the bundle directory:
    ///   `{bundle_dir}/{container_id}/logs/`
    fn log_dir(&self, id: &ContainerId) -> PathBuf {
        if let Some(ref base) = self.config.log_base_dir {
            let deployment = self.config.deployment_name.as_deref().unwrap_or("default");
            base.join(deployment).join(&id.service)
        } else {
            // Fall back to bundle directory to avoid conflicting with libcontainer's state directory
            self.bundle_path(id).join("logs")
        }
    }

    /// Get log file paths for a container.
    ///
    /// Returns `(stdout_path, stderr_path)`. When structured logging is
    /// enabled via `log_base_dir`, the files are named after the container
    /// ID (e.g. `myservice-rep-1.stdout.log`).
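    ///
    /// For example, with `log_base_dir = /var/log/zlayer`, deployment name
    /// `prod`, service `myservice`, and container ID `myservice-rep-1`, the
    /// returned paths are
    /// `/var/log/zlayer/prod/myservice/myservice-rep-1.stdout.log` and
    /// `/var/log/zlayer/prod/myservice/myservice-rep-1.stderr.log`
    /// (values illustrative).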
    fn log_paths(&self, id: &ContainerId) -> (PathBuf, PathBuf) {
        let log_dir = self.log_dir(id);
        if self.config.log_base_dir.is_some() {
            let container_id = self.container_id_str(id);
            (
                log_dir.join(format!("{container_id}.stdout.log")),
                log_dir.join(format!("{container_id}.stderr.log")),
            )
        } else {
            (log_dir.join("stdout.log"), log_dir.join("stderr.log"))
        }
    }

    /// Map libcontainer status to our `ContainerState`
    #[allow(clippy::unused_self)]
    fn map_status(&self, status: ContainerStatus) -> ContainerState {
        match status {
            ContainerStatus::Creating | ContainerStatus::Created => ContainerState::Pending,
            ContainerStatus::Running => ContainerState::Running,
            ContainerStatus::Stopped => ContainerState::Exited { code: 0 },
            ContainerStatus::Paused => ContainerState::Stopping,
        }
    }

    /// Create log files and return file descriptors for stdout/stderr.
    ///
    /// When `log_base_dir` is configured, creates the structured log
    /// directory (`/var/log/zlayer/{deployment}/{service}/`) and places
    /// symlinks in the bundle's `logs/` directory so that existing code
    /// reading from the bundle still works.
    #[allow(unsafe_code)]
    async fn create_log_files(
        &self,
        id: &ContainerId,
    ) -> Result<(PathBuf, PathBuf, OwnedFd, OwnedFd)> {
        let log_dir = self.log_dir(id);
        fs::create_dir_all(&log_dir)
            .await
            .map_err(|e| AgentError::CreateFailed {
                id: id.to_string(),
                reason: format!("failed to create log dir: {e}"),
            })?;

        let (stdout_path, stderr_path) = self.log_paths(id);

        // Create stdout file
        let stdout_file =
            std::fs::File::create(&stdout_path).map_err(|e| AgentError::CreateFailed {
                id: id.to_string(),
                reason: format!("failed to create stdout log: {e}"),
            })?;

        // Create stderr file
        let stderr_file =
            std::fs::File::create(&stderr_path).map_err(|e| AgentError::CreateFailed {
                id: id.to_string(),
                reason: format!("failed to create stderr log: {e}"),
            })?;

        // When using structured logging, also create symlinks from the bundle
        // logs directory so that code reading from bundle_path/logs/ still works.
        if self.config.log_base_dir.is_some() {
            let bundle_log_dir = self.bundle_path(id).join("logs");
            let _ = fs::create_dir_all(&bundle_log_dir).await;

            // Symlink bundle_log_dir/stdout.log -> structured stdout_path
            let bundle_stdout = bundle_log_dir.join("stdout.log");
            let _ = fs::remove_file(&bundle_stdout).await;
            if let Err(e) = tokio::fs::symlink(&stdout_path, &bundle_stdout).await {
                tracing::warn!(
                    container = %id,
                    error = %e,
                    "Failed to create stdout symlink in bundle logs dir"
                );
            }

            // Symlink bundle_log_dir/stderr.log -> structured stderr_path
            let bundle_stderr = bundle_log_dir.join("stderr.log");
            let _ = fs::remove_file(&bundle_stderr).await;
            if let Err(e) = tokio::fs::symlink(&stderr_path, &bundle_stderr).await {
                tracing::warn!(
                    container = %id,
                    error = %e,
                    "Failed to create stderr symlink in bundle logs dir"
                );
            }
        }

        // Convert to OwnedFd
        // SAFETY: `into_raw_fd()` transfers ownership of the fd, and
        // `OwnedFd::from_raw_fd` takes ownership. No double-close.
        let stdout_fd = unsafe {
            use std::os::unix::io::IntoRawFd;
            OwnedFd::from_raw_fd(stdout_file.into_raw_fd())
        };
        let stderr_fd = unsafe {
            use std::os::unix::io::IntoRawFd;
            OwnedFd::from_raw_fd(stderr_file.into_raw_fd())
        };

        Ok((stdout_path, stderr_path, stdout_fd, stderr_fd))
    }

    /// Clean up bundle directory for a container
    async fn cleanup_bundle(&self, id: &ContainerId) -> Result<()> {
        let bundle_path = self.bundle_path(id);
        if bundle_path.exists() {
            fs::remove_dir_all(&bundle_path)
                .await
                .map_err(|e| AgentError::CreateFailed {
                    id: id.to_string(),
                    reason: format!("failed to remove bundle: {e}"),
                })?;
        }
        Ok(())
    }

    /// Pull image layers and return them for extraction
    ///
    /// Uses the shared blob cache to avoid repeated network requests for cached layers.
    /// The `policy` parameter is translated to a `force_refresh` flag: `PullPolicy::Always`
    /// clears the manifest cache before fetching, while `IfNotPresent` and `Never` reuse
    /// any cached manifest (the puller still revalidates mutable tags via HEAD for
    /// `IfNotPresent`, and serves purely from cache for `Never`).
    async fn pull_image_layers(
        &self,
        image: &str,
        policy: zlayer_spec::PullPolicy,
    ) -> Result<Vec<(Vec<u8>, String)>> {
        // Use the shared blob cache instead of opening a new one each time
        let puller = {
            let p = zlayer_registry::ImagePuller::with_cache(self.blob_cache.clone());
            if let Some(ref registry) = self.local_registry {
                p.with_local_registry(registry.clone())
            } else {
                p
            }
        };
        let auth = self.auth_resolver.resolve(image);
        let force_refresh = matches!(policy, zlayer_spec::PullPolicy::Always);

        puller
            .pull_image_with_policy(image, &auth, force_refresh)
            .await
            .map_err(|e| AgentError::PullFailed {
                image: image.to_string(),
                reason: format!("failed to pull image: {e}"),
            })
    }

    /// Prepare storage volumes for a container, returning paths for mounts
    async fn prepare_storage_volumes(
        &self,
        id: &ContainerId,
        spec: &ServiceSpec,
    ) -> Result<std::collections::HashMap<String, PathBuf>> {
        use zlayer_spec::StorageSpec;

        let mut volume_paths = std::collections::HashMap::new();
        let container_id = self.container_id_str(id);

        let mut storage_manager = self.storage_manager.write().await;

        for storage in &spec.storage {
            match storage {
                StorageSpec::Named { name, .. } => {
                    let path = storage_manager
                        .ensure_volume_with_sync(name)
                        .await
                        .map_err(|e| AgentError::CreateFailed {
                            id: container_id.clone(),
                            reason: format!("failed to ensure volume '{name}': {e}"),
                        })?;
                    storage_manager
                        .attach_volume(name, &container_id)
                        .map_err(|e| AgentError::CreateFailed {
                            id: container_id.clone(),
                            reason: format!("failed to attach volume '{name}': {e}"),
                        })?;
                    volume_paths.insert(name.clone(), path);
                }

                StorageSpec::Anonymous { target, .. } => {
                    let path = storage_manager
                        .create_anonymous(&container_id, target)
                        .map_err(|e| AgentError::CreateFailed {
                            id: container_id.clone(),
                            reason: format!(
                                "failed to create anonymous volume for '{target}': {e}"
                            ),
                        })?;
                    let key = format!("_anon_{}", target.trim_start_matches('/').replace('/', "_"));
                    volume_paths.insert(key, path);
                }

                // Bind and tmpfs mounts don't need preparation
                StorageSpec::Bind { .. } | StorageSpec::Tmpfs { .. } => {}

                StorageSpec::S3 {
                    bucket,
                    prefix,
                    endpoint,
                    ..
                } => {
                    let path = storage_manager
                        .mount_s3(
                            bucket,
                            prefix.as_deref(),
                            endpoint.as_deref(),
                            &container_id,
                        )
                        .map_err(|e| AgentError::CreateFailed {
                            id: container_id.clone(),
                            reason: format!("failed to mount S3 bucket '{bucket}': {e}"),
                        })?;
                    let key = format!("_s3_{}_{}", bucket, prefix.as_deref().unwrap_or(""));
                    volume_paths.insert(key, path);
                }
            }
        }

        Ok(volume_paths)
    }

    /// Clean up storage volumes for a container
    ///
    /// Note: This method requires the `ServiceSpec` to know which volumes to clean up.
    /// For now, `remove_container` uses a simpler approach that only cleans up anonymous volumes.
    /// This method is available for future use when the spec is stored/available at removal time.
    #[allow(dead_code)]
    async fn cleanup_storage_volumes(&self, id: &ContainerId, spec: &ServiceSpec) -> Result<()> {
        use zlayer_spec::StorageSpec;

        let container_id = self.container_id_str(id);
        let mut storage_manager = self.storage_manager.write().await;

        // Detach named volumes
        for storage in &spec.storage {
            match storage {
                StorageSpec::Named { name, .. } => {
                    if let Err(e) = storage_manager.detach_volume(name, &container_id) {
                        tracing::warn!(
                            volume = %name,
                            container = %container_id,
                            error = %e,
                            "failed to detach volume"
                        );
                    }
                }
                StorageSpec::S3 { bucket, prefix, .. } => {
                    if let Err(e) =
                        storage_manager.unmount_s3(bucket, prefix.as_deref(), &container_id)
                    {
                        tracing::warn!(
                            bucket = %bucket,
                            container = %container_id,
                            error = %e,
                            "failed to unmount S3 bucket"
                        );
                    }
                }
                _ => {}
            }
        }

        // Clean up anonymous volumes
        if let Err(e) = storage_manager.cleanup_anonymous(&container_id) {
            tracing::warn!(
                container = %container_id,
                error = %e,
                "failed to cleanup anonymous volumes"
            );
        }

        Ok(())
    }

    /// Get a cached image config by image reference
    ///
    /// Returns the previously pulled image configuration (entrypoint, cmd, env, etc.)
    /// for the given image reference, if available.
    async fn get_image_config(&self, image: &str) -> Option<zlayer_registry::ImageConfig> {
        let configs = self.image_configs.read().await;
        configs.get(image).cloned()
    }
}

#[async_trait::async_trait]
impl Runtime for YoukiRuntime {
    /// Pull an image to local storage
    ///
    /// Downloads image layers from a registry and unpacks them to a rootfs.
    #[instrument(
        skip(self),
        fields(
            otel.name = "image.pull",
            container.image.name = %image,
        )
    )]
    async fn pull_image(&self, image: &str) -> Result<()> {
        self.pull_image_with_policy(image, zlayer_spec::PullPolicy::IfNotPresent, None)
            .await
    }

    /// Pull an image to local storage with a specific pull policy
    ///
    /// This downloads image layers to the blob cache. Layers are extracted
    /// per-container in `create_container` to avoid race conditions.
    ///
    /// The `_auth` parameter is accepted for trait conformance (§3.10) but
    /// currently ignored: `zlayer-registry` resolves credentials through the
    /// existing `AuthResolver` (hostname lookup in the persistent secret
    /// store). Callers that need inline auth should use the Docker runtime.
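    ///
    /// A hedged usage sketch (image reference and re-export paths are
    /// illustrative):
    ///
    /// ```no_run
    /// # use zlayer_agent::{runtime::Runtime, youki::YoukiRuntime};
    /// # async fn example(runtime: &YoukiRuntime) {
    /// // Force a registry round-trip even if the manifest is cached.
    /// runtime
    ///     .pull_image_with_policy("ghcr.io/acme/app:latest", zlayer_spec::PullPolicy::Always, None)
    ///     .await
    ///     .expect("pull");
    /// # }
    /// ```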
    #[instrument(
        skip(self, _auth),
        fields(
            otel.name = "image.pull",
            container.image.name = %image,
            pull_policy = ?policy,
        )
    )]
    async fn pull_image_with_policy(
        &self,
        image: &str,
        policy: zlayer_spec::PullPolicy,
        _auth: Option<&RegistryAuth>,
    ) -> Result<()> {
        let puller = {
            let p = zlayer_registry::ImagePuller::with_cache(self.blob_cache.clone());
            if let Some(ref registry) = self.local_registry {
                p.with_local_registry(registry.clone())
            } else {
                p
            }
        };
        let auth = self.auth_resolver.resolve(image);

        // For Never policy, skip pulling layers from the remote, but STILL
        // fetch the image config from the local blob cache (populated by a
        // prior `zlayer import` or an earlier pull). Without the image
        // config, the bundle builder has no way to know the image's
        // entrypoint/cmd/env/workdir/user and falls back to `/bin/sh`,
        // which exits immediately and kills the container. Fetching the
        // config from cache is cheap (~1 KB) and non-fatal on miss.
        if matches!(policy, zlayer_spec::PullPolicy::Never) {
            tracing::debug!(image = %image, "pull policy is Never, skipping layer pull");
            match puller.pull_image_config(image, &auth).await {
                Ok(config) => {
                    tracing::info!(
                        image = %image,
                        has_entrypoint = config.entrypoint.is_some(),
                        has_cmd = config.cmd.is_some(),
                        "image config loaded from cache"
                    );
                    let mut configs = self.image_configs.write().await;
                    configs.insert(image.to_string(), config);
                }
                Err(e) => {
                    tracing::warn!(
                        image = %image,
                        error = %e,
                        "failed to load image config from cache under pull_policy=Never, container will use spec defaults"
                    );
                }
            }
            return Ok(());
        }

        // For IfNotPresent, check if image layers are in cache by trying to pull
        // Use the shared blob cache to avoid repeated opens and ensure persistence
        // For Always, force a round-trip to the registry by clearing the manifest cache.
        let force_refresh = matches!(policy, zlayer_spec::PullPolicy::Always);
        tracing::info!(image = %image, force_refresh, "pulling image layers to cache");

        // Pull image layers from registry (cached layers are retrieved from cache)
        let layers = puller
            .pull_image_with_policy(image, &auth, force_refresh)
            .await
            .map_err(|e| AgentError::PullFailed {
                image: image.to_string(),
                reason: format!("failed to pull image: {e}"),
            })?;

        tracing::info!(
            image = %image,
            layer_count = layers.len(),
            "image layers cached"
        );

        // Also pull and cache the image config (entrypoint, cmd, env, etc.)
        match puller
            .pull_image_config_with_policy(image, &auth, force_refresh)
            .await
        {
            Ok(config) => {
                tracing::info!(
                    image = %image,
                    has_entrypoint = config.entrypoint.is_some(),
                    has_cmd = config.cmd.is_some(),
                    "image config cached"
                );
                let mut configs = self.image_configs.write().await;
                configs.insert(image.to_string(), config);
            }
            Err(e) => {
                // Log but don't fail - the container can still run with spec defaults
                tracing::warn!(
                    image = %image,
                    error = %e,
                    "failed to pull image config, container will use spec defaults"
                );
            }
        }

        Ok(())
    }

    /// Create a container
    ///
    /// Creates an OCI bundle and uses libcontainer to create the container.
    /// Each container gets its own rootfs extracted from cached layers.
    #[instrument(
        skip(self, spec),
        fields(
            otel.name = "container.create",
            container.id = %self.container_id_str(id),
            service.name = %id.service,
            service.replica = %id.replica,
            container.image.name = %spec.image.name,
        )
    )]
    async fn create_container(&self, id: &ContainerId, spec: &ServiceSpec) -> Result<()> {
        let container_id = self.container_id_str(id);
        let image = &spec.image.name;

        // Clean up any stale container from a previous deploy (mirrors Docker runtime behavior)
        // See docker.rs:370-402 for the equivalent pattern
        let container_root = self.container_root(id);
        if container_root.exists() {
            tracing::warn!(
                container = %container_id,
                "stale container state found from previous deploy, cleaning up before re-create"
            );
            // Try to stop — process may already be dead, ignore errors
            if let Err(e) = self.stop_container(id, Duration::from_secs(5)).await {
                tracing::debug!(
                    container = %container_id,
                    error = %e,
                    "stop_container during stale cleanup (expected if process already dead)"
                );
            }
            // Remove container — does full cleanup: libcontainer delete + state dir + bundle + volumes
            if let Err(e) = self.remove_container(id).await {
                tracing::warn!(
                    container = %container_id,
                    error = %e,
                    "remove_container failed during stale cleanup, attempting manual cleanup"
                );
                // Fall back to manual cleanup if remove_container fails
                if container_root.exists() {
                    let _ = tokio::fs::remove_dir_all(&container_root).await;
                }
                let stale_bundle = self.bundle_path(id);
                if stale_bundle.exists() {
                    let _ = tokio::fs::remove_dir_all(&stale_bundle).await;
                }
            }
            tracing::info!(container = %container_id, "stale container cleaned up");
        }

        // Also clean up stale bundle directory if it exists but state dir didn't
        let bundle_path = self.bundle_path(id);
        if bundle_path.exists() {
            tracing::warn!(container = %container_id, "stale bundle directory found, removing");
            let _ = tokio::fs::remove_dir_all(&bundle_path).await;
        }

        let bundle_path = self.bundle_path(id);
        let rootfs_path = bundle_path.join("rootfs");

        tracing::info!("Creating container {} from image {}", container_id, image);

        // Create bundle directory structure
        fs::create_dir_all(&bundle_path)
            .await
            .map_err(|e| AgentError::CreateFailed {
                id: container_id.clone(),
                reason: format!("failed to create bundle directory: {e}"),
            })?;

        // Pull image layers (from cache if available). Honor the spec's pull policy so
        // that `Always` forces a manifest refresh, `IfNotPresent` serves from cache with
        // mutable-tag revalidation, and `Never` reuses cached blobs.
        let layers = self
            .pull_image_layers(image, spec.image.pull_policy)
            .await?;

        tracing::debug!(
            container = %container_id,
            layer_count = layers.len(),
            "extracting layers to container rootfs"
        );

        // Extract layers to this container's own rootfs
        let mut unpacker = zlayer_registry::LayerUnpacker::new(rootfs_path.clone());
        unpacker
            .unpack_layers(&layers)
            .await
            .map_err(|e| AgentError::CreateFailed {
                id: container_id.clone(),
                reason: format!("failed to extract rootfs: {e}"),
            })?;

        // Log rootfs diagnostics for debugging container creation failures
        match std::fs::read_dir(&rootfs_path) {
            Ok(entries) => {
                let count = entries.count();
                tracing::info!(
                    container = %container_id,
                    rootfs = %rootfs_path.display(),
                    entry_count = count,
                    "rootfs extracted successfully"
                );
            }
            Err(e) => {
                tracing::warn!(
                    container = %container_id,
                    rootfs = %rootfs_path.display(),
                    error = %e,
                    "rootfs directory not readable after extraction"
                );
            }
        }

        // Get cached image config (entrypoint, cmd, env, workdir, user)
        let img_config = self.get_image_config(image).await;

        // Prepare storage volumes
        let volume_paths = self.prepare_storage_volumes(id, spec).await?;

        // Generate OCI config.json via BundleBuilder (handles capabilities, devices,
        // resource limits, storage mounts, env resolution, and command resolution)
        let mut bundle_builder = crate::bundle::BundleBuilder::new(bundle_path.clone())
            .with_volume_paths(volume_paths)
            .with_host_network(spec.host_network);
        if let Some(config) = img_config {
            bundle_builder = bundle_builder.with_image_config(config);
        }

        // Inject auth env vars and socket mount so the container can talk to the host API
        if let Some(ref auth_ctx) = self.auth_context {
            let token = crate::auth::mint_container_token(
                &auth_ctx.jwt_secret,
                &id.service,
                &format!("{}-{}", id.service, id.replica),
                std::time::Duration::from_secs(86400 * 365),
            )
            .map_err(|e| crate::error::AgentError::CreateFailed {
                id: id.to_string(),
                reason: format!("Failed to mint container token: {e}"),
            })?;
            bundle_builder = bundle_builder
                .with_env("ZLAYER_API_URL".to_string(), auth_ctx.api_url.clone())
                .with_env("ZLAYER_TOKEN".to_string(), token)
                .with_env(
                    "ZLAYER_SOCKET".to_string(),
                    zlayer_paths::ZLayerDirs::default_socket_path(),
                )
                .with_socket_mount(&auth_ctx.socket_path);
        }

        bundle_builder.write_config(id, spec).await?;

        // Create log files
        let (stdout_path, stderr_path, stdout_fd, stderr_fd) = self.create_log_files(id).await?;

        // Use spawn_blocking for the synchronous libcontainer operations
        let config = self.config.clone();
        let container_id_clone = container_id.clone();
        let bundle_path_clone = bundle_path.clone();
        // Use state_dir as the root path - libcontainer appends container_id internally
        let state_dir_clone = self.config.state_dir.clone();
        let _container = tokio::task::spawn_blocking(move || {
            // Acquire process-global lock to serialize libcontainer operations.
            // libcontainer uses chdir() internally which affects the entire process,
            // so concurrent operations would race on the working directory.
            let _guard = LIBCONTAINER_LOCK
                .lock()
                .map_err(|e| AgentError::CreateFailed {
                    id: container_id_clone.clone(),
                    reason: format!("failed to acquire libcontainer lock: {e}"),
                })?;

            // Create container using libcontainer
            // Set stdout/stderr on ContainerBuilder BEFORE calling as_init()
            let container_builder =
                ContainerBuilder::new(container_id_clone.clone(), SyscallType::Linux)
                    .with_stdout(stdout_fd)
                    .with_stderr(stderr_fd);

            // Set container root path (base dir - libcontainer creates <root>/<container_id>)
            let container_builder =
                container_builder
                    .with_root_path(&state_dir_clone)
                    .map_err(|e| AgentError::CreateFailed {
                        id: container_id_clone.clone(),
                        reason: format!("failed to set root path: {e}"),
                    })?;

            // Configure as init container (creates new namespaces)
            let init_builder = container_builder
                .as_init(&bundle_path_clone)
                .with_systemd(config.use_systemd)
                .with_detach(true); // Run detached

            // Build the container (creates it but doesn't start)
            let container = init_builder.build().map_err(|e| AgentError::CreateFailed {
                id: container_id_clone.clone(),
                reason: format!("failed to create container: {e:?}"),
            })?;

            Ok::<Container, AgentError>(container)
        })
        .await
        .map_err(|e| AgentError::CreateFailed {
            id: container_id.clone(),
            reason: format!("task join error: {e}"),
        })??;

        // Store container info
        {
            let mut containers = self.containers.write().await;
            containers.insert(
                container_id.clone(),
                ContainerInfo {
                    image: image.clone(),
                    bundle_path,
                    rootfs_path,
                    stdout_path,
                    stderr_path,
                    pid: None,
                },
            );
        }

        tracing::info!("Container {} created successfully", container_id);
        Ok(())
    }

    /// Start a container
    ///
    /// Starts the container's init process.
    #[instrument(
        skip(self),
        fields(
            otel.name = "container.start",
            container.id = %self.container_id_str(id),
            service.name = %id.service,
        )
    )]
    async fn start_container(&self, id: &ContainerId) -> Result<()> {
        let container_id = self.container_id_str(id);
        let container_root = self.container_root(id);

        tracing::info!("Starting container {}", container_id);

        // Load and start the container using spawn_blocking
        let pid = tokio::task::spawn_blocking(move || {
            // Acquire process-global lock to serialize libcontainer operations.
            // libcontainer uses chdir() internally which affects the entire process,
            // so concurrent operations would race on the working directory.
            let _guard = LIBCONTAINER_LOCK
                .lock()
                .map_err(|e| AgentError::StartFailed {
                    id: container_id.clone(),
                    reason: format!("failed to acquire libcontainer lock: {e}"),
                })?;

            let mut container =
                Container::load(container_root).map_err(|e| AgentError::StartFailed {
                    id: container_id.clone(),
                    reason: format!("failed to load container: {e}"),
                })?;

            // Start the container
            container.start().map_err(|e| AgentError::StartFailed {
                id: container_id.clone(),
                reason: format!("failed to start container: {e}"),
            })?;

            // Get the PID after starting - access through state
            #[allow(clippy::cast_sign_loss)]
            let pid = container.pid().map(|p| p.as_raw() as u32);

            Ok::<Option<u32>, AgentError>(pid)
        })
        .await
        .map_err(|e| AgentError::StartFailed {
            id: id.to_string(),
            reason: format!("task join error: {e}"),
        })??;

        // Update container info with PID
        {
            let mut containers = self.containers.write().await;
            if let Some(info) = containers.get_mut(&self.container_id_str(id)) {
                info.pid = pid;
            }
        }

        tracing::info!(
            "Container {} started with PID {:?}",
            self.container_id_str(id),
            pid
        );
        Ok(())
    }

    /// Stop a container
    ///
    /// Sends SIGTERM, waits for timeout, then sends SIGKILL if needed.
    #[instrument(
        skip(self),
        fields(
            otel.name = "container.stop",
            container.id = %self.container_id_str(id),
            service.name = %id.service,
            timeout_ms = %timeout.as_millis(),
        )
    )]
    async fn stop_container(&self, id: &ContainerId, timeout: Duration) -> Result<()> {
        let container_id = self.container_id_str(id);
        let container_root = self.container_root(id);

        tracing::info!(
            "Stopping container {} with {:?} timeout",
            container_id,
            timeout
        );

        // Send SIGTERM first
        let container_root_clone = container_root.clone();
        let container_id_clone = container_id.clone();

        tokio::task::spawn_blocking(move || {
            let mut container =
                Container::load(container_root_clone).map_err(|e| AgentError::NotFound {
                    container: container_id_clone.clone(),
                    reason: format!("failed to load container: {e}"),
                })?;

            // Check if container can be killed
            if container.status().can_kill() {
                // Send SIGTERM
                use std::convert::TryFrom;
                let signal = Signal::try_from("SIGTERM").map_err(|e| AgentError::NotFound {
                    container: container_id_clone.clone(),
                    reason: format!("invalid signal: {e:?}"),
                })?;

                if let Err(e) = container.kill(signal, true) {
                    tracing::debug!("SIGTERM failed (container may already be stopped): {}", e);
                }
            }

            Ok::<(), AgentError>(())
        })
        .await
        .map_err(|e| AgentError::NotFound {
            container: container_id.clone(),
            reason: format!("task join error: {e}"),
        })??;

        // Wait for container to stop
        let start = std::time::Instant::now();
        loop {
            if start.elapsed() > timeout {
                break;
            }

            // Check container state
            let state = self.container_state(id).await?;
            if matches!(
                state,
                ContainerState::Exited { .. } | ContainerState::Failed { .. }
            ) {
                tracing::info!("Container {} stopped gracefully", container_id);
                return Ok(());
            }

            tokio::time::sleep(Duration::from_millis(100)).await;
        }

        // Timeout exceeded - send SIGKILL
        tracing::debug!(
            "Container {} did not stop gracefully, sending SIGKILL",
            container_id
        );

        let container_root_clone = container_root.clone();
        let container_id_clone = container_id.clone();

        tokio::task::spawn_blocking(move || {
            let mut container =
                Container::load(container_root_clone).map_err(|e| AgentError::NotFound {
                    container: container_id_clone.clone(),
                    reason: format!("failed to load container: {e}"),
                })?;

            if container.status().can_kill() {
                use std::convert::TryFrom;
                let signal = Signal::try_from("SIGKILL").map_err(|e| AgentError::NotFound {
                    container: container_id_clone.clone(),
                    reason: format!("invalid signal: {e:?}"),
                })?;

                if let Err(e) = container.kill(signal, true) {
                    tracing::warn!("SIGKILL failed: {}", e);
                }
            }

            Ok::<(), AgentError>(())
        })
        .await
        .map_err(|e| AgentError::NotFound {
            container: container_id.clone(),
            reason: format!("task join error: {e}"),
        })??;

        tracing::info!("Container {} killed", container_id);
        Ok(())
    }

    /// Remove a container
    ///
    /// Deletes the container and cleans up its bundle and state.
    /// Cleanup always proceeds even if libcontainer operations fail.
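    ///
    /// A minimal usage sketch (hypothetical `runtime` value; the stop-then-remove
    /// ordering is illustrative, not required by this method):
    ///
    /// ```ignore
    /// let id = ContainerId { service: "web".to_string(), replica: 0 };
    /// runtime.stop_container(&id, Duration::from_secs(10)).await?;
    /// // Bundle, state directory, and anonymous volumes are cleaned up even
    /// // if the libcontainer delete itself fails.
    /// runtime.remove_container(&id).await?;
    /// ```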
    #[instrument(
        skip(self),
        fields(
            otel.name = "container.remove",
            container.id = %self.container_id_str(id),
            service.name = %id.service,
        )
    )]
    async fn remove_container(&self, id: &ContainerId) -> Result<()> {
        let container_id = self.container_id_str(id);
        let container_root = self.container_root(id);

        tracing::info!("Removing container {}", container_id);

        // Attempt libcontainer delete, but don't fail if container not found
        let container_id_clone = container_id.clone();
        let container_root_clone = container_root.clone();

        let libcontainer_result = tokio::task::spawn_blocking(move || {
            match Container::load(container_root_clone) {
                Ok(mut container) => {
                    // Delete with force=true to handle any state
                    if let Err(e) = container.delete(true) {
                        tracing::warn!(
                            "libcontainer delete failed for {}: {}",
                            container_id_clone,
                            e
                        );
                    }
                }
                Err(e) => {
                    // Container may already be gone or state is in unexpected location
                    tracing::warn!(
                        "Container::load failed for {} (may already be removed): {}",
                        container_id_clone,
                        e
                    );
                }
            }
        })
        .await;

        if let Err(e) = libcontainer_result {
            tracing::warn!("spawn_blocking failed during remove: {}", e);
        }

        // ALWAYS clean up bundle regardless of libcontainer result
        if let Err(e) = self.cleanup_bundle(id).await {
            tracing::warn!("Failed to cleanup bundle for {}: {}", container_id, e);
        }

        // ALWAYS clean up state directory regardless of libcontainer result
        let state_dir = self.container_root(id);
        if state_dir.exists() {
            if let Err(e) = fs::remove_dir_all(&state_dir).await {
                tracing::warn!("Failed to remove state dir {}: {}", state_dir.display(), e);
            }
        }

        // Clean up storage volumes
        // Note: the spec would tell us exactly which volumes to clean up, but it
        // isn't available here. For now, only anonymous volumes keyed by the
        // container ID are removed.
        {
            let mut storage_manager = self.storage_manager.write().await;
            if let Err(e) = storage_manager.cleanup_anonymous(&container_id) {
                tracing::warn!(
                    container = %container_id,
                    error = %e,
                    "failed to cleanup anonymous volumes"
                );
            }
        }

        // Remove from local tracking
        {
            let mut containers = self.containers.write().await;
            containers.remove(&container_id);
        }

        tracing::info!("Container {} removed", container_id);
        Ok(())
    }

    /// Get container state
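    ///
    /// A minimal usage sketch (hypothetical `runtime` value; only a few
    /// `ContainerState` variants are matched here):
    ///
    /// ```ignore
    /// match runtime.container_state(&id).await? {
    ///     ContainerState::Exited { code } => println!("exited with code {code}"),
    ///     ContainerState::Failed { reason } => println!("failed: {reason}"),
    ///     _ => println!("still in progress"),
    /// }
    /// ```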
    #[instrument(
        skip(self),
        fields(
            otel.name = "container.state",
            container.id = %self.container_id_str(id),
        )
    )]
    async fn container_state(&self, id: &ContainerId) -> Result<ContainerState> {
        let container_id = self.container_id_str(id);
        let container_root = self.container_root(id);

        // Check if container root exists
        if !container_root.exists() {
            return Err(AgentError::NotFound {
                container: container_id.clone(),
                reason: "container state directory not found".to_string(),
            });
        }

        // Load container and get status
        let container_id_clone = container_id.clone();

        let status = tokio::task::spawn_blocking(move || {
            let mut container =
                Container::load(container_root).map_err(|e| AgentError::NotFound {
                    container: container_id_clone.clone(),
                    reason: format!("failed to load container: {e}"),
                })?;

            // Refresh status to get current state
            let _ = container.refresh_status();

            Ok::<ContainerStatus, AgentError>(container.status())
        })
        .await
        .map_err(|e| AgentError::NotFound {
            container: container_id.clone(),
            reason: format!("task join error: {e}"),
        })??;

        Ok(self.map_status(status))
    }

    /// Get container logs
    ///
    /// Reads from the container's stdout/stderr log files.
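    ///
    /// A minimal usage sketch (hypothetical `runtime` value; `100` is an
    /// arbitrary tail size, and `0` returns all available lines):
    ///
    /// ```ignore
    /// for entry in runtime.container_logs(&id, 100).await? {
    ///     println!("{}", entry.message);
    /// }
    /// ```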
    async fn container_logs(&self, id: &ContainerId, tail: usize) -> Result<Vec<LogEntry>> {
        let container_id = self.container_id_str(id);

        // Get log paths from local state
        let (stdout_path, stderr_path) = {
            let containers = self.containers.read().await;
            match containers.get(&container_id) {
                Some(info) => (info.stdout_path.clone(), info.stderr_path.clone()),
                None => {
                    // Fall back to default paths
                    self.log_paths(id)
                }
            }
        };

        let now = chrono::Utc::now();
        let source = LogSource::Container(id.to_string());
        let mut entries = Vec::new();

        // Read stdout
        if stdout_path.exists() {
            if let Ok(content) = fs::read_to_string(&stdout_path).await {
                for line in content.lines() {
                    entries.push(LogEntry {
                        timestamp: now,
                        stream: LogStream::Stdout,
                        message: line.to_string(),
                        source: source.clone(),
                        service: None,
                        deployment: None,
                    });
                }
            }
        }

        // Read stderr
        if stderr_path.exists() {
            if let Ok(content) = fs::read_to_string(&stderr_path).await {
                for line in content.lines() {
                    entries.push(LogEntry {
                        timestamp: now,
                        stream: LogStream::Stderr,
                        message: line.to_string(),
                        source: source.clone(),
                        service: None,
                        deployment: None,
                    });
                }
            }
        }

        // Sort by timestamp (all entries currently share one timestamp, but this
        // keeps ordering correct if per-line timestamps are added later)
        entries.sort_by_key(|e| e.timestamp);

        // Apply tail limit
        if tail > 0 && entries.len() > tail {
            entries = entries.split_off(entries.len() - tail);
        }

        Ok(entries)
    }

    /// Execute a command in a running container
    ///
    /// Uses libcontainer's tenant builder to exec into the container's namespaces.
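    ///
    /// A minimal usage sketch (hypothetical `runtime` value; the command is
    /// illustrative):
    ///
    /// ```ignore
    /// let cmd = vec!["cat".to_string(), "/etc/hostname".to_string()];
    /// let (code, stdout, stderr) = runtime.exec(&id, &cmd).await?;
    /// if code != 0 {
    ///     eprintln!("exec failed ({code}): {stderr}");
    /// } else {
    ///     println!("{stdout}");
    /// }
    /// ```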
    #[allow(unsafe_code)]
    #[instrument(
        skip(self),
        fields(
            otel.name = "container.exec",
            container.id = %self.container_id_str(id),
            command = ?cmd,
        )
    )]
    async fn exec(&self, id: &ContainerId, cmd: &[String]) -> Result<(i32, String, String)> {
        let container_id = self.container_id_str(id);

        if cmd.is_empty() {
            return Err(AgentError::InvalidSpec(
                "exec command cannot be empty".to_string(),
            ));
        }

        tracing::debug!("Executing {:?} in container {}", cmd, container_id);

        // Create temporary files for exec output
        let exec_id = uuid::Uuid::new_v4().to_string();
        let exec_dir = self.config.state_dir.join(format!("exec-{exec_id}"));
        fs::create_dir_all(&exec_dir)
            .await
            .map_err(|e| AgentError::CreateFailed {
                id: exec_id.clone(),
                reason: format!("failed to create exec dir: {e}"),
            })?;

        let stdout_path = exec_dir.join("stdout");
        let stderr_path = exec_dir.join("stderr");

        // Create output files
        let stdout_file =
            std::fs::File::create(&stdout_path).map_err(|e| AgentError::CreateFailed {
                id: exec_id.clone(),
                reason: format!("failed to create stdout file: {e}"),
            })?;

        let stderr_file =
            std::fs::File::create(&stderr_path).map_err(|e| AgentError::CreateFailed {
                id: exec_id.clone(),
                reason: format!("failed to create stderr file: {e}"),
            })?;

        // SAFETY: `into_raw_fd()` consumes the `File` without closing the
        // underlying descriptor, and `OwnedFd::from_raw_fd` assumes sole
        // ownership of it, so each fd is closed exactly once.
        let stdout_fd = unsafe {
            use std::os::unix::io::IntoRawFd;
            OwnedFd::from_raw_fd(stdout_file.into_raw_fd())
        };
        let stderr_fd = unsafe {
            use std::os::unix::io::IntoRawFd;
            OwnedFd::from_raw_fd(stderr_file.into_raw_fd())
        };

        let cmd_clone = cmd.to_vec();
        let container_id_clone = container_id.clone();
        // Use state_dir as the root path; libcontainer expects the base
        // directory that contains the per-container state directories.
        let state_dir_clone = self.config.state_dir.clone();

        // Execute using tenant builder
        let exec_pid = tokio::task::spawn_blocking(move || {
            // Create container builder for tenant (joining existing container)
            // Set stdout/stderr on ContainerBuilder BEFORE calling as_tenant()
            let container_builder =
                ContainerBuilder::new(container_id_clone.clone(), SyscallType::Linux)
                    .with_stdout(stdout_fd)
                    .with_stderr(stderr_fd);

            let container_builder =
                container_builder
                    .with_root_path(&state_dir_clone)
                    .map_err(|e| AgentError::CreateFailed {
                        id: container_id_clone.clone(),
                        reason: format!("failed to set root path: {e}"),
                    })?;

            // Configure as tenant (joins existing namespaces)
            let tenant_builder = container_builder
                .as_tenant()
                .with_container_args(cmd_clone)
                .with_detach(false); // Wait for completion

            // Execute and wait
            let pid = tenant_builder
                .build()
                .map_err(|e| AgentError::CreateFailed {
                    id: container_id_clone.clone(),
                    reason: format!("failed to exec in container: {e}"),
                })?;

            // Return raw pid as i32 to avoid nix version conflicts
            // (libcontainer uses nix 0.29, we use nix 0.31)
            Ok::<i32, AgentError>(pid.as_raw())
        })
        .await
        .map_err(|e| AgentError::CreateFailed {
            id: container_id.clone(),
            reason: format!("task join error: {e}"),
        })??;

        // Wait for process to complete and get exit status
        let exit_code = tokio::task::spawn_blocking(move || {
            use nix::sys::wait::{waitpid, WaitStatus};
            use nix::unistd::Pid;
            // Convert raw pid back to our nix version's Pid type
            let pid = Pid::from_raw(exec_pid);
            match waitpid(pid, None) {
                Ok(WaitStatus::Exited(_, code)) => code,
                Ok(WaitStatus::Signaled(_, signal, _)) => 128 + signal as i32,
                Ok(_) | Err(_) => -1,
            }
        })
        .await
        .unwrap_or(-1);

        // Read output
        let stdout_content = fs::read_to_string(&stdout_path).await.unwrap_or_default();
        let stderr_content = fs::read_to_string(&stderr_path).await.unwrap_or_default();

        // Clean up exec directory
        let _ = fs::remove_dir_all(&exec_dir).await;

        Ok((exit_code, stdout_content, stderr_content))
    }

    /// Get container resource statistics from cgroups
    ///
    /// Reads CPU and memory statistics from the cgroups v2 filesystem.
    /// Supports both systemd and cgroupfs cgroup drivers.
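    ///
    /// A minimal usage sketch (hypothetical `runtime` value; the fields of
    /// `ContainerStats` are defined elsewhere and not shown here):
    ///
    /// ```ignore
    /// let stats = runtime.get_container_stats(&id).await?;
    /// // Inspect the CPU/memory figures on `stats` as needed.
    /// ```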
    async fn get_container_stats(&self, id: &ContainerId) -> Result<ContainerStats> {
        let container_id = self.container_id_str(id);

        // Determine cgroup path based on cgroup driver
        let cgroup_path = if self.config.use_systemd {
            // systemd cgroup driver: /sys/fs/cgroup/system.slice/zlayer-{id}.scope
            PathBuf::from(format!(
                "/sys/fs/cgroup/system.slice/zlayer-{container_id}.scope"
            ))
        } else {
            // cgroupfs driver: /sys/fs/cgroup/zlayer/{id}
            PathBuf::from(format!("/sys/fs/cgroup/zlayer/{container_id}"))
        };

        tracing::debug!(
            container = %container_id,
            cgroup_path = %cgroup_path.display(),
            "reading container stats from cgroups"
        );

        cgroups_stats::read_container_stats(&cgroup_path)
            .await
            .map_err(|e| {
                AgentError::Internal(format!(
                    "failed to read cgroup stats for container {container_id}: {e}"
                ))
            })
    }

    /// Wait for a container to exit and return its exit code
    ///
    /// libcontainer does not expose a blocking "wait" API, so this polls the
    /// container state until it reaches a terminal (exited or failed) state.
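    ///
    /// A minimal usage sketch (hypothetical `runtime` value):
    ///
    /// ```ignore
    /// let code = runtime.wait_container(&id).await?;
    /// println!("container exited with code {code}");
    /// ```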
    async fn wait_container(&self, id: &ContainerId) -> Result<i32> {
        let container_id = self.container_id_str(id);
        let poll_interval = Duration::from_millis(100);
        let max_wait = Duration::from_secs(3600); // 1 hour max
        let start = std::time::Instant::now();

        tracing::debug!(
            container = %container_id,
            "waiting for container to exit"
        );

        loop {
            if start.elapsed() > max_wait {
                return Err(AgentError::Timeout { timeout: max_wait });
            }

            match self.container_state(id).await {
                Ok(ContainerState::Exited { code }) => {
                    tracing::debug!(
                        container = %container_id,
                        exit_code = code,
                        "container exited"
                    );
                    return Ok(code);
                }
                Ok(ContainerState::Failed { reason }) => {
                    tracing::warn!(
                        container = %container_id,
                        reason = %reason,
                        "container failed"
                    );
                    return Err(AgentError::Internal(format!("container failed: {reason}")));
                }
                Ok(_) => {
                    // Still running, wait and poll again
                    tokio::time::sleep(poll_interval).await;
                }
                Err(AgentError::NotFound { .. }) => {
                    // Container may have been removed - treat as exited with code 0
                    tracing::debug!(
                        container = %container_id,
                        "container not found, treating as exited"
                    );
                    return Ok(0);
                }
                Err(e) => {
                    return Err(e);
                }
            }
        }
    }

    /// Get container logs (stdout/stderr combined)
    ///
    /// Reads the container's log files and returns them as a vector of `LogEntry` records.
    async fn get_logs(&self, id: &ContainerId) -> Result<Vec<LogEntry>> {
        let container_id = self.container_id_str(id);

        // Get log paths
        let (stdout_path, stderr_path) = {
            let containers = self.containers.read().await;
            match containers.get(&container_id) {
                Some(info) => (info.stdout_path.clone(), info.stderr_path.clone()),
                None => self.log_paths(id),
            }
        };

        let now = chrono::Utc::now();
        let source = LogSource::Container(id.to_string());
        let mut entries = Vec::new();

        // Read stdout
        if stdout_path.exists() {
            if let Ok(content) = fs::read_to_string(&stdout_path).await {
                for line in content.lines() {
                    entries.push(LogEntry {
                        timestamp: now,
                        stream: LogStream::Stdout,
                        message: line.to_string(),
                        source: source.clone(),
                        service: None,
                        deployment: None,
                    });
                }
            }
        }

        // Read stderr
        if stderr_path.exists() {
            if let Ok(content) = fs::read_to_string(&stderr_path).await {
                for line in content.lines() {
                    entries.push(LogEntry {
                        timestamp: now,
                        stream: LogStream::Stderr,
                        message: line.to_string(),
                        source: source.clone(),
                        service: None,
                        deployment: None,
                    });
                }
            }
        }

        // Sort by timestamp
        entries.sort_by_key(|e| e.timestamp);

        Ok(entries)
    }

    /// Get the PID of a container's main process
    ///
    /// Returns:
    /// - `Ok(Some(pid))` for running containers
    /// - `Ok(None)` if the container exists but has no PID (it is not running, or has already stopped)
    /// - `Err` if the container doesn't exist or there's an error loading it
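    ///
    /// A minimal usage sketch (hypothetical `runtime` value):
    ///
    /// ```ignore
    /// match runtime.get_container_pid(&id).await? {
    ///     Some(pid) => println!("main process is PID {pid}"),
    ///     None => println!("container exists but is not running"),
    /// }
    /// ```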
    #[instrument(
        skip(self),
        fields(
            otel.name = "container.get_pid",
            container.id = %self.container_id_str(id),
        )
    )]
    async fn get_container_pid(&self, id: &ContainerId) -> Result<Option<u32>> {
        let container_id = self.container_id_str(id);
        let container_root = self.container_root(id);

        // Check if container root exists
        if !container_root.exists() {
            return Err(AgentError::NotFound {
                container: container_id.clone(),
                reason: "container state directory not found".to_string(),
            });
        }

        // Load container and get PID
        let container_id_clone = container_id.clone();

        let pid = tokio::task::spawn_blocking(move || {
            let mut container =
                Container::load(container_root).map_err(|e| AgentError::NotFound {
                    container: container_id_clone.clone(),
                    reason: format!("failed to load container: {e}"),
                })?;

            // Refresh status to get current state
            let _ = container.refresh_status();

            // Get PID - returns None if container is not running
            #[allow(clippy::cast_sign_loss)]
            let pid = container.pid().map(|p| p.as_raw() as u32);

            Ok::<Option<u32>, AgentError>(pid)
        })
        .await
        .map_err(|e| AgentError::NotFound {
            container: container_id.clone(),
            reason: format!("task join error: {e}"),
        })??;

        tracing::debug!(
            container = %container_id,
            pid = ?pid,
            "retrieved container PID"
        );

        Ok(pid)
    }

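    /// Get the container's IP address.
    ///
    /// Always returns `Ok(None)`: this runtime never assigns addresses itself,
    /// so callers should consult the overlay manager for container IPs.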
    async fn get_container_ip(&self, _id: &ContainerId) -> Result<Option<std::net::IpAddr>> {
        // Youki containers use OCI network namespaces — IP assignment comes
        // from the overlay manager, not the runtime itself.
        Ok(None)
    }

    /// Sync all named volumes to S3 before container removal.
    ///
    /// When the `s3` feature is enabled and a `LayerSyncManager` has been
    /// configured on the storage manager, this iterates all non-anonymous
    /// volumes and pushes any changes to S3. Errors are logged but do not
    /// prevent container removal.
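    ///
    /// A minimal usage sketch (hypothetical `runtime` value; only meaningful
    /// when the `s3` feature and a sync manager are configured):
    ///
    /// ```ignore
    /// runtime.sync_container_volumes(&id).await?;
    /// runtime.remove_container(&id).await?;
    /// ```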
    #[allow(unused_variables)]
    async fn sync_container_volumes(&self, id: &ContainerId) -> Result<()> {
        #[cfg(feature = "s3")]
        {
            let storage_manager = self.storage_manager.read().await;
            if storage_manager.layer_sync().is_some() {
                let container_id = self.container_id_str(id);
                tracing::info!(
                    container = %container_id,
                    "syncing volumes to S3 before container removal"
                );
                match storage_manager.sync_all_volumes().await {
                    Ok(synced) => {
                        if synced > 0 {
                            tracing::info!(
                                container = %container_id,
                                synced_count = synced,
                                "volume sync complete"
                            );
                        }
                    }
                    Err(e) => {
                        tracing::warn!(
                            container = %container_id,
                            error = %e,
                            "volume sync failed, data may not be persisted"
                        );
                    }
                }
            }
        }
        Ok(())
    }

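    /// List images currently present in the blob cache.
    ///
    /// Enumerates `manifest:*` cache keys, summing layer and config sizes for
    /// each manifest and attaching the stored registry digest when available.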
    async fn list_images(&self) -> Result<Vec<ImageInfo>> {
        let keys = self
            .blob_cache
            .keys_with_prefix("manifest:")
            .await
            .map_err(|e| {
                AgentError::Internal(format!("failed to list manifest cache keys: {e}"))
            })?;

        let mut images = Vec::with_capacity(keys.len());
        for key in keys {
            // Strip the "manifest:" prefix
            let reference = match key.strip_prefix("manifest:") {
                Some(r) => r.to_string(),
                None => continue,
            };

            // Load manifest body to compute size and extract digest
            let Ok(Some(manifest_bytes)) = self.blob_cache.get(&key).await else {
                continue; // cache entry disappeared — skip
            };

            let Ok(manifest) = serde_json::from_slice::<OciImageManifest>(&manifest_bytes) else {
                continue; // corrupt entry — skip
            };

            // Sum layer sizes + config size
            let layers_size: i64 = manifest.layers.iter().map(|l| l.size).sum();
            let config_size = manifest.config.size;
            let total = layers_size.saturating_add(config_size);
            let size_bytes = if total > 0 {
                u64::try_from(total).ok()
            } else {
                None
            };

            // Look up the stored registry digest (Wave 2 convention)
            let digest_key = format!("manifest-digest:{reference}");
            let digest = self
                .blob_cache
                .get(&digest_key)
                .await
                .ok()
                .flatten()
                .and_then(|bytes| String::from_utf8(bytes).ok());

            images.push(ImageInfo {
                reference,
                digest,
                size_bytes,
            });
        }

        Ok(images)
    }

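    /// Remove a cached image by reference.
    ///
    /// Deletes the blobs referenced by the cached manifest (best-effort), then
    /// the manifest entry itself and its digest sidecar. Returns `NotFound` if
    /// the reference is not in the cache. The `_force` parameter is accepted
    /// but currently ignored.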
    async fn remove_image(&self, image: &str, _force: bool) -> Result<()> {
        let manifest_key = format!("manifest:{image}");
        let digest_key = format!("manifest-digest:{image}");

        // Load manifest to learn the blob digests it references
        let manifest_bytes = self
            .blob_cache
            .get(&manifest_key)
            .await
            .map_err(|e| AgentError::Internal(format!("failed to read manifest cache: {e}")))?;

        let Some(manifest_bytes) = manifest_bytes else {
            return Err(AgentError::NotFound {
                container: image.to_string(),
                reason: format!("image '{image}' not found in cache"),
            });
        };

        let manifest: OciImageManifest = serde_json::from_slice(&manifest_bytes).map_err(|e| {
            AgentError::Internal(format!(
                "failed to parse cached manifest for '{image}': {e}"
            ))
        })?;

        // Delete each referenced blob. Don't bail on individual failures — log and continue.
        for layer in &manifest.layers {
            if let Err(e) = self.blob_cache.delete(&layer.digest).await {
                tracing::warn!(
                    image = %image,
                    digest = %layer.digest,
                    error = %e,
                    "failed to delete layer blob"
                );
            }
        }
        if let Err(e) = self.blob_cache.delete(&manifest.config.digest).await {
            tracing::warn!(
                image = %image,
                digest = %manifest.config.digest,
                error = %e,
                "failed to delete config blob"
            );
        }

        // Delete the manifest body and its digest sidecar
        self.blob_cache
            .delete(&manifest_key)
            .await
            .map_err(|e| AgentError::Internal(format!("failed to delete manifest: {e}")))?;
        let _ = self.blob_cache.delete(&digest_key).await;

        Ok(())
    }

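    /// Remove orphaned blobs from the cache.
    ///
    /// Mark-and-sweep over the blob cache: collects every digest referenced by
    /// a remaining manifest, then deletes any `sha256:*` blob that is no longer
    /// referenced, reporting the deleted keys and the bytes reclaimed.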
    async fn prune_images(&self) -> Result<PruneResult> {
        // 1. Collect all digests referenced by remaining manifests.
        let manifest_keys = self
            .blob_cache
            .keys_with_prefix("manifest:")
            .await
            .map_err(|e| AgentError::Internal(format!("failed to list manifests: {e}")))?;

        let mut referenced: std::collections::HashSet<String> = std::collections::HashSet::new();
        for key in &manifest_keys {
            let Ok(Some(bytes)) = self.blob_cache.get(key).await else {
                continue;
            };
            let Ok(manifest) = serde_json::from_slice::<OciImageManifest>(&bytes) else {
                continue;
            };
            referenced.insert(manifest.config.digest.clone());
            for layer in &manifest.layers {
                referenced.insert(layer.digest.clone());
            }
        }

        // 2. Walk all sha256:* blob keys and delete those not referenced.
        let all_blob_keys = self
            .blob_cache
            .keys_with_prefix("sha256:")
            .await
            .map_err(|e| AgentError::Internal(format!("failed to list blobs: {e}")))?;

        let mut deleted = Vec::new();
        let mut space_reclaimed: u64 = 0;

        for key in all_blob_keys {
            if referenced.contains(&key) {
                continue;
            }
            // Grab the blob size before deleting (best-effort).
            if let Ok(Some(bytes)) = self.blob_cache.get(&key).await {
                space_reclaimed = space_reclaimed.saturating_add(bytes.len() as u64);
            }
            if let Err(e) = self.blob_cache.delete(&key).await {
                tracing::warn!(
                    digest = %key,
                    error = %e,
                    "failed to delete orphaned blob during prune"
                );
                continue;
            }
            deleted.push(key);
        }

        Ok(PruneResult {
            deleted,
            space_reclaimed,
        })
    }

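    /// Send a signal to a running container.
    ///
    /// The signal name is validated and canonicalized first; when no signal is
    /// given, `SIGKILL` is used. A minimal usage sketch (hypothetical `runtime`
    /// value; the signal name is illustrative):
    ///
    /// ```ignore
    /// runtime.kill_container(&id, Some("SIGHUP")).await?;
    /// // Equivalent to passing Some("SIGKILL"):
    /// runtime.kill_container(&id, None).await?;
    /// ```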
    #[instrument(
        skip(self),
        fields(
            otel.name = "container.kill",
            container.id = %self.container_id_str(id),
            service.name = %id.service,
            signal = ?signal,
        )
    )]
    async fn kill_container(&self, id: &ContainerId, signal: Option<&str>) -> Result<()> {
        use std::convert::TryFrom;

        let canonical = crate::runtime::validate_signal(signal.unwrap_or("SIGKILL"))?;
        let container_id = self.container_id_str(id);
        let container_root = self.container_root(id);

        tracing::info!(
            container = %container_id,
            signal = %canonical,
            "sending signal to container"
        );

        let container_id_clone = container_id.clone();
        let canonical_clone = canonical.clone();
        tokio::task::spawn_blocking(move || {
            let mut container =
                Container::load(container_root).map_err(|e| AgentError::NotFound {
                    container: container_id_clone.clone(),
                    reason: format!("failed to load container: {e}"),
                })?;

            if !container.status().can_kill() {
                return Err(AgentError::InvalidSpec(format!(
                    "container '{container_id_clone}' is not in a killable state ({:?})",
                    container.status()
                )));
            }

            let sig = Signal::try_from(canonical_clone.as_str()).map_err(|e| {
                AgentError::InvalidSpec(format!("invalid signal '{canonical_clone}': {e:?}"))
            })?;
            container.kill(sig, true).map_err(|e| {
                AgentError::Internal(format!(
                    "failed to deliver signal {canonical_clone} to '{container_id_clone}': {e}"
                ))
            })?;

            Ok::<(), AgentError>(())
        })
        .await
        .map_err(|e| AgentError::Internal(format!("task join error during kill: {e}")))??;

        Ok(())
    }

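    /// Tag a cached image under an additional reference.
    ///
    /// Copies the source manifest (and, best-effort, its digest sidecar) under
    /// the target reference; layer and config blobs stay shared because they
    /// are content-addressed. A minimal usage sketch (hypothetical `runtime`
    /// value; the references are illustrative):
    ///
    /// ```ignore
    /// runtime.tag_image("ghcr.io/acme/app:1.4.2", "ghcr.io/acme/app:latest").await?;
    /// ```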
    #[instrument(
        skip(self),
        fields(
            otel.name = "image.tag",
            source = %source,
            target = %target,
        )
    )]
    async fn tag_image(&self, source: &str, target: &str) -> Result<()> {
        if source.trim().is_empty() || target.trim().is_empty() {
            return Err(AgentError::InvalidSpec(
                "source and target must be non-empty image references".to_string(),
            ));
        }
        if source == target {
            // Nothing to do; idempotent.
            return Ok(());
        }

        // Copy the source manifest and its digest sidecar under the target
        // reference. All blobs remain shared content-addressed in the cache.
        let src_manifest_key = format!("manifest:{source}");
        let manifest_bytes = self
            .blob_cache
            .get(&src_manifest_key)
            .await
            .map_err(|e| AgentError::Internal(format!("failed to read manifest cache: {e}")))?
            .ok_or_else(|| AgentError::NotFound {
                container: source.to_string(),
                reason: format!("source image '{source}' not found in cache"),
            })?;

        let dst_manifest_key = format!("manifest:{target}");
        self.blob_cache
            .put(&dst_manifest_key, &manifest_bytes)
            .await
            .map_err(|e| AgentError::Internal(format!("failed to write manifest for tag: {e}")))?;

        // Best-effort: carry over the registry digest sidecar if present.
        let src_digest_key = format!("manifest-digest:{source}");
        if let Ok(Some(digest_bytes)) = self.blob_cache.get(&src_digest_key).await {
            let dst_digest_key = format!("manifest-digest:{target}");
            if let Err(e) = self.blob_cache.put(&dst_digest_key, &digest_bytes).await {
                tracing::warn!(
                    source = %source,
                    target = %target,
                    error = %e,
                    "failed to copy manifest-digest sidecar for tag (non-fatal)"
                );
            }
        }

        tracing::info!(source = %source, target = %target, "tagged image");
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_youki_config_default() {
        let config = YoukiConfig::default();
        let dirs = zlayer_paths::ZLayerDirs::system_default();

        assert_eq!(config.state_dir, dirs.containers());
        assert_eq!(config.rootfs_dir, dirs.rootfs());
        assert_eq!(config.bundle_dir, dirs.bundles());
        assert_eq!(config.cache_dir, dirs.cache());
        assert_eq!(config.volume_dir, dirs.volumes());
        assert!(!config.use_systemd);
        assert!(config.cache_type.is_none());
        assert!(config.log_base_dir.is_none());
        assert!(config.deployment_name.is_none());
    }

    #[test]
    fn test_container_id_str() {
        let id = ContainerId {
            service: "myservice".to_string(),
            replica: 1,
        };

        let expected = "myservice-1";
        assert_eq!(format!("{}-{}", id.service, id.replica), expected);
    }

    #[test]
    fn test_rootfs_path_sanitization() {
        // Test that image names are sanitized for filesystem paths
        let images = vec![
            (
                "docker.io/library/nginx:latest",
                "docker.io_library_nginx_latest",
            ),
            ("ghcr.io/owner/repo:v1.0", "ghcr.io_owner_repo_v1.0"),
            (
                "registry.example.com/image@sha256:abc123",
                "registry.example.com_image_sha256_abc123",
            ),
        ];

        for (image, expected_suffix) in images {
            let safe_name = image.replace(['/', ':', '@'], "_");
            assert_eq!(safe_name, expected_suffix);
        }
    }

    #[test]
    fn test_map_status() {
        // Test status mapping without runtime instance
        let mappings = vec![
            (ContainerStatus::Creating, "Pending"),
            (ContainerStatus::Created, "Pending"),
            (ContainerStatus::Running, "Running"),
            (ContainerStatus::Stopped, "Exited"),
            (ContainerStatus::Paused, "Stopping"),
        ];

        for (status, expected) in mappings {
            let state = match status {
                ContainerStatus::Creating | ContainerStatus::Created => ContainerState::Pending,
                ContainerStatus::Running => ContainerState::Running,
                ContainerStatus::Stopped => ContainerState::Exited { code: 0 },
                ContainerStatus::Paused => ContainerState::Stopping,
            };

            let state_str = match state {
                ContainerState::Pending => "Pending",
                ContainerState::Running => "Running",
                ContainerState::Exited { .. } => "Exited",
                ContainerState::Stopping => "Stopping",
                _ => "Other",
            };

            assert_eq!(state_str, expected);
        }
    }

    #[test]
    fn test_log_paths() {
        let config = YoukiConfig::default();
        let dirs = zlayer_paths::ZLayerDirs::system_default();
        let id = ContainerId {
            service: "testservice".to_string(),
            replica: 2,
        };

        let container_id = format!("{}-{}", id.service, id.replica);
        let state_dir = config.state_dir.join(&container_id);
        let stdout = state_dir.join("stdout.log");
        let stderr = state_dir.join("stderr.log");

        assert_eq!(stdout, dirs.containers().join("testservice-2/stdout.log"));
        assert_eq!(stderr, dirs.containers().join("testservice-2/stderr.log"));
    }

    #[test]
    fn test_youki_config_clone() {
        let config = YoukiConfig {
            state_dir: PathBuf::from("/custom/state"),
            rootfs_dir: PathBuf::from("/custom/rootfs"),
            bundle_dir: PathBuf::from("/custom/bundles"),
            cache_dir: PathBuf::from("/custom/cache"),
            volume_dir: PathBuf::from("/custom/volumes"),
            use_systemd: true,
            cache_type: Some(zlayer_registry::CacheType::memory()),
            log_base_dir: Some(PathBuf::from("/var/log/zlayer")),
            deployment_name: Some("myapp".to_string()),
        };

        let cloned = config.clone();

        assert_eq!(cloned.state_dir, config.state_dir);
        assert_eq!(cloned.rootfs_dir, config.rootfs_dir);
        assert_eq!(cloned.bundle_dir, config.bundle_dir);
        assert_eq!(cloned.cache_dir, config.cache_dir);
        assert_eq!(cloned.volume_dir, config.volume_dir);
        assert_eq!(cloned.use_systemd, config.use_systemd);
        assert!(cloned.cache_type.is_some());
        assert_eq!(cloned.log_base_dir, config.log_base_dir);
        assert_eq!(cloned.deployment_name, config.deployment_name);
    }

    /// Test that `YoukiRuntime::new()` creates directories
    #[tokio::test]
    async fn test_youki_runtime_directory_creation() {
        // Use a unique temp directory based on test run
        let temp_base = std::env::temp_dir().join(format!("youki_test_{}", std::process::id()));

        let config = YoukiConfig {
            state_dir: temp_base.join("state"),
            rootfs_dir: temp_base.join("rootfs"),
            bundle_dir: temp_base.join("bundles"),
            cache_dir: temp_base.join("cache"),
            volume_dir: temp_base.join("volumes"),
            use_systemd: false,
            cache_type: None,
            log_base_dir: None,
            deployment_name: None,
        };

        // Clean up any previous test run
        let _ = std::fs::remove_dir_all(&temp_base);

        // This should succeed and create all directories
        let result = YoukiRuntime::new(config.clone(), None).await;

        assert!(
            result.is_ok(),
            "Failed to create runtime: {:?}",
            result.err()
        );

        // Verify directories were created
        assert!(config.state_dir.exists());
        assert!(config.rootfs_dir.exists());
        assert!(config.bundle_dir.exists());
        assert!(config.cache_dir.exists());
        assert!(config.volume_dir.exists());

        // Clean up
        let _ = std::fs::remove_dir_all(&temp_base);
    }
}