wdl-engine 0.13.2

//! Support for using the Apptainer (a.k.a. Singularity) container runtime.
//!
//! There are two primary responsibilities of this module: `.sif` image cache
//! management and command script generation.
//!
//! The entrypoint for both of these is [`ApptainerRuntime::generate_script`].

use std::collections::HashMap;
use std::fmt::Write as _;
use std::path::Path;
use std::path::PathBuf;
use std::path::absolute;
use std::process::Stdio;
use std::sync::Arc;
use std::sync::Mutex;

use anyhow::Context as _;
use anyhow::Result;
use anyhow::anyhow;
use anyhow::bail;
use tokio::process::Command;
use tokio::sync::OnceCell;
use tokio_retry2::Retry;
use tokio_retry2::RetryError;
use tokio_retry2::strategy::ExponentialBackoff;
use tokio_util::sync::CancellationToken;
use tracing::debug;
use tracing::warn;

use crate::Value;
use crate::backend::ExecuteTaskRequest;
use crate::backend::PullResults;
use crate::config::ApptainerConfig;
use crate::v1::requirements::ContainerSource;

/// The name of the images cache directory.
///
/// This is joined onto the runtime's root directory when no explicit image
/// cache directory is supplied to [`ApptainerRuntime::new`].
const IMAGES_CACHE_DIR: &str = "apptainer-images";

/// The guest working directory.
///
/// The task's host work directory is bind-mounted here, and the generated
/// script passes this path to `--pwd`.
const GUEST_WORK_DIR: &str = "/mnt/task/work";

/// The guest path for the command file.
///
/// The task's host command file is bind-mounted read-only at this path and is
/// what the task shell executes inside the container.
const GUEST_COMMAND_PATH: &str = "/mnt/task/command";

/// The guest path that receives the command's stdout.
///
/// The task's host stdout file is bind-mounted at this path, and the command's
/// stdout is redirected to it from inside the container.
const GUEST_STDOUT_PATH: &str = "/mnt/task/stdout";

/// The guest path that receives the command's stderr.
///
/// The task's host stderr file is bind-mounted at this path, and the command's
/// stderr is redirected to it from inside the container.
const GUEST_STDERR_PATH: &str = "/mnt/task/stderr";

/// The environment variable prefix for Apptainer.
///
/// Task environment variables are exported by the generated script as
/// `<prefix>_<name>`; this prefix is used unless the configured executable
/// name contains `singularity`.
const APPTAINER_ENV_PREFIX: &str = "APPTAINERENV";

/// The environment variable prefix for Singularity.
///
/// Used in place of [`APPTAINER_ENV_PREFIX`] when the configured executable
/// name contains `singularity`.
const SINGULARITY_ENV_PREFIX: &str = "SINGULARITYENV";

/// Represents the Apptainer container runtime.
///
/// The runtime owns the `.sif` image cache location and remembers which
/// container sources have already been resolved to an image file.
#[derive(Debug)]
pub struct ApptainerRuntime {
    /// The cache directory for `.sif` images.
    ///
    /// This is made absolute when the runtime is constructed.
    cache_dir: PathBuf,
    /// The map of container source to `.sif` path.
    ///
    /// Each source gets its own shared `OnceCell`; the cell is populated with
    /// the image path the first time a pull for that source succeeds, and later
    /// requests for the same source reuse the stored path.
    images: Mutex<HashMap<ContainerSource, Arc<OnceCell<PathBuf>>>>,
}

impl ApptainerRuntime {
    /// Creates a new [`ApptainerRuntime`] with the specified root directory.
    ///
    /// If `image_cache_dir` is provided, it is used as the directory for
    /// caching `.sif` images. Otherwise, a default subdirectory is created
    /// within the given root.
    pub fn new(root_dir: &Path, image_cache_dir: Option<&Path>) -> Result<Self> {
        let cache_dir = image_cache_dir
            .map(Path::to_path_buf)
            .unwrap_or_else(|| root_dir.join(IMAGES_CACHE_DIR));

        Ok(Self {
            cache_dir: absolute(&cache_dir).with_context(|| {
                format!(
                    "failed to make path `{path}` absolute",
                    path = cache_dir.display()
                )
            })?,
            images: Default::default(),
        })
    }

    /// Generates the script that runs the given task under Apptainer.
    ///
    /// The task's candidate containers are tried in order; the script is
    /// generated for the first one whose image is available. On success, the
    /// script is returned together with the [`ContainerSource`] that was
    /// selected for execution. `Ok(None)` is returned if the image pull was
    /// cancelled.
    ///
    /// # Errors
    ///
    /// Returns an error if the task does not specify a container, if none of
    /// the candidate images could be pulled, or if the script itself could not
    /// be generated.
    ///
    /// # Shared filesystem assumptions
    ///
    /// The returned script should be run in an environment that shares a
    /// filesystem with the environment where this method is invoked, except
    /// for node-specific mounts like `/tmp` and `/var`. This assumption
    /// typically holds on HPC systems with shared filesystems like Lustre or
    /// GPFS.
    pub async fn generate_script(
        &self,
        config: &ApptainerConfig,
        shell: &str,
        request: &ExecuteTaskRequest<'_>,
        token: CancellationToken,
    ) -> Result<Option<(String, ContainerSource)>> {
        let candidates = request
            .constraints
            .container
            .as_deref()
            .ok_or_else(|| anyhow!("task does not use a container"))?;

        // A `None` here means the pull was cancelled, which is not an error.
        let Some(results) = self
            .pull_first_available_image(&config.executable, candidates, token)
            .await
        else {
            return Ok(None);
        };

        // When no candidate succeeded, the collected results are the error.
        let (selected, sif_path) = match results.successful_container() {
            Some((container, path)) => (container.clone(), path.clone()),
            None => return Err(anyhow!("{results}")),
        };

        let script = self
            .generate_apptainer_script(config, shell, &sif_path, request)
            .await?;

        Ok(Some((script, selected)))
    }

    /// Generate the script, given a container path that's already assumed to be
    /// populated.
    ///
    /// The generated bash script:
    ///
    /// * exports each task environment variable under the Apptainer (or
    ///   Singularity) environment prefix, quoted so that the host shell passes
    ///   the value through literally;
    /// * invokes `<executable> -v exec` with `--containall --cleanenv`, bind
    ///   mounts for the task inputs (read-only), command file (read-only), work
    ///   directory, `/tmp`, `/var/tmp`, and the stdout/stderr files;
    /// * adds `--nv` when the task's GPU requirement is `true`, followed by any
    ///   extra exec arguments from the configuration;
    /// * runs the command file with the given shell inside the container; and
    /// * redirects the runtime's own stdout/stderr to `apptainer.stdout` and
    ///   `apptainer.stderr` in the attempt directory.
    ///
    /// This is a separate method in order to facilitate testing, and should not
    /// be called from outside this module.
    ///
    /// # Errors
    ///
    /// Returns an error if the container temp directories cannot be created or
    /// if a backend input is missing its local or guest path.
    async fn generate_apptainer_script(
        &self,
        config: &ApptainerConfig,
        shell: &str,
        container_sif: &Path,
        request: &ExecuteTaskRequest<'_>,
    ) -> Result<String> {
        /// Quotes `value` as a single literal bash word.
        ///
        /// Nothing is expanded inside single quotes, so the only character
        /// needing treatment is the single quote itself: it is emitted by
        /// closing the quoted string, adding an escaped quote, and reopening.
        fn shell_single_quote(value: &str) -> String {
            format!("'{}'", value.replace('\'', "'\\''"))
        }

        // Create a temp dir for the container's execution within the attempt dir
        // hierarchy. On many HPC systems, `/tmp` is mapped to a relatively
        // small, local scratch disk that can fill up easily. Mapping the
        // container's `/tmp` and `/var/tmp` paths to the filesystem we're using
        // for other inputs and outputs prevents this from being a capacity problem,
        // though potentially at the expense of execution speed if the
        // non-`/tmp` filesystem is significantly slower.
        let container_tmp_path = request.temp_dir.join("container_tmp");
        tokio::fs::DirBuilder::new()
            .recursive(true)
            .create(&container_tmp_path)
            .await
            .with_context(|| {
                format!(
                    "failed to create container /tmp directory at `{path}`",
                    path = container_tmp_path.display()
                )
            })?;
        let container_var_tmp_path = request.temp_dir.join("container_var_tmp");
        tokio::fs::DirBuilder::new()
            .recursive(true)
            .create(&container_var_tmp_path)
            .await
            .with_context(|| {
                format!(
                    "failed to create container /var/tmp directory at `{path}`",
                    path = container_var_tmp_path.display()
                )
            })?;

        // The runtime only forwards variables carrying its own prefix, which
        // differs depending on which executable is configured.
        let env_prefix = if config.executable.contains("singularity") {
            SINGULARITY_ENV_PREFIX
        } else {
            APPTAINER_ENV_PREFIX
        };

        let mut apptainer_command = String::new();
        writeln!(&mut apptainer_command, "#!/usr/bin/env bash")?;
        for (k, v) in request.env.iter() {
            // Rust's `Debug` escaping is not shell quoting: inside double quotes
            // bash would still expand `$` and backticks, and escapes such as
            // `\n` would be written out literally. Single-quote the value so it
            // reaches the container unchanged.
            writeln!(
                &mut apptainer_command,
                "export {env_prefix}_{k}={value}",
                value = shell_single_quote(v)
            )?;
        }
        writeln!(&mut apptainer_command, "{} -v exec \\", config.executable)?;
        writeln!(&mut apptainer_command, "--pwd \"{GUEST_WORK_DIR}\" \\")?;
        writeln!(&mut apptainer_command, "--containall --cleanenv \\")?;
        for input in request.backend_inputs {
            writeln!(
                &mut apptainer_command,
                "--mount type=bind,src=\"{host_path}\",dst=\"{guest_path}\",ro \\",
                host_path = input
                    .local_path()
                    .ok_or_else(|| anyhow!("input not localized: {input:?}"))?
                    .display(),
                guest_path = input
                    .guest_path()
                    .ok_or_else(|| anyhow!("guest path missing: {input:?}"))?,
            )?;
        }
        writeln!(
            &mut apptainer_command,
            "--mount type=bind,src=\"{}\",dst=\"{GUEST_COMMAND_PATH}\",ro \\",
            request.command_path().display()
        )?;
        writeln!(
            &mut apptainer_command,
            "--mount type=bind,src=\"{}\",dst=\"{GUEST_WORK_DIR}\" \\",
            request.work_dir().display()
        )?;
        writeln!(
            &mut apptainer_command,
            "--mount type=bind,src=\"{}\",dst=\"/tmp\" \\",
            container_tmp_path.display()
        )?;
        writeln!(
            &mut apptainer_command,
            "--mount type=bind,src=\"{}\",dst=\"/var/tmp\" \\",
            container_var_tmp_path.display()
        )?;
        writeln!(
            &mut apptainer_command,
            "--mount type=bind,src=\"{}\",dst=\"{GUEST_STDOUT_PATH}\" \\",
            request.stdout_path().display()
        )?;
        writeln!(
            &mut apptainer_command,
            "--mount type=bind,src=\"{}\",dst=\"{GUEST_STDERR_PATH}\" \\",
            request.stderr_path().display()
        )?;

        if let Some(true) = request
            .requirements
            .get(wdl_ast::v1::TASK_REQUIREMENT_GPU)
            .and_then(Value::as_boolean)
        {
            writeln!(&mut apptainer_command, "--nv \\")?;
        }

        for arg in config
            .extra_apptainer_exec_args
            .as_deref()
            .unwrap_or_default()
        {
            writeln!(&mut apptainer_command, "{arg} \\")?;
        }

        writeln!(&mut apptainer_command, "\"{}\" \\", container_sif.display())?;
        // The task's own stdout/stderr are redirected inside the container to
        // the bind-mounted guest paths.
        writeln!(
            &mut apptainer_command,
            "{shell} -c \"\\\"{GUEST_COMMAND_PATH}\\\" > \\\"{GUEST_STDOUT_PATH}\\\" 2> \
             \\\"{GUEST_STDERR_PATH}\\\"\" \\"
        )?;
        // The runtime's own output (as opposed to the task's) is captured on
        // the host in the attempt directory.
        let attempt_dir = request.attempt_dir;
        let apptainer_stdout_path = attempt_dir.join("apptainer.stdout");
        let apptainer_stderr_path = attempt_dir.join("apptainer.stderr");
        writeln!(
            &mut apptainer_command,
            "> \"{stdout}\" 2> \"{stderr}\"",
            stdout = apptainer_stdout_path.display(),
            stderr = apptainer_stderr_path.display()
        )?;
        Ok(apptainer_command)
    }

    /// Pulls the image for the given container source and returns the path to
    /// the image file (SIF).
    ///
    /// If the container source is already a SIF file, the given source path is
    /// returned.
    ///
    /// If the image has already been pulled, the pull is skipped and the path
    /// to the previous location is returned. An image file that already exists
    /// at the expected location in the cache directory is likewise reused
    /// without pulling.
    ///
    /// Returns `Ok(None)` if the cancellation token fires before the image path
    /// is available.
    ///
    /// # Errors
    ///
    /// Returns an error if the container source is unknown, if the cache
    /// directory hierarchy cannot be created, or if the pull ultimately fails
    /// after retries.
    pub(crate) async fn pull_image(
        &self,
        executable: &str,
        container: &ContainerSource,
        token: CancellationToken,
    ) -> Result<Option<PathBuf>> {
        // For local SIF files, return the path directly.
        if let ContainerSource::SifFile(path) = container {
            return Ok(Some(path.clone()));
        }

        // For unknown container sources, error early.
        if let ContainerSource::Unknown(s) = container {
            bail!("unknown container source `{s}`");
        }

        // For registry-based images, pull and cache. The lock is only held long
        // enough to fetch (or create) the cell for this source; it is released
        // before any awaiting happens.
        let once = {
            let mut map = self
                .images
                .lock()
                .expect("image map lock should not be poisoned");
            map.entry(container.clone())
                .or_insert_with(|| Arc::new(OnceCell::new()))
                .clone()
        };

        let pull = once.get_or_try_init(|| async move {
            // File and unknown sources returned above, so the remaining sources
            // are expected to have both a scheme and a name.
            let scheme = container
                .scheme()
                .expect("registry-based container source should have a scheme");
            let name = container
                .name()
                .expect("registry-based container source should have a name");

            // Each `/`- or `:`-separated piece of the image name becomes its own
            // path component beneath the scheme directory.
            let mut path = self.cache_dir.join(scheme);
            for part in name.split(['/', ':']) {
                path.push(part);
            }

            path.add_extension("sif");

            // Use the async existence check so a slow filesystem does not stall
            // the executor thread; any error is treated as "not present",
            // matching `Path::exists`.
            if tokio::fs::try_exists(&path).await.unwrap_or(false) {
                debug!(path = %path.display(), "Apptainer image `{container:#}` already cached; using existing image");
                return Ok(path);
            }

            if let Some(parent) = path.parent() {
                tokio::fs::create_dir_all(parent).await.with_context(|| {
                    format!(
                        "failed to create directory `{parent}`",
                        parent = parent.display()
                    )
                })?;
            }

            let container = format!("{container:#}");
            let executable = executable.to_string();

            Retry::spawn_notify(
                // TODO ACF 2025-09-22: configure the retry behavior based on actual experience
                // with flakiness of the container registries. This is a
                // finger-in-the-wind guess at some reasonable parameters that
                // shouldn't lead to us making our own problems worse by
                // overwhelming registries with repeated retries.
                ExponentialBackoff::from_millis(50)
                    .max_delay_millis(60_000)
                    .take(10),
                || Self::try_pull_image(&executable, &container, &path),
                {
                    let executable = executable.clone();
                    move |e: &anyhow::Error, _| {
                        warn!(e = %e, "`{executable} pull` failed");
                    }
                },
            )
            .await
            .with_context(|| format!("failed pulling Apptainer image `{container}`"))?;

            debug!(path = %path.display(), "Apptainer image `{container}` pulled successfully");
            Ok(path)
        });

        // Whichever completes first wins: cancellation yields `None`, otherwise
        // the pull's result is returned.
        tokio::select! {
            _ = token.cancelled() => Ok(None),
            res = pull => res.map(|p| Some(p.clone())),
        }
    }

    /// Tries each candidate container in order until one pulls successfully.
    ///
    /// The outcome of every attempt is recorded in the returned
    /// [`PullResults`]. Iteration stops at the first success, so the results
    /// hold the failures (if any) that preceded it followed by the successful
    /// entry; if every candidate fails, the results hold only failures.
    ///
    /// Returns `None` if a pull was cancelled.
    pub(crate) async fn pull_first_available_image(
        &self,
        executable: &str,
        candidates: &[ContainerSource],
        token: CancellationToken,
    ) -> Option<PullResults<PathBuf>> {
        let mut results = PullResults::default();

        for candidate in candidates {
            debug!("attempting to pull container image `{candidate:#}`");

            let outcome = match self.pull_image(executable, candidate, token.clone()).await {
                // Cancellation aborts the whole search.
                Ok(None) => return None,
                Ok(Some(path)) => {
                    debug!("successfully pulled container image `{candidate:#}`");
                    Ok(path)
                }
                Err(e) => {
                    warn!("failed to pull container image `{candidate:#}`: {e:#}");
                    Err(e)
                }
            };

            let succeeded = outcome.is_ok();
            results.push(candidate.clone(), outcome);
            if succeeded {
                return Some(results);
            }
        }

        Some(results)
    }

    /// Tries to pull an image.
    ///
    /// The tricky thing about this function is determining whether a failure is
    /// transient or permanent. When in doubt, choose transient; the downside is
    /// a permanent failure may take longer to finally bring down an
    /// execution, but this is better for a long-running task than letting a
    /// transient failure bring it down before a retry.
    ///
    /// `apptainer pull` doesn't have a well-defined interface for us to tell
    /// whether a failure is transient, but as we gain experience recognizing
    /// its output patterns, we can enhance the fidelity of the error
    /// handling.
    async fn try_pull_image(
        executable: &str,
        image: &str,
        path: &Path,
    ) -> Result<(), RetryError<anyhow::Error>> {
        debug!("spawning `{executable}` to pull image `{image}`");

        let child = Command::new(executable)
            .stdin(Stdio::null())
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .arg("pull")
            .arg(path)
            .arg(image)
            .spawn()
            .with_context(|| {
                format!(
                    "failed to spawn `{executable} pull '{path}' '{image}'`",
                    path = path.display()
                )
            })
            // If the system can't handle spawning a process, we're better off failing quickly
            .map_err(RetryError::permanent)?;

        let output = child
            .wait_with_output()
            .await
            .context(format!("failed to wait for `{executable}`"))
            .map_err(RetryError::permanent)?;
        if !output.status.success() {
            let permanent = if let Ok(stderr) = str::from_utf8(&output.stderr) {
                let mut permanent = false;
                // A collection of strings observed in `apptainer pull` stderr in unrecoverable
                // conditions. Finding one of these in the output marks the attempt as a
                // permanent failure.
                let needles = ["manifest unknown", "403 (Forbidden)"];
                for needle in needles {
                    if stderr.contains(needle) {
                        permanent = true;
                        break;
                    }
                }

                permanent
            } else {
                false
            };

            let e = anyhow!(
                "`{executable}` failed: {status}: {stderr}",
                status = output.status,
                stderr = str::from_utf8(&output.stderr)
                    .unwrap_or("<output not UTF-8>")
                    .trim()
            );
            return if permanent {
                Err(RetryError::permanent(e))
            } else {
                Err(RetryError::transient(e))
            };
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use indexmap::IndexMap;
    use tempfile::TempDir;
    use url::Url;

    use super::*;
    use crate::ONE_GIBIBYTE;
    use crate::TaskInputs;
    use crate::backend::ExecuteTaskRequest;
    use crate::backend::TaskExecutionConstraints;
    use crate::config::DEFAULT_TASK_SHELL;

    /// Generates the script for a small example task laid out beneath `root`.
    ///
    /// The task's only container candidate is a `file://` URL to a `.sif` path
    /// under `root`, so nothing needs to be pulled. Panics if generation
    /// fails; returns `None` only if the operation was cancelled.
    async fn generate_example_script(root: &Path) -> Option<(String, ContainerSource)> {
        let mut env = IndexMap::new();
        env.insert("FOO".to_string(), "bar".to_string());
        env.insert("BAZ".to_string(), "\"quux\"".to_string());

        let sif_url = Url::from_file_path(root.join("non-existent.sif")).unwrap();
        let constraints = TaskExecutionConstraints {
            container: Some(vec![String::from(sif_url).parse().unwrap()]),
            cpu: 1.0,
            memory: ONE_GIBIBYTE as u64,
            gpu: Default::default(),
            fpga: Default::default(),
            disks: Default::default(),
        };
        let inputs = TaskInputs::default();
        let attempt_dir = root.join("0");
        let temp_dir = root.join("temp");

        let runtime = ApptainerRuntime::new(&root.join("runs"), None).unwrap();
        let outcome = runtime
            .generate_script(
                &ApptainerConfig::default(),
                DEFAULT_TASK_SHELL,
                &ExecuteTaskRequest {
                    id: "example-task",
                    command: "echo hello",
                    inputs: &inputs,
                    backend_inputs: &[],
                    requirements: &Default::default(),
                    hints: &Default::default(),
                    env: &env,
                    constraints: &constraints,
                    attempt_dir: &attempt_dir,
                    temp_dir: &temp_dir,
                },
                CancellationToken::new(),
            )
            .await;

        outcome
            .inspect_err(|e| eprintln!("{e:#?}"))
            .expect("example task script should generate")
    }

    #[tokio::test]
    async fn example_task_generates() {
        let root = TempDir::new().unwrap();
        let _ = generate_example_script(root.path()).await;
    }

    // `shellcheck` works quite differently on Windows, and since we're not going to
    // run Apptainer on Windows anytime soon, we limit this test to Unixy
    // systems
    #[cfg(unix)]
    #[tokio::test]
    async fn example_task_shellchecks() {
        use tokio::process::Command;

        let root = TempDir::new().unwrap();
        let (script, _) = generate_example_script(root.path())
            .await
            .expect("operation should not be canceled");

        let script_file = root.path().join("apptainer_script");
        tokio::fs::write(&script_file, &script)
            .await
            .expect("can write script to disk");

        let mut shellcheck = Command::new("shellcheck");
        shellcheck
            .arg("--shell=bash")
            .arg("--severity=style")
            // all the quotes in the generated `--mount` args look suspicious but are okay
            .arg("--exclude=SC2140")
            .arg(&script_file);
        let shellcheck_status = shellcheck.status().await.unwrap();
        assert!(shellcheck_status.success());
    }
}