podup 1.1.1

Translate and run docker-compose files on rootless Podman
Documentation
//! Health and completion polling for service dependency ordering.
//!
//! [`Engine::wait_healthy`] polls until the container reports `healthy` (used when
//! a dependent service declares `condition: service_healthy`).
//! [`Engine::wait_completed`] polls until the container exits with code 0 (used for
//! `condition: service_completed_successfully`).

use std::time::Duration;

use serde::Deserialize;

use crate::compose::types::{ComposeFile, Service};
use crate::error::{ComposeError, Result};
use crate::libpod::types::container::ContainerInspect;
use crate::libpod::API_PREFIX;

use super::Engine;

/// Body returned by `GET {API_PREFIX}/containers/{name}/healthcheck`.
///
/// That endpoint *runs* the container's healthcheck on demand and reports the
/// resulting status. Only the `Status` key is deserialized; it is optional so a
/// response without it still parses.
#[derive(Deserialize)]
#[serde(rename_all = "PascalCase")]
struct HealthCheckRun {
	/// Health status produced by the on-demand run (wire key `Status`).
	status: Option<String>,
}

/// Per-poll verdict while waiting for `service_healthy`.
///
/// Produced by [`classify_health`] from a single container inspect and consumed
/// by [`Engine::wait_healthy`], which returns early on the first two variants
/// and starts polling on the third.
enum HealthVerdict {
	/// The runtime reports the container as `healthy`.
	Healthy,
	/// The container has no effective healthcheck, so `healthy` is unreachable;
	/// treat the dependency as satisfied rather than blocking until timeout.
	NoHealthcheck,
	/// A healthcheck exists but has not reported `healthy` yet — keep polling.
	Pending,
}

/// Decide what one container inspect means for a `service_healthy` wait.
///
/// Evaluated in order:
/// 1. the inspect carries a health status equal to `"healthy"` →
///    [`HealthVerdict::Healthy`];
/// 2. there is no config, or the config's `has_healthcheck()` is false →
///    [`HealthVerdict::NoHealthcheck`];
/// 3. anything else → [`HealthVerdict::Pending`].
///
/// Kept free of I/O so the gating used by [`Engine::wait_healthy`] can be
/// unit-tested without a live Podman socket.
fn classify_health(info: &ContainerInspect) -> HealthVerdict {
	let reported_status = info
		.state
		.as_ref()
		.and_then(|state| state.health.as_ref())
		.and_then(|health| health.status.as_deref());
	if reported_status == Some("healthy") {
		return HealthVerdict::Healthy;
	}

	// A missing config block counts the same as "no healthcheck configured".
	let has_effective_check = match info.config.as_ref() {
		Some(config) => config.has_healthcheck(),
		None => false,
	};
	if has_effective_check {
		HealthVerdict::Pending
	} else {
		HealthVerdict::NoHealthcheck
	}
}

/// Compute the `(poll_interval_secs, iterations)` plan for
/// [`Engine::wait_healthy`] from a healthcheck's timing settings.
///
/// * `interval` — used as the poll period when it parses to at least 1 s;
///   otherwise the 2 s default applies (so the divisor below is never zero).
/// * `start_period` — counts as 0 s when absent or rejected by the parser.
/// * `retries` — number of probes to allow; defaults to 30.
///
/// The iteration count is `retries` plus the number of polls needed to span
/// `start_period`, rounded *up* so a start period that is not an exact multiple
/// of the interval is still fully covered. At least one probe is always
/// planned, so `retries: 0` cannot time out without ever checking. Pure so the
/// timing can be unit-tested.
fn health_poll_plan(
	interval: Option<&str>,
	start_period: Option<&str>,
	retries: Option<u32>,
) -> (u64, u64) {
	let poll_secs = interval
		.and_then(crate::size::parse_duration_secs)
		.filter(|secs| *secs >= 1)
		.unwrap_or(2);
	let start_secs = start_period
		.and_then(crate::size::parse_duration_secs)
		.unwrap_or(0);
	// Ceiling division, written without `start_secs + poll_secs - 1` so a very
	// large start period cannot overflow.
	let start_probes = start_secs / poll_secs + u64::from(start_secs % poll_secs != 0);
	let iterations = u64::from(retries.unwrap_or(30))
		.saturating_add(start_probes)
		.max(1);
	(poll_secs, iterations)
}

impl Engine {
	/// Poll a container until its health status is `healthy` or timeout.
	///
	/// Polls at the compose `healthcheck.interval` (default 2 s) for
	/// `healthcheck.retries` (default 30) probes, plus extra probes covering
	/// `healthcheck.start_period` so a slow-starting service is not timed out early.
	///
	/// The wait is driven by the container's *effective* healthcheck reported by
	/// the runtime, so healthchecks inherited from the image count too — not just
	/// those declared in compose. If the container has no effective healthcheck at
	/// all (none in the image or compose), it can never report `healthy`, so the
	/// wait short-circuits as satisfied rather than blocking until timeout.
	/// Wait until every targeted service's first replica is healthy (`up
	/// --wait`). A service with no effective healthcheck is treated as ready
	/// once started. All services when `target_services` is empty.
	pub async fn wait_services_healthy(
		&self,
		file: &ComposeFile,
		target_services: &[String],
	) -> Result<()> {
		for (name, service) in &file.services {
			if !target_services.is_empty() && !target_services.iter().any(|t| t == name) {
				continue;
			}
			let container = self.first_replica_name(name, service);
			self.wait_healthy(&container, service).await?;
		}
		Ok(())
	}

	pub(super) async fn wait_healthy(&self, container_name: &str, service: &Service) -> Result<()> {
		let hc = service.healthcheck.as_ref();
		let (poll_secs, iterations) = health_poll_plan(
			hc.and_then(|h| h.interval.as_deref()),
			hc.and_then(|h| h.start_period.as_deref()),
			hc.and_then(|h| h.retries),
		);

		// One inspect decides the short-circuits: already healthy, or no effective
		// healthcheck at all (image or compose) — in which case a server-side
		// `wait?condition=healthy` would block forever, so treat it as satisfied.
		let info = self
			.client
			.get_json::<crate::libpod::types::container::ContainerInspect>(&format!(
				"{API_PREFIX}/containers/{}/json",
				crate::libpod::urlencoded(container_name),
			))
			.await
			.map_err(ComposeError::Podman)?;
		match classify_health(&info) {
			HealthVerdict::Healthy => return Ok(()),
			HealthVerdict::NoHealthcheck => {
				tracing::debug!(
					"{container_name} has no effective healthcheck; treating service_healthy as satisfied"
				);
				return Ok(());
			}
			HealthVerdict::Pending => {}
		}

		// Actively drive the healthcheck on demand. A server-side
		// `wait?condition=healthy` only returns once the health *status* flips to
		// `healthy`, but Podman updates that status only when the healthcheck runs
		// — and it schedules those runs via systemd transient timers. Without
		// systemd (containers, minimal hosts) the timer never fires and the status
		// stays `starting`, so the wait would block until the whole budget elapsed.
		//
		// `GET {API_PREFIX}/containers/{name}/healthcheck` *runs* the check and
		// returns the resulting status, so polling it works with or without
		// systemd, on Podman 4.9.3 and 5.x alike. Extra on-demand runs on a
		// systemd host are harmless.
		let path = format!(
			"{API_PREFIX}/containers/{}/healthcheck",
			crate::libpod::urlencoded(container_name),
		);
		for _ in 0..iterations {
			match self.client.get_json::<HealthCheckRun>(&path).await {
				Ok(run) if run.status.as_deref() == Some("healthy") => return Ok(()),
				Ok(_) => {}
				// A transient error (container not yet running, 409, 500, …) just
				// means "not healthy yet" — keep polling rather than failing hard.
				Err(e) => tracing::debug!("{container_name} healthcheck run failed: {e}"),
			}
			tokio::time::sleep(Duration::from_secs(poll_secs)).await;
		}
		Err(ComposeError::HealthCheckTimeout(container_name.into()))
	}

	/// Wait until a container exits, then require status 0.
	///
	/// Blocks server-side on `wait?condition=stopped` (which returns the exit
	/// code) instead of polling inspect, bounded by a 600 s client-side timeout.
	/// Errors if the container exits non-zero or the deadline is exceeded.
	pub(super) async fn wait_completed(&self, container_name: &str) -> Result<()> {
		let path = format!(
			"{API_PREFIX}/containers/{}/wait?condition=stopped",
			crate::libpod::urlencoded(container_name),
		);
		let budget = std::time::Duration::from_secs(600);
		match tokio::time::timeout(budget, self.client.post_empty_json::<i64>(&path)).await {
			Ok(Ok(0)) => Ok(()),
			Ok(Ok(code)) => Err(ComposeError::HealthCheckTimeout(format!(
				"{container_name} exited with non-zero status {code}"
			))),
			Ok(Err(e)) => {
				tracing::debug!("{container_name} wait?condition=stopped failed: {e}");
				Err(ComposeError::HealthCheckTimeout(container_name.into()))
			}
			Err(_elapsed) => Err(ComposeError::HealthCheckTimeout(container_name.into())),
		}
	}
}

#[cfg(test)]
mod tests {
	use super::*;

	/// Deserialize an inline JSON fixture into a [`ContainerInspect`].
	fn inspect(json: &str) -> ContainerInspect {
		serde_json::from_str(json).expect("fixture parses")
	}

	// --- wait_healthy poll plan (#418) ---------------------------------------

	#[test]
	fn poll_plan_defaults_match_legacy_60s() {
		// With no healthcheck timing configured the plan is the historical
		// budget: a 2s poll repeated for 30 probes.
		let plan = health_poll_plan(None, None, None);
		assert_eq!(plan, (2, 30));
	}

	#[test]
	fn poll_plan_uses_interval_and_honors_start_period() {
		// Poll at the 10s interval; 3 retries + 60s/10s start-period probes = 9.
		let plan = health_poll_plan(Some("10s"), Some("60s"), Some(3));
		assert_eq!(plan, (10, 9));
	}

	#[test]
	fn poll_plan_sub_second_interval_floors_to_default() {
		// A sub-second interval must not cause busy-polling; it falls back to 2s.
		let (poll_secs, _iterations) = health_poll_plan(Some("500ms"), None, Some(5));
		assert_eq!(poll_secs, 2);
	}

	// --- wait_healthy gating (service_healthy) -------------------------------

	#[test]
	fn health_reported_healthy() {
		let fixture = r#"{"State":{"Status":"running","Health":{"Status":"healthy"}}}"#;
		let verdict = classify_health(&inspect(fixture));
		assert!(matches!(verdict, HealthVerdict::Healthy));
	}

	#[test]
	fn health_no_effective_healthcheck_is_satisfied() {
		// Test ["NONE"] disables the healthcheck, so `healthy` is unreachable and
		// the dependency must short-circuit as satisfied instead of timing out.
		let fixture =
			r#"{"State":{"Status":"running"},"Config":{"Healthcheck":{"Test":["NONE"]}}}"#;
		let verdict = classify_health(&inspect(fixture));
		assert!(matches!(verdict, HealthVerdict::NoHealthcheck));
	}

	#[test]
	fn health_starting_with_healthcheck_pends() {
		let fixture = r#"{"State":{"Status":"running","Health":{"Status":"starting"}},"Config":{"Healthcheck":{"Test":["CMD","true"]}}}"#;
		let verdict = classify_health(&inspect(fixture));
		assert!(matches!(verdict, HealthVerdict::Pending));
	}
}