// rustlift 2.0.2
//
// A typestate-driven deployment agent for Azure Web Apps
// Documentation
// Copyright (c) 2026 Hamze Ghalebi. All rights reserved.
// Licensed under the Rustlift Non-Commercial Licence v1.0.

//! Exponential-backoff retry wrapper for transient cloud operations.
//!
//! Cloud APIs are inherently unreliable: Azure throttles requests,
//! DNS resolution hiccups, TLS sessions time out. This module provides
//! a single function — [`reliable_op`] — that wraps any async operation
//! with automatic retry logic.
//!
//! # Design Rationale
//!
//! Without centralised retry logic, each call site in the pipeline would
//! need its own loop, sleep, and error classification. This creates
//! duplication and inconsistency. By routing every fallible operation
//! through `reliable_op`, the retry *policy* is defined once and the
//! error *classification* lives in one `match` block.
//!
//! # Retry Policy
//!
//! | Parameter          | Value   | Reasoning                                    |
//! |--------------------|---------|----------------------------------------------|
//! | Max elapsed time   | 5 min   | Enough for Azure cold-starts and DNS propagation |
//! | Max interval       | 30 s    | Prevents overwhelming the API during outages |
//! | Multiplier         | 1.5×    | Gentler growth than plain doubling (2×)      |
//!
//! # Learning: Higher-Order Functions in Rust
//!
//! `reliable_op` is a **higher-order function** — it accepts a *closure*
//! as an argument. The closure is called on every retry attempt. In Rust,
//! closures are expressed through the `Fn`, `FnMut`, and `FnOnce` traits.
//!
//! Here, the closure must implement `Fn()` (callable multiple times),
//! because the retry loop may call it repeatedly. If it were `FnOnce`,
//! only one attempt would be possible.

use backoff::future::retry;
use backoff::ExponentialBackoff;
use std::time::Duration;

use crate::errors::DeployError;

/// Executes an async closure with exponential backoff, classifying errors
/// as fatal (abort) or transient (retry) based on the [`DeployError`]
/// variant.
///
/// # Arguments
///
/// * `op_name` — A human-readable label emitted in retry and failure log
///   lines (e.g. `"Auth Handshake"`, `"CLI ZipDeploy"`).
/// * `f` — A closure that returns a `Future<Output = Result<T, DeployError>>`.
///   The closure is called on *every* attempt; it **must not** capture
///   mutable state that would violate the retry contract.
///
/// # Error Classification
///
/// The classification lives in the `match` block inside this function:
///
/// - **Fatal** (`Config`, `Dependency`, `Build`, `PathEncoding`):
///   User mistakes — retrying cannot fix them.
/// - **Transient** (everything else):
///   External systems — they often self-heal.
///
/// # Errors
///
/// Returns the **last** [`DeployError`] if:
/// - A fatal variant is encountered (immediate abort), or
/// - The operation never succeeds within the 5-minute window.
///
/// # Examples
///
/// ```
/// use rustlift::resilience::reliable_op;
/// use rustlift::errors::DeployError;
/// use std::sync::{
///     atomic::{AtomicUsize, Ordering},
///     Arc,
/// };
///
/// let attempts = Arc::new(AtomicUsize::new(0));
/// let attempts_for_retry = Arc::clone(&attempts);
///
/// let output = tokio::runtime::Runtime::new()
///     .unwrap()
///     .block_on(async move {
///         reliable_op("Transient Demo", || {
///             let attempts_for_try = Arc::clone(&attempts_for_retry);
///             async move {
///                 let seen = attempts_for_try.fetch_add(1, Ordering::SeqCst);
///                 if seen == 0 {
///                     Err(DeployError::Infra("temporary failure".into()))
///                 } else {
///                     Ok("ok")
///                 }
///             }
///         })
///         .await
///     })
///     .unwrap();
///
/// assert_eq!(output, "ok");
/// assert_eq!(attempts.load(Ordering::SeqCst), 2);
/// ```
///
/// # Panics
///
/// This function does not panic.
///
/// # Safety
///
/// Safe to call. It uses safe async primitives and does not require unsafe
/// caller guarantees.
///
/// # Learning: Generic Bounds
///
/// The signature `<F, Fut, T>` uses three generic parameters:
///
/// - `F: Fn() -> Fut` — the closure type.
/// - `Fut: Future<Output = Result<T, DeployError>>` — the future it returns.
/// - `T` — the success value.
///
/// This pattern is called **static dispatch**: the compiler generates
/// specialised code for each concrete `F`, so there is **zero runtime
/// overhead** compared to passing a function pointer or trait object.
pub async fn reliable_op<Operation, OperationFuture, SuccessValue>(
    op_name: &str,
    operation: Operation,
) -> std::result::Result<SuccessValue, DeployError>
where
    Operation: Fn() -> OperationFuture,
    OperationFuture: std::future::Future<Output = std::result::Result<SuccessValue, DeployError>>,
{
    // Build the backoff policy — these values are tuned for Azure workloads.
    let policy = ExponentialBackoff {
        max_elapsed_time: Some(Duration::from_secs(300)), // 5 minutes total
        max_interval: Duration::from_secs(30),            // cap between retries
        multiplier: 1.5,                                  // gentler than 2×
        ..Default::default()
    };

    retry(policy, || async {
        match operation().await {
            Ok(val) => Ok(val),
            Err(e) => {
                match e {
                    // Fatal errors — user must fix these manually
                    DeployError::Config(_)
                    | DeployError::Dependency(_)
                    | DeployError::Build(_)
                    | DeployError::PathEncoding(_) => {
                        tracing::error!("⛔ Fatal Error in '{}': {}", op_name, e);
                        Err(backoff::Error::Permanent(e))
                    }
                    // Transient errors — retry with increasing delay
                    _ => {
                        tracing::warn!("⚠️  Transient Error in '{}': {}. Retrying...", op_name, e);
                        Err(backoff::Error::transient(e))
                    }
                }
            }
        }
    })
    .await
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::Arc;

    /// Transient failures are retried until the operation succeeds.
    ///
    /// The operation reports `Infra` (transient) on its first two calls and
    /// succeeds on the third; the call counter must therefore end at 3.
    #[tokio::test]
    async fn retry_succeeds_eventually() {
        let calls = Arc::new(AtomicUsize::new(0));
        let calls_in_op = Arc::clone(&calls);

        let outcome = reliable_op("Test Op", || async {
            // `fetch_add` returns the previous value, so add 1 to get the
            // ordinal of the current attempt.
            let attempt = calls_in_op.fetch_add(1, Ordering::SeqCst) + 1;
            if attempt >= 3 {
                Ok("Success")
            } else {
                Err(DeployError::Infra("Transient boom".into()))
            }
        })
        .await;

        assert_eq!(outcome.unwrap(), "Success");
        assert_eq!(calls.load(Ordering::SeqCst), 3);
    }

    /// Fatal failures abort on the first attempt.
    ///
    /// The operation always reports `Config` (fatal); it must be invoked
    /// exactly once and the same variant must reach the caller.
    #[tokio::test]
    async fn fatal_error_aborts_immediately() {
        let calls = Arc::new(AtomicUsize::new(0));
        let calls_in_op = Arc::clone(&calls);

        let outcome: Result<(), _> = reliable_op("Fatal Op", || async {
            calls_in_op.fetch_add(1, Ordering::SeqCst);
            Err(DeployError::Config("Wrong Env".into()))
        })
        .await;

        assert!(matches!(outcome, Err(DeployError::Config(_))));
        assert_eq!(calls.load(Ordering::SeqCst), 1); // No retries
    }
}