use crate::config::models::ModelId;
use crate::error::{ErrorCode, Result as VtCodeResult, VtCodeError};
use crate::retry::RetryPolicy;
use serde::{Deserialize, Serialize};
use std::future::Future;
use std::result::Result as StdResult;
use std::time::{Duration, Instant};
use tokio::time::sleep;
use tracing::{info, warn};
/// Legacy retry configuration expressed in whole seconds.
///
/// Retained for backward compatibility (it is serde-serializable, so it may
/// appear in persisted configs); convert it to a `RetryPolicy` via the
/// `From<RetryConfig>` impl rather than using it directly in new code.
#[deprecated(note = "Use RetryPolicy directly for new code")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RetryConfig {
    /// Retry count forwarded to `RetryPolicy::from_retries`.
    pub max_retries: u32,
    /// Delay before the first retry, in seconds.
    pub initial_delay_secs: u64,
    /// Upper bound on any single backoff delay, in seconds.
    pub max_delay_secs: u64,
    /// Factor applied to grow the delay between attempts.
    pub backoff_multiplier: f64,
}
#[allow(deprecated)]
impl Default for RetryConfig {
fn default() -> Self {
Self {
max_retries: 5,
initial_delay_secs: 1,
max_delay_secs: 60,
backoff_multiplier: 2.0,
}
}
}
#[allow(deprecated)]
impl From<RetryConfig> for RetryPolicy {
    /// Translate the legacy seconds-based config into a `RetryPolicy`.
    fn from(config: RetryConfig) -> Self {
        let first_delay = Duration::from_secs(config.initial_delay_secs);
        let delay_ceiling = Duration::from_secs(config.max_delay_secs);
        RetryPolicy::from_retries(
            config.max_retries,
            first_delay,
            delay_ceiling,
            config.backoff_multiplier,
        )
    }
}
/// Counters accumulated by a `RetryManager` across operations.
#[derive(Debug, Clone, Default)]
pub struct RetryStats {
    // Attempts started against the primary model (first tries included;
    // fallback-model calls are NOT counted here — see execute_with_retry).
    pub total_attempts: u32,
    // Operations that succeeded on the second or a later primary attempt.
    pub successful_retries: u32,
    // Retry runs recorded as failed. NOTE(review): execute_with_retry bumps
    // this before the final attempt runs — confirm that accounting is intended.
    pub failed_retries: u32,
    // Times the fallback model was invoked after the primary was exhausted.
    pub fallback_activations: u32,
    // Cumulative time spent sleeping between primary attempts.
    pub total_backoff_time: Duration,
}
/// Drives model operations with policy-controlled retries and backoff,
/// records `RetryStats`, and can fall back to a secondary model once the
/// primary model's attempts are exhausted.
#[derive(Debug)]
pub struct RetryManager {
    // Decides retryability and backoff delay for each failed attempt.
    policy: RetryPolicy,
    // Counters mutated by execute_with_retry; read via stats().
    stats: RetryStats,
}
impl Default for RetryManager {
fn default() -> Self {
Self::new()
}
}
impl RetryManager {
    /// Create a manager with the built-in default policy
    /// (5 retries, 1s initial delay, 60s cap, 2.0 multiplier — mirrors
    /// `RetryConfig::default()`).
    pub fn new() -> Self {
        Self {
            policy: RetryPolicy::from_retries(
                5,
                Duration::from_secs(1),
                Duration::from_secs(60),
                2.0,
            ),
            stats: RetryStats::default(),
        }
    }

    /// Create a manager that uses the supplied policy, with zeroed stats.
    pub fn with_policy(policy: RetryPolicy) -> Self {
        Self {
            policy,
            stats: RetryStats::default(),
        }
    }

    /// Create a manager from the deprecated seconds-based `RetryConfig`.
    #[allow(deprecated)]
    pub fn with_config(config: RetryConfig) -> Self {
        Self::with_policy(config.into())
    }

    /// Read-only view of the accumulated retry statistics.
    pub fn stats(&self) -> &RetryStats {
        &self.stats
    }

    /// Reset all statistics counters to zero.
    pub fn reset_stats(&mut self) {
        self.stats = RetryStats::default();
    }

    /// Run `operation` against `primary_model`, retrying per `self.policy`,
    /// and — if every primary attempt fails — try `fallback_model` exactly
    /// once.
    ///
    /// Returns the first successful result, or the last error observed
    /// (wrapped with fallback context when the fallback also failed).
    /// Stats are updated throughout; see the inline notes for the exact
    /// accounting.
    pub async fn execute_with_retry<F, Fut, T, E>(
        &mut self,
        operation_name: &str,
        primary_model: &ModelId,
        fallback_model: Option<&ModelId>,
        operation: F,
    ) -> VtCodeResult<T>
    where
        F: Fn(ModelId) -> Fut,
        Fut: Future<Output = StdResult<T, E>>,
        E: Into<VtCodeError>,
        T: Clone,
    {
        let start_time = Instant::now();
        // Borrow the policy field separately so `self.stats` can still be
        // mutated inside the loop (disjoint field borrows).
        let policy = &self.policy;
        let mut last_error: Option<VtCodeError> = None;
        for attempt in 0..policy.max_attempts {
            self.stats.total_attempts += 1;
            info!(
                attempt = attempt + 1,
                max_attempts = policy.max_attempts,
                operation = operation_name,
                model = ?primary_model,
                "retry attempt starting"
            );
            match operation(*primary_model).await {
                Ok(result) => {
                    // Only count as a "successful retry" if a prior attempt failed.
                    if attempt > 0 {
                        self.stats.successful_retries += 1;
                        info!(
                            attempt = attempt + 1,
                            operation = operation_name,
                            model = ?primary_model,
                            "operation succeeded after retry"
                        );
                    }
                    return Ok(result);
                }
                Err(err) => {
                    let err: VtCodeError = err.into();
                    // The policy decides both retryability and the backoff
                    // delay for this attempt index.
                    let decision = policy.decision_for_vtcode_error(&err, attempt, None);
                    last_error = Some(err);
                    warn!(
                        attempt = attempt + 1,
                        max_attempts = policy.max_attempts,
                        operation = operation_name,
                        model = ?primary_model,
                        error = %last_error.as_ref().expect("retry error should exist"),
                        category = ?decision.category,
                        "operation attempt failed"
                    );
                    if !decision.retryable {
                        // A retryable-category error rejected on the final
                        // attempt means the retry budget was exhausted.
                        if attempt + 1 == policy.max_attempts
                            && last_error
                                .as_ref()
                                .is_some_and(|error| error.category.is_retryable())
                        {
                            self.stats.failed_retries += 1;
                        }
                        // NOTE(review): this early return skips the fallback
                        // model entirely, even on the last attempt — confirm
                        // that is intended for exhausted-but-retryable errors.
                        let err = last_error.expect("non-retryable error should exist");
                        warn!(
                            operation = operation_name,
                            error = %err,
                            category = ?decision.category,
                            "non-retryable error"
                        );
                        return Err(err);
                    }
                    // Policy invariant: a retryable decision always carries a delay.
                    let backoff_duration = decision.delay.expect("retryable decisions need delay");
                    self.stats.total_backoff_time += backoff_duration;
                    // NOTE(review): failed_retries is bumped BEFORE the final
                    // attempt runs; if that attempt then succeeds, both
                    // failed_retries and successful_retries end up incremented
                    // for the same operation — confirm this is intended.
                    if attempt + 2 == policy.max_attempts {
                        self.stats.failed_retries += 1;
                    }
                    info!(
                        delay_ms = backoff_duration.as_millis() as u64,
                        next_attempt = attempt + 2,
                        operation = operation_name,
                        category = ?decision.category,
                        "backing off before retry"
                    );
                    sleep(backoff_duration).await;
                }
            }
        }
        // Reached only when the policy still marked the last attempt's error
        // as retryable (the loop slept and then ran out of attempts).
        if let Some(fallback) = fallback_model {
            warn!(
                operation = operation_name,
                attempts = policy.max_attempts,
                primary_model = ?primary_model,
                fallback_model = ?fallback,
                "primary model failed; attempting fallback"
            );
            self.stats.fallback_activations += 1;
            // Single fallback call — no retry loop for the fallback model.
            match operation(*fallback).await {
                Ok(result) => {
                    info!(operation = operation_name, model = ?fallback, "fallback model succeeded");
                    return Ok(result);
                }
                Err(err) => {
                    let err: VtCodeError = err.into();
                    // Replace the primary error: the fallback failure (with
                    // context) becomes the error reported to the caller.
                    last_error = Some(err.with_context(format!(
                        "fallback model '{fallback}' failed for operation '{operation_name}'"
                    )));
                    warn!(
                        operation = operation_name,
                        model = ?fallback,
                        error = %last_error.as_ref().expect("fallback error should exist"),
                        "fallback model failed"
                    );
                }
            }
        }
        let total_time = start_time.elapsed();
        warn!(
            operation = operation_name,
            attempts = policy.max_attempts,
            total_time = %humantime::format_duration(total_time),
            primary_model = ?primary_model,
            fallback_model = ?fallback_model,
            "operation failed after retries"
        );
        // last_error should always be Some here; the fabricated error below
        // is a defensive catch-all for a zero-attempt policy.
        Err(last_error.unwrap_or_else(|| {
            VtCodeError::execution(
                ErrorCode::ToolExecutionFailed,
                format!(
                    "operation '{operation_name}' failed after {} attempts",
                    policy.max_attempts
                ),
            )
            .with_context(format!(
                "primary model: {primary_model}, fallback model: {fallback_model:?}"
            ))
        }))
    }
}
pub fn is_empty_response(response: &serde_json::Value) -> bool {
match response {
serde_json::Value::Null => true,
serde_json::Value::String(s) => s.trim().is_empty(),
serde_json::Value::Object(obj) => {
obj.is_empty() ||
(obj.get("candidates").is_some_and(|c| c.as_array().is_some_and(|arr| arr.is_empty()))) ||
(obj.get("content").is_some_and(|c| match c {
serde_json::Value::String(s) => s.trim().is_empty(),
serde_json::Value::Array(arr) => arr.is_empty(),
_ => false,
}))
}
serde_json::Value::Array(arr) => arr.is_empty(),
_ => false,
}
}
/// Convenience probe: would the default `RetryPolicy` retry this error on
/// its first attempt (attempt index 0)?
pub fn is_retryable_error(error: &anyhow::Error) -> bool {
    let policy = RetryPolicy::default();
    let decision = policy.decision_for_anyhow(error, 0, None);
    decision.retryable
}
#[cfg(test)]
#[allow(deprecated)]
mod tests {
    use super::*;
    use crate::error::{ErrorCode, VtCodeError};
    use anyhow::anyhow;
    use serde_json::json;
    use std::sync::{Arc, Mutex};

    // Empty-payload detection across each JSON shape the helper handles,
    // plus non-empty counterexamples.
    #[test]
    fn test_empty_response_detection() {
        assert!(is_empty_response(&serde_json::Value::Null));
        assert!(is_empty_response(&json!("")));
        assert!(is_empty_response(&json!(" ")));
        assert!(is_empty_response(&json!({})));
        assert!(is_empty_response(&json!([])));
        assert!(is_empty_response(&json!({"candidates": []})));
        assert!(is_empty_response(&json!({"content": ""})));
        assert!(is_empty_response(&json!({"content": []})));
        assert!(!is_empty_response(&json!("hello")));
        assert!(!is_empty_response(&json!({"content": "hello"})));
        assert!(!is_empty_response(
            &json!({"candidates": [{"content": "hello"}]})
        ));
    }

    // Transient errors (timeouts, 429/503, network) must be retryable;
    // auth/permission/model errors and quota exhaustion must not be.
    #[test]
    fn test_retryable_error_detection() {
        assert!(is_retryable_error(&anyhow!("Connection timeout")));
        assert!(is_retryable_error(&anyhow!("Rate limit exceeded")));
        assert!(is_retryable_error(&anyhow!("HTTP 503 Service Unavailable")));
        assert!(is_retryable_error(&anyhow!("Network error")));
        assert!(is_retryable_error(&anyhow!("HTTP 429 Too Many Requests")));
        assert!(is_retryable_error(&anyhow!("Error 429: rate limited")));
        assert!(!is_retryable_error(&anyhow!("Invalid API key")));
        assert!(!is_retryable_error(&anyhow!("Permission denied")));
        assert!(!is_retryable_error(&anyhow!("Invalid model")));
        // Quota exhaustion is permanent even when reported as a 429.
        assert!(!is_retryable_error(&anyhow!(
            "You exceeded your current quota"
        )));
        assert!(!is_retryable_error(&anyhow!("insufficient_quota")));
        assert!(!is_retryable_error(&anyhow!("429 quota exceeded")));
    }

    // Defaults must stay in sync with RetryManager::new().
    #[test]
    fn test_retry_config_defaults() {
        let config = RetryConfig::default();
        assert_eq!(config.max_retries, 5);
        assert_eq!(config.initial_delay_secs, 1);
        assert_eq!(config.max_delay_secs, 60);
        assert_eq!(config.backoff_multiplier, 2.0);
    }

    // First-attempt success: no retry or fallback counters move.
    #[tokio::test]
    async fn test_retry_manager_success_first_attempt() {
        let mut manager = RetryManager::new();
        let result = manager
            .execute_with_retry(
                "test_operation",
                &ModelId::Gemini3FlashPreview,
                Some(&ModelId::Gemini31ProPreview),
                |_model| async { Ok::<String, anyhow::Error>("success".to_owned()) },
            )
            .await;
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), "success");
        assert_eq!(manager.stats().total_attempts, 1);
        assert_eq!(manager.stats().successful_retries, 0);
        assert_eq!(manager.stats().fallback_activations, 0);
    }

    // Fail once with a retryable network error, succeed on the second
    // attempt; verifies attempt counting and successful_retries.
    #[tokio::test]
    async fn test_retry_manager_success_after_retry() {
        // initial_delay_secs of 0 keeps the test fast (no real backoff).
        let mut manager = RetryManager::with_config(RetryConfig {
            max_retries: 2,
            initial_delay_secs: 0,
            max_delay_secs: 1,
            backoff_multiplier: 2.0,
        });
        // Shared counter so the closure can track how many times it ran.
        let attempt_count = Arc::new(Mutex::new(0));
        let attempt_count_clone = attempt_count.clone();
        let result = manager
            .execute_with_retry(
                "test_operation",
                &ModelId::Gemini3FlashPreview,
                Some(&ModelId::Gemini31ProPreview),
                move |_model| {
                    let attempt_count = attempt_count_clone.clone();
                    async move {
                        let mut count = attempt_count.lock().unwrap();
                        *count += 1;
                        if *count < 2 {
                            Err(VtCodeError::network(
                                ErrorCode::ConnectionFailed,
                                "temporary failure",
                            ))
                        } else {
                            Ok::<String, VtCodeError>("success".to_owned())
                        }
                    }
                },
            )
            .await;
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), "success");
        assert_eq!(manager.stats().total_attempts, 2);
        assert_eq!(manager.stats().successful_retries, 1);
    }
}