Skip to main content

spider_middleware/
retry.rs

1//! Retry middleware.
2//!
3//! [`RetryMiddleware`] retries selected HTTP responses and request failures using
4//! exponential backoff and a configurable retry limit.
5
6use async_trait::async_trait;
7use log::{info, trace, warn};
8use std::time::Duration;
9
10use crate::middleware::{Middleware, MiddlewareAction};
11use spider_util::error::SpiderError;
12use spider_util::request::Request;
13use spider_util::response::Response;
14
/// Retry policy applied to HTTP responses and request failures.
///
/// A request is re-issued while its recorded attempt count is below
/// `max_retries`; the wait before each retry grows exponentially and is
/// capped at `max_delay`.
#[derive(Clone, Debug)]
pub struct RetryMiddleware {
    /// Upper bound on how many times a single request may be retried.
    pub max_retries: u32,
    /// Response status codes that cause the request to be retried.
    pub retry_http_codes: Vec<u16>,
    /// Multiplier, in seconds, for the exponential backoff:
    /// `delay = backoff_factor * 2^retries`.
    pub backoff_factor: f64,
    /// Ceiling applied to the computed delay between retries.
    pub max_delay: Duration,
}
27
28impl Default for RetryMiddleware {
29    fn default() -> Self {
30        let middleware = RetryMiddleware {
31            max_retries: 3,
32            retry_http_codes: vec![500, 502, 503, 504, 408, 429],
33            backoff_factor: 1.0,
34            max_delay: Duration::from_secs(180),
35        };
36        info!("Initializing RetryMiddleware with config: {:?}", middleware);
37        middleware
38    }
39}
40
41impl RetryMiddleware {
42    /// Creates a new `RetryMiddleware` with default settings.
43    pub fn new() -> Self {
44        Self::default()
45    }
46
47    /// Sets the maximum number of times to retry a request.
48    pub fn max_retries(mut self, max_retries: u32) -> Self {
49        self.max_retries = max_retries;
50        self
51    }
52
53    /// Sets the HTTP status codes that should trigger a retry.
54    pub fn retry_http_codes<I>(mut self, retry_http_codes: I) -> Self
55    where
56        I: IntoIterator<Item = u16>,
57    {
58        self.retry_http_codes = retry_http_codes.into_iter().collect();
59        self
60    }
61
62    /// Sets the factor for exponential backoff.
63    pub fn backoff_factor(mut self, backoff_factor: f64) -> Self {
64        self.backoff_factor = backoff_factor;
65        self
66    }
67
68    /// Sets the maximum delay between retries.
69    pub fn max_delay(mut self, max_delay: Duration) -> Self {
70        self.max_delay = max_delay;
71        self
72    }
73}
74
75#[async_trait]
76impl<C: Send + Sync> Middleware<C> for RetryMiddleware {
77    fn name(&self) -> &str {
78        "RetryMiddleware"
79    }
80
81    async fn process_request(
82        &self,
83        _client: &C,
84        request: Request,
85    ) -> Result<MiddlewareAction<Request>, SpiderError> {
86        Ok(MiddlewareAction::Continue(request))
87    }
88
89    async fn process_response(
90        &self,
91        response: Response,
92    ) -> Result<MiddlewareAction<Response>, SpiderError> {
93        trace!(
94            "Processing response for URL: {} with status: {}",
95            response.url, response.status
96        );
97
98        if self.retry_http_codes.contains(&response.status.as_u16()) {
99            let mut request = response.request_from_response();
100            let current_attempts = request.get_retry_attempts();
101
102            if current_attempts < self.max_retries {
103                request.increment_retry_attempts();
104                let delay = self.calculate_delay(current_attempts);
105                info!(
106                    "Retrying {} (status: {}, attempt {}/{}) after {:?}",
107                    request.url,
108                    response.status,
109                    current_attempts + 1,
110                    self.max_retries,
111                    delay
112                );
113                return Ok(MiddlewareAction::Retry(Box::new(request), delay));
114            } else {
115                warn!(
116                    "Max retries ({}) reached for {} (status: {}). Dropping response.",
117                    self.max_retries, request.url, response.status
118                );
119                return Ok(MiddlewareAction::Drop);
120            }
121        } else {
122            trace!(
123                "Response status {} is not in retry codes, continuing",
124                response.status
125            );
126        }
127
128        Ok(MiddlewareAction::Continue(response))
129    }
130
131    async fn handle_error(
132        &self,
133        request: &Request,
134        error: &SpiderError,
135    ) -> Result<MiddlewareAction<Request>, SpiderError> {
136        trace!("Handling error for request {}: {:?}", request.url, error);
137
138        if let SpiderError::ReqwestError(err_details) = error
139            && (err_details.is_connect || err_details.is_timeout)
140        {
141            let mut new_request = request.clone();
142            let current_attempts = new_request.get_retry_attempts();
143
144            if current_attempts < self.max_retries {
145                new_request.increment_retry_attempts();
146                let delay = self.calculate_delay(current_attempts);
147                info!(
148                    "Retrying {} (error: {}, attempt {}/{}) after {:?}",
149                    new_request.url,
150                    err_details.message,
151                    current_attempts + 1,
152                    self.max_retries,
153                    delay
154                );
155                return Ok(MiddlewareAction::Retry(Box::new(new_request), delay));
156            } else {
157                warn!(
158                    "Max retries ({}) reached for {} (error: {}). Dropping request.",
159                    self.max_retries, new_request.url, err_details.message
160                );
161                return Ok(MiddlewareAction::Drop);
162            }
163        } else {
164            trace!("Error is not a retryable error, returning original error");
165        }
166
167        Err(error.clone())
168    }
169}
170
171impl RetryMiddleware {
172    fn calculate_delay(&self, retries: u32) -> Duration {
173        let delay_secs = self.backoff_factor * (2.0f64.powi(retries as i32));
174        let delay = Duration::from_secs_f64(delay_secs);
175        delay.min(self.max_delay)
176    }
177}