spider_middleware/
middleware.rs

1//! Middleware trait and control-flow types.
2//!
3//! Middleware can inspect or rewrite requests before download, inspect or
4//! rewrite responses afterwards, and decide what should happen next.
5
6use async_trait::async_trait;
7use std::any::Any;
8use std::time::Duration;
9
10use spider_util::error::SpiderError;
11use spider_util::request::Request;
12use spider_util::response::Response;
13
14#[allow(clippy::large_enum_variant)]
15/// Control-flow result returned by middleware hooks.
16///
17/// Not every variant is meaningful in every hook:
18/// - request hooks typically return `Continue`, `Drop`, or `ReturnResponse`
19/// - response hooks typically return `Continue`, `Drop`, or `Retry`
20/// - error hooks typically return `Continue`, `Drop`, or `Retry`
21pub enum MiddlewareAction<T> {
22    /// Continue processing with the provided item.
23    Continue(T),
24    /// Retry the Request after the specified duration. (Only valid for Response processing)
25    Retry(Box<Request>, Duration),
26    /// Drop the item, stopping further processing.
27    Drop,
28    /// Return a Response directly, bypassing the downloader. (Only valid for Request processing)
29    ReturnResponse(Response),
30}
31
32/// Trait implemented by request/response middleware.
33///
34/// Middleware runs around the downloader boundary:
35///
36/// 1. `process_request` sees outgoing requests before download
37/// 2. the downloader executes the request unless middleware short-circuits it
38/// 3. `process_response` sees successful responses
39/// 4. `handle_error` sees download failures
40///
41/// Each hook can continue normal processing, stop it, or redirect control
42/// flow through [`MiddlewareAction`].
43#[async_trait]
44pub trait Middleware<C: Send + Sync>: Any + Send + Sync + 'static {
45    /// Returns a human-readable middleware name for logs and diagnostics.
46    fn name(&self) -> &str;
47
48    /// Intercepts an outgoing request before the downloader runs.
49    ///
50    /// Typical uses include header injection, request filtering, cache lookup,
51    /// throttling, or proxy selection.
52    ///
53    /// Return:
54    /// - `Continue(request)` to keep normal processing
55    /// - `Drop` to stop processing that request entirely
56    /// - `ReturnResponse(response)` to bypass the downloader
57    async fn process_request(
58        &self,
59        _client: &C,
60        request: Request,
61    ) -> Result<MiddlewareAction<Request>, SpiderError> {
62        Ok(MiddlewareAction::Continue(request))
63    }
64    /// Intercepts a successful response after download.
65    ///
66    /// Typical uses include cache population, adaptive throttling, cookie
67    /// extraction, or retry decisions based on status/body.
68    ///
69    /// Return:
70    /// - `Continue(response)` to forward the response to later middleware and parsing
71    /// - `Drop` to stop processing the response
72    /// - `Retry(request, delay)` to reschedule work after an optional wait
73    async fn process_response(
74        &self,
75        response: Response,
76    ) -> Result<MiddlewareAction<Response>, SpiderError> {
77        Ok(MiddlewareAction::Continue(response))
78    }
79
80    /// Handles downloader errors for a request.
81    ///
82    /// The default behavior propagates the error unchanged. Override this for
83    /// retry policy, selective suppression, or custom recovery behavior.
84    ///
85    /// Return:
86    /// - `Continue(request)` to resubmit immediately
87    /// - `Drop` to swallow the error and stop processing
88    /// - `Retry(request, delay)` to resubmit after waiting
89    async fn handle_error(
90        &self,
91        _request: &Request,
92        error: &SpiderError,
93    ) -> Result<MiddlewareAction<Request>, SpiderError> {
94        // The default implementation is to just pass the error through by cloning it.
95        Err(error.clone())
96    }
97}
spider_middleware/middleware.rs

spider_middleware/
middleware.rs