spider_middleware/middleware.rs
1//! Middleware trait and control-flow types.
2//!
3//! Middleware can inspect or rewrite requests before download, inspect or
4//! rewrite responses afterwards, and decide what should happen next.
5
6use async_trait::async_trait;
7use std::any::Any;
8use std::time::Duration;
9
10use spider_util::error::SpiderError;
11use spider_util::request::Request;
12use spider_util::response::Response;
13
14#[allow(clippy::large_enum_variant)]
15/// Control-flow result returned by middleware hooks.
16///
17/// Not every variant is meaningful in every hook:
18/// - request hooks typically return `Continue`, `Drop`, or `ReturnResponse`
19/// - response hooks typically return `Continue`, `Drop`, or `Retry`
20/// - error hooks typically return `Continue`, `Drop`, or `Retry`
21pub enum MiddlewareAction<T> {
22 /// Continue processing with the provided item.
23 Continue(T),
24 /// Retry the Request after the specified duration. (Only valid for Response processing)
25 Retry(Box<Request>, Duration),
26 /// Drop the item, stopping further processing.
27 Drop,
28 /// Return a Response directly, bypassing the downloader. (Only valid for Request processing)
29 ReturnResponse(Response),
30}
31
32/// Trait implemented by request/response middleware.
33///
34/// Middleware runs around the downloader boundary:
35///
36/// 1. `process_request` sees outgoing requests before download
37/// 2. the downloader executes the request unless middleware short-circuits it
38/// 3. `process_response` sees successful responses
39/// 4. `handle_error` sees download failures
40///
41/// Each hook can continue normal processing, stop it, or redirect control
42/// flow through [`MiddlewareAction`].
43#[async_trait]
44pub trait Middleware<C: Send + Sync>: Any + Send + Sync + 'static {
45 /// Returns a human-readable middleware name for logs and diagnostics.
46 fn name(&self) -> &str;
47
48 /// Intercepts an outgoing request before the downloader runs.
49 ///
50 /// Typical uses include header injection, request filtering, cache lookup,
51 /// throttling, or proxy selection.
52 ///
53 /// Return:
54 /// - `Continue(request)` to keep normal processing
55 /// - `Drop` to stop processing that request entirely
56 /// - `ReturnResponse(response)` to bypass the downloader
57 async fn process_request(
58 &self,
59 _client: &C,
60 request: Request,
61 ) -> Result<MiddlewareAction<Request>, SpiderError> {
62 Ok(MiddlewareAction::Continue(request))
63 }
64 /// Intercepts a successful response after download.
65 ///
66 /// Typical uses include cache population, adaptive throttling, cookie
67 /// extraction, or retry decisions based on status/body.
68 ///
69 /// Return:
70 /// - `Continue(response)` to forward the response to later middleware and parsing
71 /// - `Drop` to stop processing the response
72 /// - `Retry(request, delay)` to reschedule work after an optional wait
73 async fn process_response(
74 &self,
75 response: Response,
76 ) -> Result<MiddlewareAction<Response>, SpiderError> {
77 Ok(MiddlewareAction::Continue(response))
78 }
79
80 /// Handles downloader errors for a request.
81 ///
82 /// The default behavior propagates the error unchanged. Override this for
83 /// retry policy, selective suppression, or custom recovery behavior.
84 ///
85 /// Return:
86 /// - `Continue(request)` to resubmit immediately
87 /// - `Drop` to swallow the error and stop processing
88 /// - `Retry(request, delay)` to resubmit after waiting
89 async fn handle_error(
90 &self,
91 _request: &Request,
92 error: &SpiderError,
93 ) -> Result<MiddlewareAction<Request>, SpiderError> {
94 // The default implementation is to just pass the error through by cloning it.
95 Err(error.clone())
96 }
97}