scrapling_spider/
request.rs

//! Request and callback types for the spider crawl pipeline.
//!
//! This module defines the core data that flows through the crawler:
//!
//! - [`Request`] -- a URL to fetch, together with scheduling priority, session
//!   routing, deduplication fingerprint, retry state, and an optional callback.
//! - [`Callback`] -- a boxed closure that turns a fetched [`Response`] into zero
//!   or more [`SpiderOutput`] values.
//! - [`SpiderOutput`] -- the two kinds of value a callback can produce: a scraped
//!   data item (`Item`) or a follow-up request (`FollowRequest`).
//!
//! Requests use a builder pattern (`Request::new(url).with_priority(10)`) and are
//! ordered by priority so the [`Scheduler`](crate::scheduler::Scheduler) always
//! processes the most important URLs first.
15
16use std::cmp::Ordering;
17use std::collections::HashMap;
18
19use sha1::{Digest, Sha1};
20
21use scrapling_fetch::Response;
22
23/// A boxed closure that processes an HTTP response and returns spider outputs.
24///
25/// Callbacks are attached to individual [`Request`]s via
26/// [`Request::with_callback`]. When the crawler receives a response for that
27/// request, it invokes the callback instead of the spider's default `parse`
28/// method. This lets you route different pages to different parsing logic.
29pub type Callback = Box<dyn Fn(Response) -> Vec<SpiderOutput> + Send + Sync>;
30
31/// The result of processing a response: either a scraped data item or a
32/// follow-up request to enqueue.
33///
34/// Your [`Spider::parse`](crate::spider::Spider::parse) implementation (or a
35/// per-request [`Callback`]) returns a `Vec<SpiderOutput>`. The crawler engine
36/// collects `Item` values into the final [`ItemList`](crate::result::ItemList)
37/// and feeds `FollowRequest` values back into the [`Scheduler`](crate::scheduler::Scheduler).
38#[derive(Debug)]
39pub enum SpiderOutput {
40    /// A scraped data item to be collected. The JSON value is passed through the
41    /// spider's `on_scraped_item` hook before being stored, which gives you a
42    /// chance to validate, transform, or drop it.
43    Item(serde_json::Value),
44    /// A new request to enqueue for crawling. The engine checks domain
45    /// restrictions and deduplication before actually scheduling it.
46    FollowRequest(Request),
47}
48
49/// A crawl request with URL, priority, metadata, and optional callback.
50///
51/// `Request` is the unit of work in the crawl pipeline. Create one with
52/// [`Request::new`], customize it with the builder methods (`with_priority`,
53/// `with_sid`, `with_callback`, etc.), and return it from your spider's `parse`
54/// method wrapped in [`SpiderOutput::FollowRequest`].
55///
56/// Two requests are considered equal if their fingerprints match (or, if no
57/// fingerprint has been computed, if their URLs match). Ordering is by priority
58/// (higher values are dequeued first).
59pub struct Request {
60    /// The URL to fetch. This is the only required field; everything else has
61    /// sensible defaults set by [`Request::new`].
62    pub url: String,
63    /// The session identifier used to select a fetcher from the
64    /// [`SessionManager`](crate::session::SessionManager). An empty string
65    /// means "use the default session."
66    pub sid: String,
67    /// An optional callback to process the response. When present, the engine
68    /// calls this closure instead of [`Spider::parse`](crate::spider::Spider::parse).
69    /// Because closures are not cloneable, use [`copy_without_callback`](Request::copy_without_callback)
70    /// when you need to duplicate a request for retries.
71    pub callback: Option<Callback>,
72    /// The name of the callback, kept for debugging output and checkpoint
73    /// serialization. It has no effect on routing; the actual closure in
74    /// `callback` is what gets invoked.
75    pub callback_name: Option<String>,
76    /// The scheduling priority. Higher values are dequeued first by the
77    /// [`Scheduler`](crate::scheduler::Scheduler). The default is 0. Use
78    /// negative values to de-prioritize retries or background pages.
79    pub priority: i32,
80    /// Whether to bypass the duplicate-request filter. Set this to `true` when
81    /// you intentionally want to re-fetch a URL -- for example, to poll a page
82    /// for updates or to retry after a transient failure.
83    pub dont_filter: bool,
84    /// Arbitrary metadata passed through the crawl pipeline. Whatever you put
85    /// here is available on the request when it reaches your callback, which is
86    /// useful for carrying context (e.g., a parent-page ID) between parse stages.
87    pub meta: HashMap<String, serde_json::Value>,
88    /// The number of times this request has been retried after receiving a
89    /// blocked response. The engine increments this automatically and stops
90    /// retrying once it exceeds [`Spider::max_blocked_retries`](crate::spider::Spider::max_blocked_retries).
91    pub retry_count: u32,
92    /// Additional keyword arguments forwarded to the session fetcher. Common
93    /// keys include `"method"`, `"headers"`, `"data"`, and `"json"`. These are
94    /// also factored into the deduplication fingerprint when
95    /// [`Spider::fp_include_kwargs`](crate::spider::Spider::fp_include_kwargs) is enabled.
96    pub session_kwargs: HashMap<String, serde_json::Value>,
97    fingerprint: Option<Vec<u8>>,
98}
99
100impl Request {
101    /// Creates a new request for the given URL with default settings.
102    ///
103    /// All fields are initialized to their zero/empty values: priority 0, no
104    /// session override, no callback, no metadata, and duplicate filtering
105    /// enabled. Use the `with_*` builder methods to customize.
106    pub fn new(url: impl Into<String>) -> Self {
107        Self {
108            url: url.into(),
109            sid: String::new(),
110            callback: None,
111            callback_name: None,
112            priority: 0,
113            dont_filter: false,
114            meta: HashMap::new(),
115            retry_count: 0,
116            session_kwargs: HashMap::new(),
117            fingerprint: None,
118        }
119    }
120
121    /// Sets the session identifier for this request, routing it to a specific
122    /// fetcher registered in the [`SessionManager`](crate::session::SessionManager).
123    /// Use this when your spider manages multiple sessions with different cookies,
124    /// proxies, or authentication contexts.
125    pub fn with_sid(mut self, sid: impl Into<String>) -> Self {
126        self.sid = sid.into();
127        self
128    }
129
130    /// Sets the scheduling priority for this request. Higher values are
131    /// dequeued first. Use positive values for important pages (e.g., product
132    /// detail pages) and negative values for low-priority background work.
133    pub fn with_priority(mut self, priority: i32) -> Self {
134        self.priority = priority;
135        self
136    }
137
138    /// Sets whether this request should bypass the duplicate-request filter.
139    /// Pass `true` to allow re-fetching a URL that has already been seen. This
140    /// is useful for polling pages that change over time or for manual retries.
141    pub fn with_dont_filter(mut self, dont_filter: bool) -> Self {
142        self.dont_filter = dont_filter;
143        self
144    }
145
146    /// Attaches arbitrary metadata to this request. The metadata map is carried
147    /// through the entire crawl pipeline and is accessible in your callback or
148    /// `parse` implementation, making it the standard way to pass context (such
149    /// as a parent URL or category label) between crawl stages.
150    pub fn with_meta(mut self, meta: HashMap<String, serde_json::Value>) -> Self {
151        self.meta = meta;
152        self
153    }
154
155    /// Attaches a named callback to process the response for this request.
156    /// When the engine receives the response, it will call this closure instead
157    /// of [`Spider::parse`](crate::spider::Spider::parse). The `name` is stored
158    /// for debugging and checkpoint serialization; it does not affect dispatch.
159    pub fn with_callback(mut self, name: &str, callback: Callback) -> Self {
160        self.callback_name = Some(name.to_owned());
161        self.callback = Some(callback);
162        self
163    }
164
165    /// Extracts the domain (host) from the request URL. Returns an empty
166    /// string if the URL cannot be parsed. This is used internally for domain
167    /// allowlisting and per-domain statistics, but you can also call it in your
168    /// own code to inspect which host a request targets.
169    pub fn domain(&self) -> String {
170        url::Url::parse(&self.url)
171            .ok()
172            .and_then(|u| u.host_str().map(|h| h.to_owned()))
173            .unwrap_or_default()
174    }
175
176    /// Computes and caches a SHA-1 fingerprint for deduplication, returning it
177    /// as a byte slice.
178    ///
179    /// The fingerprint is derived from the session ID, HTTP method, URL, and
180    /// request body. The boolean flags control whether session kwargs, headers,
181    /// and URL fragments are also included. Once computed, the fingerprint is
182    /// cached so subsequent calls are free. The [`Scheduler`](crate::scheduler::Scheduler)
183    /// calls this automatically when a request is enqueued.
184    pub fn update_fingerprint(
185        &mut self,
186        include_kwargs: bool,
187        include_headers: bool,
188        keep_fragments: bool,
189    ) -> &[u8] {
190        if let Some(ref fp) = self.fingerprint {
191            return fp;
192        }
193
194        let mut url = self.url.clone();
195        if !keep_fragments {
196            if let Some(pos) = url.find('#') {
197                url.truncate(pos);
198            }
199        }
200
201        let method = self
202            .session_kwargs
203            .get("method")
204            .and_then(|v| v.as_str())
205            .unwrap_or("GET")
206            .to_uppercase();
207
208        let body = self.extract_body_hex();
209
210        let mut parts = serde_json::Map::new();
211        parts.insert("sid".into(), serde_json::Value::String(self.sid.clone()));
212        parts.insert("method".into(), serde_json::Value::String(method));
213        parts.insert("url".into(), serde_json::Value::String(url));
214        parts.insert("body".into(), serde_json::Value::String(body));
215
216        if include_kwargs {
217            let mut keys: Vec<&String> = self.session_kwargs.keys().collect();
218            keys.sort();
219            let hex = hex::encode(format!("{keys:?}"));
220            parts.insert("kwargs".into(), serde_json::Value::String(hex));
221        }
222
223        if include_headers {
224            if let Some(headers) = self.session_kwargs.get("headers") {
225                let s = serde_json::to_string(headers).unwrap_or_default();
226                parts.insert("headers".into(), serde_json::Value::String(s));
227            }
228        }
229
230        let serialized = serde_json::to_vec(&parts).unwrap_or_default();
231        let mut hasher = Sha1::new();
232        hasher.update(&serialized);
233        let fp = hasher.finalize().to_vec();
234        self.fingerprint = Some(fp);
235        self.fingerprint.as_ref().unwrap()
236    }
237
238    fn extract_body_hex(&self) -> String {
239        if let Some(data) = self.session_kwargs.get("data") {
240            if let Some(s) = data.as_str() {
241                return hex::encode(s.as_bytes());
242            }
243            return hex::encode(serde_json::to_vec(data).unwrap_or_default());
244        }
245        if let Some(json) = self.session_kwargs.get("json") {
246            return hex::encode(serde_json::to_vec(json).unwrap_or_default());
247        }
248        String::new()
249    }
250
251    /// Returns the cached fingerprint, if one has been computed via
252    /// [`update_fingerprint`](Request::update_fingerprint). Returns `None` if
253    /// the fingerprint has never been calculated. The cache manager uses this to
254    /// look up previously stored responses.
255    pub fn fingerprint(&self) -> Option<&[u8]> {
256        self.fingerprint.as_deref()
257    }
258
259    /// Creates a clone of this request without the callback closure.
260    ///
261    /// Because [`Callback`] is a boxed `dyn Fn` and cannot be cloned, this
262    /// method copies every field except `callback` (which is set to `None`).
263    /// The engine uses this when creating retry requests for blocked responses.
264    pub fn copy_without_callback(&self) -> Self {
265        Self {
266            url: self.url.clone(),
267            sid: self.sid.clone(),
268            callback: None,
269            callback_name: self.callback_name.clone(),
270            priority: self.priority,
271            dont_filter: self.dont_filter,
272            meta: self.meta.clone(),
273            retry_count: self.retry_count,
274            session_kwargs: self.session_kwargs.clone(),
275            fingerprint: self.fingerprint.clone(),
276        }
277    }
278}
279
280impl std::fmt::Debug for Request {
281    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
282        f.debug_struct("Request")
283            .field("url", &self.url)
284            .field("priority", &self.priority)
285            .field("callback", &self.callback_name)
286            .finish()
287    }
288}
289
290impl std::fmt::Display for Request {
291    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
292        write!(f, "{}", self.url)
293    }
294}
295
296impl PartialEq for Request {
297    fn eq(&self, other: &Self) -> bool {
298        match (&self.fingerprint, &other.fingerprint) {
299            (Some(a), Some(b)) => a == b,
300            _ => self.url == other.url,
301        }
302    }
303}
304
305impl Eq for Request {}
306
307impl PartialOrd for Request {
308    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
309        Some(self.cmp(other))
310    }
311}
312
313impl Ord for Request {
314    fn cmp(&self, other: &Self) -> Ordering {
315        self.priority.cmp(&other.priority)
316    }
317}
scrapling_spider/request.rs

scrapling_spider/
request.rs