scrapling_spider/request.rs
//! Request and callback types for the spider crawl pipeline.
//!
//! This module defines the core data that flows through the crawler:
//!
//! - [`Request`] -- a URL to fetch, together with scheduling priority, session
//!   routing, deduplication fingerprint, retry state, and an optional callback.
//! - [`Callback`] -- a boxed closure that turns a fetched [`Response`] into zero
//!   or more [`SpiderOutput`] values.
//! - [`SpiderOutput`] -- the two things a callback can produce: a scraped data
//!   item (`Item`) or a follow-up request (`FollowRequest`).
//!
//! Requests use a builder pattern (`Request::new(url).with_priority(10)`) and are
//! ordered by priority so the [`Scheduler`](crate::scheduler::Scheduler) always
//! processes the most important URLs first.
15
16use std::cmp::Ordering;
17use std::collections::HashMap;
18
19use sha1::{Digest, Sha1};
20
21use scrapling_fetch::Response;
22
23/// A boxed closure that processes an HTTP response and returns spider outputs.
24///
25/// Callbacks are attached to individual [`Request`]s via
26/// [`Request::with_callback`]. When the crawler receives a response for that
27/// request, it invokes the callback instead of the spider's default `parse`
28/// method. This lets you route different pages to different parsing logic.
29pub type Callback = Box<dyn Fn(Response) -> Vec<SpiderOutput> + Send + Sync>;
30
31/// The result of processing a response: either a scraped data item or a
32/// follow-up request to enqueue.
33///
34/// Your [`Spider::parse`](crate::spider::Spider::parse) implementation (or a
35/// per-request [`Callback`]) returns a `Vec<SpiderOutput>`. The crawler engine
36/// collects `Item` values into the final [`ItemList`](crate::result::ItemList)
37/// and feeds `FollowRequest` values back into the [`Scheduler`](crate::scheduler::Scheduler).
38#[derive(Debug)]
39pub enum SpiderOutput {
40 /// A scraped data item to be collected. The JSON value is passed through the
41 /// spider's `on_scraped_item` hook before being stored, which gives you a
42 /// chance to validate, transform, or drop it.
43 Item(serde_json::Value),
44 /// A new request to enqueue for crawling. The engine checks domain
45 /// restrictions and deduplication before actually scheduling it.
46 FollowRequest(Request),
47}
48
49/// A crawl request with URL, priority, metadata, and optional callback.
50///
51/// `Request` is the unit of work in the crawl pipeline. Create one with
52/// [`Request::new`], customize it with the builder methods (`with_priority`,
53/// `with_sid`, `with_callback`, etc.), and return it from your spider's `parse`
54/// method wrapped in [`SpiderOutput::FollowRequest`].
55///
56/// Two requests are considered equal if their fingerprints match (or, if no
57/// fingerprint has been computed, if their URLs match). Ordering is by priority
58/// (higher values are dequeued first).
59pub struct Request {
60 /// The URL to fetch. This is the only required field; everything else has
61 /// sensible defaults set by [`Request::new`].
62 pub url: String,
63 /// The session identifier used to select a fetcher from the
64 /// [`SessionManager`](crate::session::SessionManager). An empty string
65 /// means "use the default session."
66 pub sid: String,
67 /// An optional callback to process the response. When present, the engine
68 /// calls this closure instead of [`Spider::parse`](crate::spider::Spider::parse).
69 /// Because closures are not cloneable, use [`copy_without_callback`](Request::copy_without_callback)
70 /// when you need to duplicate a request for retries.
71 pub callback: Option<Callback>,
72 /// The name of the callback, kept for debugging output and checkpoint
73 /// serialization. It has no effect on routing; the actual closure in
74 /// `callback` is what gets invoked.
75 pub callback_name: Option<String>,
76 /// The scheduling priority. Higher values are dequeued first by the
77 /// [`Scheduler`](crate::scheduler::Scheduler). The default is 0. Use
78 /// negative values to de-prioritize retries or background pages.
79 pub priority: i32,
80 /// Whether to bypass the duplicate-request filter. Set this to `true` when
81 /// you intentionally want to re-fetch a URL -- for example, to poll a page
82 /// for updates or to retry after a transient failure.
83 pub dont_filter: bool,
84 /// Arbitrary metadata passed through the crawl pipeline. Whatever you put
85 /// here is available on the request when it reaches your callback, which is
86 /// useful for carrying context (e.g., a parent-page ID) between parse stages.
87 pub meta: HashMap<String, serde_json::Value>,
88 /// The number of times this request has been retried after receiving a
89 /// blocked response. The engine increments this automatically and stops
90 /// retrying once it exceeds [`Spider::max_blocked_retries`](crate::spider::Spider::max_blocked_retries).
91 pub retry_count: u32,
92 /// Additional keyword arguments forwarded to the session fetcher. Common
93 /// keys include `"method"`, `"headers"`, `"data"`, and `"json"`. These are
94 /// also factored into the deduplication fingerprint when
95 /// [`Spider::fp_include_kwargs`](crate::spider::Spider::fp_include_kwargs) is enabled.
96 pub session_kwargs: HashMap<String, serde_json::Value>,
97 fingerprint: Option<Vec<u8>>,
98}
99
100impl Request {
101 /// Creates a new request for the given URL with default settings.
102 ///
103 /// All fields are initialized to their zero/empty values: priority 0, no
104 /// session override, no callback, no metadata, and duplicate filtering
105 /// enabled. Use the `with_*` builder methods to customize.
106 pub fn new(url: impl Into<String>) -> Self {
107 Self {
108 url: url.into(),
109 sid: String::new(),
110 callback: None,
111 callback_name: None,
112 priority: 0,
113 dont_filter: false,
114 meta: HashMap::new(),
115 retry_count: 0,
116 session_kwargs: HashMap::new(),
117 fingerprint: None,
118 }
119 }
120
121 /// Sets the session identifier for this request, routing it to a specific
122 /// fetcher registered in the [`SessionManager`](crate::session::SessionManager).
123 /// Use this when your spider manages multiple sessions with different cookies,
124 /// proxies, or authentication contexts.
125 pub fn with_sid(mut self, sid: impl Into<String>) -> Self {
126 self.sid = sid.into();
127 self
128 }
129
130 /// Sets the scheduling priority for this request. Higher values are
131 /// dequeued first. Use positive values for important pages (e.g., product
132 /// detail pages) and negative values for low-priority background work.
133 pub fn with_priority(mut self, priority: i32) -> Self {
134 self.priority = priority;
135 self
136 }
137
138 /// Sets whether this request should bypass the duplicate-request filter.
139 /// Pass `true` to allow re-fetching a URL that has already been seen. This
140 /// is useful for polling pages that change over time or for manual retries.
141 pub fn with_dont_filter(mut self, dont_filter: bool) -> Self {
142 self.dont_filter = dont_filter;
143 self
144 }
145
146 /// Attaches arbitrary metadata to this request. The metadata map is carried
147 /// through the entire crawl pipeline and is accessible in your callback or
148 /// `parse` implementation, making it the standard way to pass context (such
149 /// as a parent URL or category label) between crawl stages.
150 pub fn with_meta(mut self, meta: HashMap<String, serde_json::Value>) -> Self {
151 self.meta = meta;
152 self
153 }
154
155 /// Attaches a named callback to process the response for this request.
156 /// When the engine receives the response, it will call this closure instead
157 /// of [`Spider::parse`](crate::spider::Spider::parse). The `name` is stored
158 /// for debugging and checkpoint serialization; it does not affect dispatch.
159 pub fn with_callback(mut self, name: &str, callback: Callback) -> Self {
160 self.callback_name = Some(name.to_owned());
161 self.callback = Some(callback);
162 self
163 }
164
165 /// Extracts the domain (host) from the request URL. Returns an empty
166 /// string if the URL cannot be parsed. This is used internally for domain
167 /// allowlisting and per-domain statistics, but you can also call it in your
168 /// own code to inspect which host a request targets.
169 pub fn domain(&self) -> String {
170 url::Url::parse(&self.url)
171 .ok()
172 .and_then(|u| u.host_str().map(|h| h.to_owned()))
173 .unwrap_or_default()
174 }
175
176 /// Computes and caches a SHA-1 fingerprint for deduplication, returning it
177 /// as a byte slice.
178 ///
179 /// The fingerprint is derived from the session ID, HTTP method, URL, and
180 /// request body. The boolean flags control whether session kwargs, headers,
181 /// and URL fragments are also included. Once computed, the fingerprint is
182 /// cached so subsequent calls are free. The [`Scheduler`](crate::scheduler::Scheduler)
183 /// calls this automatically when a request is enqueued.
184 pub fn update_fingerprint(
185 &mut self,
186 include_kwargs: bool,
187 include_headers: bool,
188 keep_fragments: bool,
189 ) -> &[u8] {
190 if let Some(ref fp) = self.fingerprint {
191 return fp;
192 }
193
194 let mut url = self.url.clone();
195 if !keep_fragments {
196 if let Some(pos) = url.find('#') {
197 url.truncate(pos);
198 }
199 }
200
201 let method = self
202 .session_kwargs
203 .get("method")
204 .and_then(|v| v.as_str())
205 .unwrap_or("GET")
206 .to_uppercase();
207
208 let body = self.extract_body_hex();
209
210 let mut parts = serde_json::Map::new();
211 parts.insert("sid".into(), serde_json::Value::String(self.sid.clone()));
212 parts.insert("method".into(), serde_json::Value::String(method));
213 parts.insert("url".into(), serde_json::Value::String(url));
214 parts.insert("body".into(), serde_json::Value::String(body));
215
216 if include_kwargs {
217 let mut keys: Vec<&String> = self.session_kwargs.keys().collect();
218 keys.sort();
219 let hex = hex::encode(format!("{keys:?}"));
220 parts.insert("kwargs".into(), serde_json::Value::String(hex));
221 }
222
223 if include_headers {
224 if let Some(headers) = self.session_kwargs.get("headers") {
225 let s = serde_json::to_string(headers).unwrap_or_default();
226 parts.insert("headers".into(), serde_json::Value::String(s));
227 }
228 }
229
230 let serialized = serde_json::to_vec(&parts).unwrap_or_default();
231 let mut hasher = Sha1::new();
232 hasher.update(&serialized);
233 let fp = hasher.finalize().to_vec();
234 self.fingerprint = Some(fp);
235 self.fingerprint.as_ref().unwrap()
236 }
237
238 fn extract_body_hex(&self) -> String {
239 if let Some(data) = self.session_kwargs.get("data") {
240 if let Some(s) = data.as_str() {
241 return hex::encode(s.as_bytes());
242 }
243 return hex::encode(serde_json::to_vec(data).unwrap_or_default());
244 }
245 if let Some(json) = self.session_kwargs.get("json") {
246 return hex::encode(serde_json::to_vec(json).unwrap_or_default());
247 }
248 String::new()
249 }
250
251 /// Returns the cached fingerprint, if one has been computed via
252 /// [`update_fingerprint`](Request::update_fingerprint). Returns `None` if
253 /// the fingerprint has never been calculated. The cache manager uses this to
254 /// look up previously stored responses.
255 pub fn fingerprint(&self) -> Option<&[u8]> {
256 self.fingerprint.as_deref()
257 }
258
259 /// Creates a clone of this request without the callback closure.
260 ///
261 /// Because [`Callback`] is a boxed `dyn Fn` and cannot be cloned, this
262 /// method copies every field except `callback` (which is set to `None`).
263 /// The engine uses this when creating retry requests for blocked responses.
264 pub fn copy_without_callback(&self) -> Self {
265 Self {
266 url: self.url.clone(),
267 sid: self.sid.clone(),
268 callback: None,
269 callback_name: self.callback_name.clone(),
270 priority: self.priority,
271 dont_filter: self.dont_filter,
272 meta: self.meta.clone(),
273 retry_count: self.retry_count,
274 session_kwargs: self.session_kwargs.clone(),
275 fingerprint: self.fingerprint.clone(),
276 }
277 }
278}
279
280impl std::fmt::Debug for Request {
281 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
282 f.debug_struct("Request")
283 .field("url", &self.url)
284 .field("priority", &self.priority)
285 .field("callback", &self.callback_name)
286 .finish()
287 }
288}
289
290impl std::fmt::Display for Request {
291 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
292 write!(f, "{}", self.url)
293 }
294}
295
296impl PartialEq for Request {
297 fn eq(&self, other: &Self) -> bool {
298 match (&self.fingerprint, &other.fingerprint) {
299 (Some(a), Some(b)) => a == b,
300 _ => self.url == other.url,
301 }
302 }
303}
304
305impl Eq for Request {}
306
307impl PartialOrd for Request {
308 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
309 Some(self.cmp(other))
310 }
311}
312
313impl Ord for Request {
314 fn cmp(&self, other: &Self) -> Ordering {
315 self.priority.cmp(&other.priority)
316 }
317}