spider_util/request.rs
1//! Request types used by the crawler runtime.
2//!
3//! [`Request`] is the runtime's transport-neutral request model. It stores the
4//! URL, method, headers, optional body, and a lazily allocated metadata map used
5//! by middleware and runtime internals.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::request::{Request, Body};
11//! use url::Url;
12//! use serde_json::json;
13//!
14//! // Create a simple GET request
15//! let url = Url::parse("https://example.com").unwrap();
16//! let request = Request::new(url);
17//!
18//! // Create a POST request with JSON body
19//! let post_request = Request::new(Url::parse("https://api.example.com/data").unwrap())
20//! .with_method(reqwest::Method::POST)
21//! .with_json(json!({"key": "value"}));
22//! ```
23
24use bytes::Bytes;
25use dashmap::DashMap;
26use http::header::HeaderMap;
27use reqwest::{Method, Url};
28use serde::{Deserialize, Serialize};
29use serde_json::Value;
30use std::collections::HashMap;
31use std::hash::Hasher;
32use std::str::FromStr;
33use std::sync::Arc;
34use twox_hash::XxHash64;
35
36use crate::error::SpiderError;
37
38/// Request body variants supported by the default downloader.
39///
40/// ## Example
41///
42/// ```rust,ignore
43/// use spider_util::request::Body;
44/// use serde_json::json;
45/// use dashmap::DashMap;
46/// use bytes::Bytes;
47///
48/// // JSON body
49/// let json_body = Body::Json(json!({"name": "test"}));
50///
51/// // Form data
52/// let mut form = DashMap::new();
53/// form.insert("key".to_string(), "value".to_string());
54/// let form_body = Body::Form(form);
55///
56/// // Raw bytes
57/// let bytes_body = Body::Bytes(Bytes::from("raw data"));
58/// ```
59#[derive(Debug, Clone)]
60pub enum Body {
61 /// JSON payload.
62 Json(serde_json::Value),
63 /// Form data (key-value pairs).
64 Form(DashMap<String, String>),
65 /// Raw binary data.
66 Bytes(Bytes),
67}
68
69// Custom serialization for Body enum
70impl Serialize for Body {
71 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
72 where
73 S: serde::Serializer,
74 {
75 use serde::ser::SerializeMap;
76 let mut map = serializer.serialize_map(Some(1))?;
77
78 match self {
79 Body::Json(value) => map.serialize_entry("Json", value)?,
80 Body::Form(dashmap) => {
81 let hmap: HashMap<String, String> = dashmap.clone().into_iter().collect();
82 map.serialize_entry("Form", &hmap)?
83 }
84 Body::Bytes(bytes) => map.serialize_entry("Bytes", bytes)?,
85 }
86
87 map.end()
88 }
89}
90
91impl<'de> Deserialize<'de> for Body {
92 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
93 where
94 D: serde::Deserializer<'de>,
95 {
96 use serde::de::{self, MapAccess, Visitor};
97 use std::fmt;
98
99 struct BodyVisitor;
100
101 impl<'de> Visitor<'de> for BodyVisitor {
102 type Value = Body;
103
104 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
105 formatter.write_str("a body object")
106 }
107
108 fn visit_map<V>(self, mut map: V) -> Result<Body, V::Error>
109 where
110 V: MapAccess<'de>,
111 {
112 let entry = map.next_entry::<String, Value>()?;
113 let (key, value) = match entry {
114 Some((k, v)) => (k, v),
115 None => return Err(de::Error::custom("Expected a body variant")),
116 };
117
118 match key.as_str() {
119 "Json" => Ok(Body::Json(value)),
120 "Form" => {
121 let form_data: HashMap<String, String> =
122 serde_json::from_value(value).map_err(de::Error::custom)?;
123 let dashmap = DashMap::new();
124 for (k, v) in form_data {
125 dashmap.insert(k, v);
126 }
127 Ok(Body::Form(dashmap))
128 }
129 "Bytes" => {
130 let bytes: Bytes =
131 serde_json::from_value(value).map_err(de::Error::custom)?;
132 Ok(Body::Bytes(bytes))
133 }
134 _ => Err(de::Error::custom(format!("Unknown body variant: {}", key))),
135 }
136 }
137 }
138
139 deserializer.deserialize_map(BodyVisitor)
140 }
141}
142
143/// Outgoing HTTP request used by the crawler runtime.
144///
145/// [`Request`] is the handoff type between spiders, middleware, the scheduler,
146/// and the downloader. It is transport-neutral enough to be shared across the
147/// workspace, but expressive enough for custom methods, headers, bodies, and
148/// request-scoped metadata.
149///
150/// ## Example
151///
152/// ```rust,ignore
153/// use spider_util::request::Request;
154/// use url::Url;
155///
156/// // Create a basic GET request
157/// let request = Request::new(Url::parse("https://example.com").unwrap());
158///
159/// // Build a request with headers and method
160/// let post_request = Request::new(Url::parse("https://api.example.com").unwrap())
161/// .with_method(reqwest::Method::POST)
162/// .with_header("Accept", "application/json")
163/// .unwrap();
164/// ```
165#[derive(Debug, Clone)]
166pub struct Request {
167 /// The target URL for this request.
168 pub url: Url,
169 /// The HTTP method (GET, POST, etc.).
170 pub method: reqwest::Method,
171 /// HTTP headers for the request.
172 pub headers: http::header::HeaderMap,
173 /// Optional request body.
174 pub body: Option<Body>,
175 /// Lazy-initialized metadata - only allocated when actually used.
176 /// This reduces memory allocation for simple requests without metadata.
177 meta: Option<Arc<DashMap<String, Value>>>,
178}
179
180// Custom serialization for Request struct
181impl Serialize for Request {
182 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
183 where
184 S: serde::Serializer,
185 {
186 use serde::ser::SerializeStruct;
187 // Convert HeaderMap to a serializable format
188 let headers_vec: Vec<(String, String)> = self
189 .headers
190 .iter()
191 .filter_map(|(name, value)| {
192 value
193 .to_str()
194 .ok()
195 .map(|val_str| (name.as_str().to_string(), val_str.to_string()))
196 })
197 .collect();
198
199 let mut s = serializer.serialize_struct("Request", 5)?;
200 s.serialize_field("url", &self.url.as_str())?;
201 s.serialize_field("method", &self.method.as_str())?;
202 s.serialize_field("headers", &headers_vec)?;
203 s.serialize_field("body", &self.body)?;
204 // Serialize meta as empty HashMap if None (for backward compatibility)
205 let meta_map: HashMap<String, Value> = self
206 .meta
207 .as_ref()
208 .map(|m| {
209 m.iter()
210 .map(|e| (e.key().clone(), e.value().clone()))
211 .collect()
212 })
213 .unwrap_or_default();
214 s.serialize_field("meta", &meta_map)?;
215 s.end()
216 }
217}
218
219impl<'de> Deserialize<'de> for Request {
220 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
221 where
222 D: serde::Deserializer<'de>,
223 {
224 use serde::de::{self, MapAccess, Visitor};
225 use std::fmt;
226
227 #[derive(Deserialize)]
228 #[serde(field_identifier, rename_all = "lowercase")]
229 enum Field {
230 Url,
231 Method,
232 Headers,
233 Body,
234 Meta,
235 }
236
237 struct RequestVisitor;
238
239 impl<'de> Visitor<'de> for RequestVisitor {
240 type Value = Request;
241
242 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
243 formatter.write_str("struct Request")
244 }
245
246 fn visit_map<V>(self, mut map: V) -> Result<Request, V::Error>
247 where
248 V: MapAccess<'de>,
249 {
250 let mut url = None;
251 let mut method = None;
252 let mut headers = None;
253 let mut body = None;
254 let mut meta = None;
255
256 while let Some(key) = map.next_key()? {
257 match key {
258 Field::Url => {
259 if url.is_some() {
260 return Err(de::Error::duplicate_field("url"));
261 }
262 let url_str: String = map.next_value()?;
263 let parsed_url = Url::parse(&url_str).map_err(de::Error::custom)?;
264 url = Some(parsed_url);
265 }
266 Field::Method => {
267 if method.is_some() {
268 return Err(de::Error::duplicate_field("method"));
269 }
270 let method_str: String = map.next_value()?;
271 let parsed_method =
272 Method::from_str(&method_str).map_err(de::Error::custom)?;
273 method = Some(parsed_method);
274 }
275 Field::Headers => {
276 if headers.is_some() {
277 return Err(de::Error::duplicate_field("headers"));
278 }
279 // Deserialize headers vector and convert back to HeaderMap
280 let headers_vec: Vec<(String, String)> = map.next_value()?;
281 let mut header_map = HeaderMap::new();
282 for (name, value) in headers_vec {
283 if let Ok(header_name) =
284 http::header::HeaderName::from_bytes(name.as_bytes())
285 && let Ok(header_value) =
286 http::header::HeaderValue::from_str(&value)
287 {
288 header_map.insert(header_name, header_value);
289 }
290 }
291 headers = Some(header_map);
292 }
293 Field::Body => {
294 if body.is_some() {
295 return Err(de::Error::duplicate_field("body"));
296 }
297 body = Some(map.next_value()?);
298 }
299 Field::Meta => {
300 // Deserialize meta HashMap and convert to DashMap
301 let meta_map: HashMap<String, Value> = map.next_value()?;
302 if !meta_map.is_empty() {
303 let dashmap = DashMap::new();
304 for (k, v) in meta_map {
305 dashmap.insert(k, v);
306 }
307 meta = Some(Arc::new(dashmap));
308 }
309 }
310 }
311 }
312
313 let url = url.ok_or_else(|| de::Error::missing_field("url"))?;
314 let method = method.ok_or_else(|| de::Error::missing_field("method"))?;
315 let headers = headers.ok_or_else(|| de::Error::missing_field("headers"))?;
316 let body = body; // Optional field
317
318 Ok(Request {
319 url,
320 method,
321 headers,
322 body,
323 meta, // May be None if no meta was serialized
324 })
325 }
326 }
327
328 const FIELDS: &[&str] = &["url", "method", "headers", "body", "meta"];
329 deserializer.deserialize_struct("Request", FIELDS, RequestVisitor)
330 }
331}
332
333impl Default for Request {
334 fn default() -> Self {
335 let default_url = match Url::parse("http://default.invalid") {
336 Ok(url) => url,
337 Err(err) => panic!("invalid hardcoded default URL: {}", err),
338 };
339 Self {
340 url: default_url,
341 method: reqwest::Method::GET,
342 headers: http::header::HeaderMap::new(),
343 body: None,
344 meta: None, // Lazy initialization - no allocation until needed
345 }
346 }
347}
348
349impl Request {
350 /// Creates a new [`Request`] with the given URL.
351 ///
352 /// This is the most common constructor used by spiders when enqueueing
353 /// follow-up pages. It does not allocate metadata storage unless
354 /// [`with_meta`](Request::with_meta) is called.
355 ///
356 /// ## Example
357 ///
358 /// ```rust,ignore
359 /// use spider_util::request::Request;
360 /// use url::Url;
361 ///
362 /// let request = Request::new(Url::parse("https://example.com").unwrap());
363 /// ```
364 pub fn new(url: Url) -> Self {
365 Request {
366 url,
367 method: reqwest::Method::GET,
368 headers: http::header::HeaderMap::new(),
369 body: None,
370 meta: None,
371 }
372 }
373
374 /// Sets the HTTP method for the request.
375 ///
376 /// Use this together with one of the body helpers for POST, PUT, or PATCH
377 /// workflows.
378 ///
379 /// ## Example
380 ///
381 /// ```rust,ignore
382 /// use spider_util::request::Request;
383 /// use url::Url;
384 ///
385 /// let request = Request::new(Url::parse("https://example.com").unwrap())
386 /// .with_method(reqwest::Method::POST);
387 /// ```
388 pub fn with_method(mut self, method: reqwest::Method) -> Self {
389 self.method = method;
390 self
391 }
392
393 /// Adds a header to the request.
394 ///
395 /// Returns an error if the header name or value is invalid.
396 ///
397 /// # Errors
398 ///
399 /// Returns a [`SpiderError::HeaderValueError`] if the header name or value is invalid.
400 ///
401 /// ## Example
402 ///
403 /// ```rust,ignore
404 /// use spider_util::request::Request;
405 /// use url::Url;
406 ///
407 /// let request = Request::new(Url::parse("https://example.com").unwrap())
408 /// .with_header("Accept", "application/json")
409 /// .unwrap();
410 /// ```
411 pub fn with_header(mut self, name: &str, value: &str) -> Result<Self, SpiderError> {
412 let header_name =
413 reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|e| {
414 SpiderError::HeaderValueError(format!("Invalid header name '{}': {}", name, e))
415 })?;
416 let header_value = reqwest::header::HeaderValue::from_str(value).map_err(|e| {
417 SpiderError::HeaderValueError(format!("Invalid header value '{}': {}", value, e))
418 })?;
419
420 self.headers.insert(header_name, header_value);
421 Ok(self)
422 }
423
424 /// Sets the body of the request and defaults the method to POST.
425 ///
426 /// ## Example
427 ///
428 /// ```rust,ignore
429 /// use spider_util::request::{Request, Body};
430 /// use url::Url;
431 /// use serde_json::json;
432 ///
433 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
434 /// .with_body(Body::Json(json!({"key": "value"})));
435 /// ```
436 pub fn with_body(mut self, body: Body) -> Self {
437 self.body = Some(body);
438 self.with_method(reqwest::Method::POST)
439 }
440
441 /// Sets the body of the request to a JSON value and defaults the method to POST.
442 ///
443 /// This helper stores the payload body only. Add content-type headers
444 /// explicitly when the target service expects them.
445 ///
446 /// ## Example
447 ///
448 /// ```rust,ignore
449 /// use spider_util::request::Request;
450 /// use url::Url;
451 /// use serde_json::json;
452 ///
453 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
454 /// .with_json(json!({"name": "test"}));
455 /// ```
456 pub fn with_json(self, json: serde_json::Value) -> Self {
457 self.with_body(Body::Json(json))
458 }
459
460 /// Sets the body of the request to form data and defaults the method to POST.
461 ///
462 /// ## Example
463 ///
464 /// ```rust,ignore
465 /// use spider_util::request::Request;
466 /// use url::Url;
467 /// use dashmap::DashMap;
468 ///
469 /// let mut form = DashMap::new();
470 /// form.insert("key".to_string(), "value".to_string());
471 ///
472 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
473 /// .with_form(form);
474 /// ```
475 pub fn with_form(self, form: DashMap<String, String>) -> Self {
476 self.with_body(Body::Form(form))
477 }
478
479 /// Sets the body of the request to raw bytes and defaults the method to POST.
480 ///
481 /// ## Example
482 ///
483 /// ```rust,ignore
484 /// use spider_util::request::Request;
485 /// use url::Url;
486 /// use bytes::Bytes;
487 ///
488 /// let data = Bytes::from("binary data");
489 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
490 /// .with_bytes(data);
491 /// ```
492 pub fn with_bytes(self, bytes: bytes::Bytes) -> Self {
493 self.with_body(Body::Bytes(bytes))
494 }
495
496 /// Adds a value to the request's metadata.
497 ///
498 /// Lazily allocates the metadata map on first use. Metadata is commonly
499 /// used to carry crawl context such as pagination state, source URLs, or
500 /// retry bookkeeping across middleware and parsing stages.
501 ///
502 /// ## Example
503 ///
504 /// ```rust,ignore
505 /// use spider_util::request::Request;
506 /// use url::Url;
507 /// use serde_json::json;
508 ///
509 /// let request = Request::new(Url::parse("https://example.com").unwrap())
510 /// .with_meta("priority", json!(1))
511 /// .with_meta("source", json!("manual"));
512 /// ```
513 pub fn with_meta(mut self, key: &str, value: serde_json::Value) -> Self {
514 self.meta
515 .get_or_insert_with(|| Arc::new(DashMap::new()))
516 .insert(key.to_string(), value);
517 self
518 }
519
520 /// Gets a reference to a metadata value, if it exists.
521 ///
522 /// Returns a cloned JSON value because metadata is stored in a shared
523 /// concurrent map. Returns `None` if the key doesn't exist or if metadata
524 /// hasn't been set.
525 pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
526 self.meta
527 .as_ref()
528 .and_then(|m| m.get(key).map(|e| e.value().clone()))
529 }
530
531 /// Returns `true` if the request has metadata.
532 pub fn has_meta(&self) -> bool {
533 self.meta.as_ref().is_some_and(|m| !m.is_empty())
534 }
535
536 /// Returns a reference to the internal metadata map, if it exists.
537 pub fn meta_map(&self) -> Option<&Arc<DashMap<String, serde_json::Value>>> {
538 self.meta.as_ref()
539 }
540
541 /// Inserts a value into metadata, creating the map if needed.
542 ///
543 /// This is intended for internal framework use.
544 pub fn insert_meta(&mut self, key: String, value: serde_json::Value) {
545 self.meta
546 .get_or_insert_with(|| Arc::new(DashMap::new()))
547 .insert(key, value);
548 }
549
550 /// Gets a value from metadata using DashMap's API.
551 ///
552 /// This is intended for internal framework use where direct access is needed.
553 pub fn get_meta_ref(
554 &self,
555 key: &str,
556 ) -> Option<dashmap::mapref::one::Ref<'_, String, serde_json::Value>> {
557 self.meta.as_ref().and_then(|m| m.get(key))
558 }
559
560 /// Sets the metadata map directly.
561 ///
562 /// Used for internal framework operations.
563 pub fn set_meta_from_option(&mut self, meta: Option<Arc<DashMap<String, serde_json::Value>>>) {
564 self.meta = meta;
565 }
566
567 /// Clones the metadata map.
568 ///
569 /// Used for internal framework operations where metadata needs to be copied.
570 pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
571 self.meta.clone()
572 }
573
574 /// Takes the metadata map, leaving `None` in its place.
575 ///
576 /// Used for internal framework operations.
577 pub fn take_meta(&mut self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
578 self.meta.take()
579 }
580
581 /// Returns a reference to the metadata Arc for internal framework use.
582 pub fn meta_inner(&self) -> &Option<Arc<DashMap<String, serde_json::Value>>> {
583 &self.meta
584 }
585
586 const RETRY_ATTEMPTS_KEY: &str = "retry_attempts";
587
588 /// Gets the number of times the request has been retried.
589 ///
590 /// Returns `0` if no retry attempts have been recorded.
591 pub fn get_retry_attempts(&self) -> u32 {
592 self.meta
593 .as_ref()
594 .and_then(|m| m.get(Self::RETRY_ATTEMPTS_KEY))
595 .and_then(|v| v.value().as_u64())
596 .unwrap_or(0) as u32
597 }
598
599 /// Increments the retry count for the request.
600 ///
601 /// Lazily allocates the metadata map if not already present.
602 pub fn increment_retry_attempts(&mut self) {
603 let current_attempts = self.get_retry_attempts();
604 self.meta
605 .get_or_insert_with(|| Arc::new(DashMap::new()))
606 .insert(
607 Self::RETRY_ATTEMPTS_KEY.to_string(),
608 serde_json::Value::from(current_attempts + 1),
609 );
610 }
611
612 /// Generates a unique fingerprint for the request based on its URL, method, and body.
613 ///
614 /// This is the stable identity used by runtime deduplication and related
615 /// components that need to recognize equivalent requests.
616 ///
617 /// The fingerprint is used for duplicate detection and caching. It combines:
618 /// - The request URL
619 /// - The HTTP method
620 /// - The request body (if present)
621 ///
622 /// ## Example
623 ///
624 /// ```rust,ignore
625 /// use spider_util::request::Request;
626 /// use url::Url;
627 ///
628 /// let request = Request::new(Url::parse("https://example.com").unwrap());
629 /// let fingerprint = request.fingerprint();
630 /// ```
631 pub fn fingerprint(&self) -> String {
632 let mut hasher = XxHash64::default();
633 hasher.write(self.url.as_str().as_bytes());
634 hasher.write(self.method.as_str().as_bytes());
635
636 if let Some(ref body) = self.body {
637 match body {
638 Body::Json(json_val) => {
639 if let Ok(serialized) = serde_json::to_string(json_val) {
640 hasher.write(serialized.as_bytes());
641 }
642 }
643 Body::Form(form_val) => {
644 // Optimized: hash components directly without building intermediate String
645 for r in form_val.iter() {
646 hasher.write(r.key().as_bytes());
647 hasher.write(r.value().as_bytes());
648 }
649 }
650 Body::Bytes(bytes_val) => {
651 hasher.write(bytes_val);
652 }
653 }
654 }
655 format!("{:x}", hasher.finish())
656 }
657}