Skip to main content

spider_util/
request.rs

1//! Request types used by the crawler runtime.
2//!
3//! [`Request`] is the runtime's transport-neutral request model. It stores the
4//! URL, method, headers, optional body, and a lazily allocated metadata map used
5//! by middleware and runtime internals.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::request::{Request, Body};
11//! use url::Url;
12//! use serde_json::json;
13//!
14//! // Create a simple GET request
15//! let url = Url::parse("https://example.com").unwrap();
16//! let request = Request::new(url);
17//!
18//! // Create a POST request with JSON body
19//! let post_request = Request::new(Url::parse("https://api.example.com/data").unwrap())
20//!     .with_method(reqwest::Method::POST)
21//!     .with_json(json!({"key": "value"}));
22//! ```
23
24use bytes::Bytes;
25use dashmap::DashMap;
26use http::header::HeaderMap;
27use reqwest::{Method, Url};
28use serde::{Deserialize, Serialize};
29use serde_json::Value;
30use std::collections::HashMap;
31use std::hash::Hasher;
32use std::str::FromStr;
33use std::sync::Arc;
34use twox_hash::XxHash64;
35
36use crate::error::SpiderError;
37
38/// Request body variants supported by the default downloader.
39///
40/// ## Example
41///
42/// ```rust,ignore
43/// use spider_util::request::Body;
44/// use serde_json::json;
45/// use dashmap::DashMap;
46/// use bytes::Bytes;
47///
48/// // JSON body
49/// let json_body = Body::Json(json!({"name": "test"}));
50///
51/// // Form data
52/// let mut form = DashMap::new();
53/// form.insert("key".to_string(), "value".to_string());
54/// let form_body = Body::Form(form);
55///
56/// // Raw bytes
57/// let bytes_body = Body::Bytes(Bytes::from("raw data"));
58/// ```
59#[derive(Debug, Clone)]
60pub enum Body {
61    /// JSON payload.
62    Json(serde_json::Value),
63    /// Form data (key-value pairs).
64    Form(DashMap<String, String>),
65    /// Raw binary data.
66    Bytes(Bytes),
67}
68
69// Custom serialization for Body enum
70impl Serialize for Body {
71    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
72    where
73        S: serde::Serializer,
74    {
75        use serde::ser::SerializeMap;
76        let mut map = serializer.serialize_map(Some(1))?;
77
78        match self {
79            Body::Json(value) => map.serialize_entry("Json", value)?,
80            Body::Form(dashmap) => {
81                let hmap: HashMap<String, String> = dashmap.clone().into_iter().collect();
82                map.serialize_entry("Form", &hmap)?
83            }
84            Body::Bytes(bytes) => map.serialize_entry("Bytes", bytes)?,
85        }
86
87        map.end()
88    }
89}
90
91impl<'de> Deserialize<'de> for Body {
92    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
93    where
94        D: serde::Deserializer<'de>,
95    {
96        use serde::de::{self, MapAccess, Visitor};
97        use std::fmt;
98
99        struct BodyVisitor;
100
101        impl<'de> Visitor<'de> for BodyVisitor {
102            type Value = Body;
103
104            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
105                formatter.write_str("a body object")
106            }
107
108            fn visit_map<V>(self, mut map: V) -> Result<Body, V::Error>
109            where
110                V: MapAccess<'de>,
111            {
112                let entry = map.next_entry::<String, Value>()?;
113                let (key, value) = match entry {
114                    Some((k, v)) => (k, v),
115                    None => return Err(de::Error::custom("Expected a body variant")),
116                };
117
118                match key.as_str() {
119                    "Json" => Ok(Body::Json(value)),
120                    "Form" => {
121                        let form_data: HashMap<String, String> =
122                            serde_json::from_value(value).map_err(de::Error::custom)?;
123                        let dashmap = DashMap::new();
124                        for (k, v) in form_data {
125                            dashmap.insert(k, v);
126                        }
127                        Ok(Body::Form(dashmap))
128                    }
129                    "Bytes" => {
130                        let bytes: Bytes =
131                            serde_json::from_value(value).map_err(de::Error::custom)?;
132                        Ok(Body::Bytes(bytes))
133                    }
134                    _ => Err(de::Error::custom(format!("Unknown body variant: {}", key))),
135                }
136            }
137        }
138
139        deserializer.deserialize_map(BodyVisitor)
140    }
141}
142
143/// Outgoing HTTP request used by the crawler runtime.
144///
145/// [`Request`] is the handoff type between spiders, middleware, the scheduler,
146/// and the downloader. It is transport-neutral enough to be shared across the
147/// workspace, but expressive enough for custom methods, headers, bodies, and
148/// request-scoped metadata.
149///
150/// ## Example
151///
152/// ```rust,ignore
153/// use spider_util::request::Request;
154/// use url::Url;
155///
156/// // Create a basic GET request
157/// let request = Request::new(Url::parse("https://example.com").unwrap());
158///
159/// // Build a request with headers and method
160/// let post_request = Request::new(Url::parse("https://api.example.com").unwrap())
161///     .with_method(reqwest::Method::POST)
162///     .with_header("Accept", "application/json")
163///     .unwrap();
164/// ```
165#[derive(Debug, Clone)]
166pub struct Request {
167    /// The target URL for this request.
168    pub url: Url,
169    /// The HTTP method (GET, POST, etc.).
170    pub method: reqwest::Method,
171    /// HTTP headers for the request.
172    pub headers: http::header::HeaderMap,
173    /// Optional request body.
174    pub body: Option<Body>,
175    /// Lazy-initialized metadata - only allocated when actually used.
176    /// This reduces memory allocation for simple requests without metadata.
177    meta: Option<Arc<DashMap<String, Value>>>,
178}
179
180// Custom serialization for Request struct
181impl Serialize for Request {
182    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
183    where
184        S: serde::Serializer,
185    {
186        use serde::ser::SerializeStruct;
187        // Convert HeaderMap to a serializable format
188        let headers_vec: Vec<(String, String)> = self
189            .headers
190            .iter()
191            .filter_map(|(name, value)| {
192                value
193                    .to_str()
194                    .ok()
195                    .map(|val_str| (name.as_str().to_string(), val_str.to_string()))
196            })
197            .collect();
198
199        let mut s = serializer.serialize_struct("Request", 5)?;
200        s.serialize_field("url", &self.url.as_str())?;
201        s.serialize_field("method", &self.method.as_str())?;
202        s.serialize_field("headers", &headers_vec)?;
203        s.serialize_field("body", &self.body)?;
204        // Serialize meta as empty HashMap if None (for backward compatibility)
205        let meta_map: HashMap<String, Value> = self
206            .meta
207            .as_ref()
208            .map(|m| {
209                m.iter()
210                    .map(|e| (e.key().clone(), e.value().clone()))
211                    .collect()
212            })
213            .unwrap_or_default();
214        s.serialize_field("meta", &meta_map)?;
215        s.end()
216    }
217}
218
219impl<'de> Deserialize<'de> for Request {
220    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
221    where
222        D: serde::Deserializer<'de>,
223    {
224        use serde::de::{self, MapAccess, Visitor};
225        use std::fmt;
226
227        #[derive(Deserialize)]
228        #[serde(field_identifier, rename_all = "lowercase")]
229        enum Field {
230            Url,
231            Method,
232            Headers,
233            Body,
234            Meta,
235        }
236
237        struct RequestVisitor;
238
239        impl<'de> Visitor<'de> for RequestVisitor {
240            type Value = Request;
241
242            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
243                formatter.write_str("struct Request")
244            }
245
246            fn visit_map<V>(self, mut map: V) -> Result<Request, V::Error>
247            where
248                V: MapAccess<'de>,
249            {
250                let mut url = None;
251                let mut method = None;
252                let mut headers = None;
253                let mut body = None;
254                let mut meta = None;
255
256                while let Some(key) = map.next_key()? {
257                    match key {
258                        Field::Url => {
259                            if url.is_some() {
260                                return Err(de::Error::duplicate_field("url"));
261                            }
262                            let url_str: String = map.next_value()?;
263                            let parsed_url = Url::parse(&url_str).map_err(de::Error::custom)?;
264                            url = Some(parsed_url);
265                        }
266                        Field::Method => {
267                            if method.is_some() {
268                                return Err(de::Error::duplicate_field("method"));
269                            }
270                            let method_str: String = map.next_value()?;
271                            let parsed_method =
272                                Method::from_str(&method_str).map_err(de::Error::custom)?;
273                            method = Some(parsed_method);
274                        }
275                        Field::Headers => {
276                            if headers.is_some() {
277                                return Err(de::Error::duplicate_field("headers"));
278                            }
279                            // Deserialize headers vector and convert back to HeaderMap
280                            let headers_vec: Vec<(String, String)> = map.next_value()?;
281                            let mut header_map = HeaderMap::new();
282                            for (name, value) in headers_vec {
283                                if let Ok(header_name) =
284                                    http::header::HeaderName::from_bytes(name.as_bytes())
285                                    && let Ok(header_value) =
286                                        http::header::HeaderValue::from_str(&value)
287                                {
288                                    header_map.insert(header_name, header_value);
289                                }
290                            }
291                            headers = Some(header_map);
292                        }
293                        Field::Body => {
294                            if body.is_some() {
295                                return Err(de::Error::duplicate_field("body"));
296                            }
297                            body = Some(map.next_value()?);
298                        }
299                        Field::Meta => {
300                            // Deserialize meta HashMap and convert to DashMap
301                            let meta_map: HashMap<String, Value> = map.next_value()?;
302                            if !meta_map.is_empty() {
303                                let dashmap = DashMap::new();
304                                for (k, v) in meta_map {
305                                    dashmap.insert(k, v);
306                                }
307                                meta = Some(Arc::new(dashmap));
308                            }
309                        }
310                    }
311                }
312
313                let url = url.ok_or_else(|| de::Error::missing_field("url"))?;
314                let method = method.ok_or_else(|| de::Error::missing_field("method"))?;
315                let headers = headers.ok_or_else(|| de::Error::missing_field("headers"))?;
316                let body = body; // Optional field
317
318                Ok(Request {
319                    url,
320                    method,
321                    headers,
322                    body,
323                    meta, // May be None if no meta was serialized
324                })
325            }
326        }
327
328        const FIELDS: &[&str] = &["url", "method", "headers", "body", "meta"];
329        deserializer.deserialize_struct("Request", FIELDS, RequestVisitor)
330    }
331}
332
333impl Default for Request {
334    fn default() -> Self {
335        let default_url = match Url::parse("http://default.invalid") {
336            Ok(url) => url,
337            Err(err) => panic!("invalid hardcoded default URL: {}", err),
338        };
339        Self {
340            url: default_url,
341            method: reqwest::Method::GET,
342            headers: http::header::HeaderMap::new(),
343            body: None,
344            meta: None, // Lazy initialization - no allocation until needed
345        }
346    }
347}
348
349impl Request {
350    /// Creates a new [`Request`] with the given URL.
351    ///
352    /// This is the most common constructor used by spiders when enqueueing
353    /// follow-up pages. It does not allocate metadata storage unless
354    /// [`with_meta`](Request::with_meta) is called.
355    ///
356    /// ## Example
357    ///
358    /// ```rust,ignore
359    /// use spider_util::request::Request;
360    /// use url::Url;
361    ///
362    /// let request = Request::new(Url::parse("https://example.com").unwrap());
363    /// ```
364    pub fn new(url: Url) -> Self {
365        Request {
366            url,
367            method: reqwest::Method::GET,
368            headers: http::header::HeaderMap::new(),
369            body: None,
370            meta: None,
371        }
372    }
373
374    /// Sets the HTTP method for the request.
375    ///
376    /// Use this together with one of the body helpers for POST, PUT, or PATCH
377    /// workflows.
378    ///
379    /// ## Example
380    ///
381    /// ```rust,ignore
382    /// use spider_util::request::Request;
383    /// use url::Url;
384    ///
385    /// let request = Request::new(Url::parse("https://example.com").unwrap())
386    ///     .with_method(reqwest::Method::POST);
387    /// ```
388    pub fn with_method(mut self, method: reqwest::Method) -> Self {
389        self.method = method;
390        self
391    }
392
393    /// Adds a header to the request.
394    ///
395    /// Returns an error if the header name or value is invalid.
396    ///
397    /// # Errors
398    ///
399    /// Returns a [`SpiderError::HeaderValueError`] if the header name or value is invalid.
400    ///
401    /// ## Example
402    ///
403    /// ```rust,ignore
404    /// use spider_util::request::Request;
405    /// use url::Url;
406    ///
407    /// let request = Request::new(Url::parse("https://example.com").unwrap())
408    ///     .with_header("Accept", "application/json")
409    ///     .unwrap();
410    /// ```
411    pub fn with_header(mut self, name: &str, value: &str) -> Result<Self, SpiderError> {
412        let header_name =
413            reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|e| {
414                SpiderError::HeaderValueError(format!("Invalid header name '{}': {}", name, e))
415            })?;
416        let header_value = reqwest::header::HeaderValue::from_str(value).map_err(|e| {
417            SpiderError::HeaderValueError(format!("Invalid header value '{}': {}", value, e))
418        })?;
419
420        self.headers.insert(header_name, header_value);
421        Ok(self)
422    }
423
424    /// Sets the body of the request and defaults the method to POST.
425    ///
426    /// ## Example
427    ///
428    /// ```rust,ignore
429    /// use spider_util::request::{Request, Body};
430    /// use url::Url;
431    /// use serde_json::json;
432    ///
433    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
434    ///     .with_body(Body::Json(json!({"key": "value"})));
435    /// ```
436    pub fn with_body(mut self, body: Body) -> Self {
437        self.body = Some(body);
438        self.with_method(reqwest::Method::POST)
439    }
440
441    /// Sets the body of the request to a JSON value and defaults the method to POST.
442    ///
443    /// This helper stores the payload body only. Add content-type headers
444    /// explicitly when the target service expects them.
445    ///
446    /// ## Example
447    ///
448    /// ```rust,ignore
449    /// use spider_util::request::Request;
450    /// use url::Url;
451    /// use serde_json::json;
452    ///
453    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
454    ///     .with_json(json!({"name": "test"}));
455    /// ```
456    pub fn with_json(self, json: serde_json::Value) -> Self {
457        self.with_body(Body::Json(json))
458    }
459
460    /// Sets the body of the request to form data and defaults the method to POST.
461    ///
462    /// ## Example
463    ///
464    /// ```rust,ignore
465    /// use spider_util::request::Request;
466    /// use url::Url;
467    /// use dashmap::DashMap;
468    ///
469    /// let mut form = DashMap::new();
470    /// form.insert("key".to_string(), "value".to_string());
471    ///
472    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
473    ///     .with_form(form);
474    /// ```
475    pub fn with_form(self, form: DashMap<String, String>) -> Self {
476        self.with_body(Body::Form(form))
477    }
478
479    /// Sets the body of the request to raw bytes and defaults the method to POST.
480    ///
481    /// ## Example
482    ///
483    /// ```rust,ignore
484    /// use spider_util::request::Request;
485    /// use url::Url;
486    /// use bytes::Bytes;
487    ///
488    /// let data = Bytes::from("binary data");
489    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
490    ///     .with_bytes(data);
491    /// ```
492    pub fn with_bytes(self, bytes: bytes::Bytes) -> Self {
493        self.with_body(Body::Bytes(bytes))
494    }
495
496    /// Adds a value to the request's metadata.
497    ///
498    /// Lazily allocates the metadata map on first use. Metadata is commonly
499    /// used to carry crawl context such as pagination state, source URLs, or
500    /// retry bookkeeping across middleware and parsing stages.
501    ///
502    /// ## Example
503    ///
504    /// ```rust,ignore
505    /// use spider_util::request::Request;
506    /// use url::Url;
507    /// use serde_json::json;
508    ///
509    /// let request = Request::new(Url::parse("https://example.com").unwrap())
510    ///     .with_meta("priority", json!(1))
511    ///     .with_meta("source", json!("manual"));
512    /// ```
513    pub fn with_meta(mut self, key: &str, value: serde_json::Value) -> Self {
514        self.meta
515            .get_or_insert_with(|| Arc::new(DashMap::new()))
516            .insert(key.to_string(), value);
517        self
518    }
519
520    /// Gets a reference to a metadata value, if it exists.
521    ///
522    /// Returns a cloned JSON value because metadata is stored in a shared
523    /// concurrent map. Returns `None` if the key doesn't exist or if metadata
524    /// hasn't been set.
525    pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
526        self.meta
527            .as_ref()
528            .and_then(|m| m.get(key).map(|e| e.value().clone()))
529    }
530
531    /// Returns `true` if the request has metadata.
532    pub fn has_meta(&self) -> bool {
533        self.meta.as_ref().is_some_and(|m| !m.is_empty())
534    }
535
536    /// Returns a reference to the internal metadata map, if it exists.
537    pub fn meta_map(&self) -> Option<&Arc<DashMap<String, serde_json::Value>>> {
538        self.meta.as_ref()
539    }
540
541    /// Inserts a value into metadata, creating the map if needed.
542    ///
543    /// This is intended for internal framework use.
544    pub fn insert_meta(&mut self, key: String, value: serde_json::Value) {
545        self.meta
546            .get_or_insert_with(|| Arc::new(DashMap::new()))
547            .insert(key, value);
548    }
549
550    /// Gets a value from metadata using DashMap's API.
551    ///
552    /// This is intended for internal framework use where direct access is needed.
553    pub fn get_meta_ref(
554        &self,
555        key: &str,
556    ) -> Option<dashmap::mapref::one::Ref<'_, String, serde_json::Value>> {
557        self.meta.as_ref().and_then(|m| m.get(key))
558    }
559
560    /// Sets the metadata map directly.
561    ///
562    /// Used for internal framework operations.
563    pub fn set_meta_from_option(&mut self, meta: Option<Arc<DashMap<String, serde_json::Value>>>) {
564        self.meta = meta;
565    }
566
567    /// Clones the metadata map.
568    ///
569    /// Used for internal framework operations where metadata needs to be copied.
570    pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
571        self.meta.clone()
572    }
573
574    /// Takes the metadata map, leaving `None` in its place.
575    ///
576    /// Used for internal framework operations.
577    pub fn take_meta(&mut self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
578        self.meta.take()
579    }
580
581    /// Returns a reference to the metadata Arc for internal framework use.
582    pub fn meta_inner(&self) -> &Option<Arc<DashMap<String, serde_json::Value>>> {
583        &self.meta
584    }
585
586    const RETRY_ATTEMPTS_KEY: &str = "retry_attempts";
587
588    /// Gets the number of times the request has been retried.
589    ///
590    /// Returns `0` if no retry attempts have been recorded.
591    pub fn get_retry_attempts(&self) -> u32 {
592        self.meta
593            .as_ref()
594            .and_then(|m| m.get(Self::RETRY_ATTEMPTS_KEY))
595            .and_then(|v| v.value().as_u64())
596            .unwrap_or(0) as u32
597    }
598
599    /// Increments the retry count for the request.
600    ///
601    /// Lazily allocates the metadata map if not already present.
602    pub fn increment_retry_attempts(&mut self) {
603        let current_attempts = self.get_retry_attempts();
604        self.meta
605            .get_or_insert_with(|| Arc::new(DashMap::new()))
606            .insert(
607                Self::RETRY_ATTEMPTS_KEY.to_string(),
608                serde_json::Value::from(current_attempts + 1),
609            );
610    }
611
612    /// Generates a unique fingerprint for the request based on its URL, method, and body.
613    ///
614    /// This is the stable identity used by runtime deduplication and related
615    /// components that need to recognize equivalent requests.
616    ///
617    /// The fingerprint is used for duplicate detection and caching. It combines:
618    /// - The request URL
619    /// - The HTTP method
620    /// - The request body (if present)
621    ///
622    /// ## Example
623    ///
624    /// ```rust,ignore
625    /// use spider_util::request::Request;
626    /// use url::Url;
627    ///
628    /// let request = Request::new(Url::parse("https://example.com").unwrap());
629    /// let fingerprint = request.fingerprint();
630    /// ```
631    pub fn fingerprint(&self) -> String {
632        let mut hasher = XxHash64::default();
633        hasher.write(self.url.as_str().as_bytes());
634        hasher.write(self.method.as_str().as_bytes());
635
636        if let Some(ref body) = self.body {
637            match body {
638                Body::Json(json_val) => {
639                    if let Ok(serialized) = serde_json::to_string(json_val) {
640                        hasher.write(serialized.as_bytes());
641                    }
642                }
643                Body::Form(form_val) => {
644                    // Optimized: hash components directly without building intermediate String
645                    for r in form_val.iter() {
646                        hasher.write(r.key().as_bytes());
647                        hasher.write(r.value().as_bytes());
648                    }
649                }
650                Body::Bytes(bytes_val) => {
651                    hasher.write(bytes_val);
652                }
653            }
654        }
655        format!("{:x}", hasher.finish())
656    }
657}