Skip to main content

spider_util/
request.rs

1//! Data structures for representing HTTP requests in `spider-lib`.
2//!
3//! This module defines the [`Request`] struct, which is a central component
4//! for constructing and managing outgoing HTTP requests within the
5//! `spider-lib` framework. It encapsulates all necessary details of an
6//! HTTP request, including:
7//! - The target URL and HTTP method
8//! - Request headers and an optional request body (supporting JSON, form data, or raw bytes)
9//! - Metadata for tracking retry attempts or other custom information
10//!
11//! Additionally, the module provides methods for building requests,
12//! incrementing retry counters, and generating unique fingerprints
13//! for request deduplication and caching.
14//!
15//! ## Example
16//!
17//! ```rust
18//! use spider_util::request::{Request, Body};
19//! use url::Url;
20//! use serde_json::json;
21//!
22//! // Create a simple GET request
23//! let url = Url::parse("https://example.com").unwrap();
24//! let request = Request::new(url);
25//!
26//! // Create a POST request with JSON body
27//! let post_request = Request::new(Url::parse("https://api.example.com/data").unwrap())
28//!     .with_method(reqwest::Method::POST)
29//!     .with_json(json!({"key": "value"}));
30//! ```
31
32use bytes::Bytes;
33use dashmap::DashMap;
34use http::header::HeaderMap;
35use reqwest::{Method, Url};
36use serde::{Deserialize, Serialize};
37use serde_json::Value;
38use std::collections::HashMap;
39use std::hash::Hasher;
40use std::str::FromStr;
41use std::sync::Arc;
42use twox_hash::XxHash64;
43
44use crate::error::SpiderError;
45
46/// The body of an HTTP request.
47///
48/// [`Body`] encapsulates the different types of request bodies that can be sent
49/// with an HTTP request. It supports JSON payloads, form data, and raw bytes.
50///
51/// ## Variants
52///
53/// - `Json`: A JSON value (typically an object or array)
54/// - `Form`: Key-value form data encoded as `application/x-www-form-urlencoded`
55/// - `Bytes`: Raw binary data
56///
57/// ## Example
58///
59/// ```rust
60/// use spider_util::request::Body;
61/// use serde_json::json;
62/// use dashmap::DashMap;
63/// use bytes::Bytes;
64///
65/// // JSON body
66/// let json_body = Body::Json(json!({"name": "test"}));
67///
68/// // Form data
69/// let mut form = DashMap::new();
70/// form.insert("key".to_string(), "value".to_string());
71/// let form_body = Body::Form(form);
72///
73/// // Raw bytes
74/// let bytes_body = Body::Bytes(Bytes::from("raw data"));
75/// ```
76#[derive(Debug, Clone)]
77pub enum Body {
78    /// JSON payload.
79    Json(serde_json::Value),
80    /// Form data (key-value pairs).
81    Form(DashMap<String, String>),
82    /// Raw binary data.
83    Bytes(Bytes),
84}
85
86// Custom serialization for Body enum
87impl Serialize for Body {
88    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
89    where
90        S: serde::Serializer,
91    {
92        use serde::ser::SerializeMap;
93        let mut map = serializer.serialize_map(Some(1))?;
94
95        match self {
96            Body::Json(value) => map.serialize_entry("Json", value)?,
97            Body::Form(dashmap) => {
98                let hmap: HashMap<String, String> = dashmap.clone().into_iter().collect();
99                map.serialize_entry("Form", &hmap)?
100            }
101            Body::Bytes(bytes) => map.serialize_entry("Bytes", bytes)?,
102        }
103
104        map.end()
105    }
106}
107
108impl<'de> Deserialize<'de> for Body {
109    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
110    where
111        D: serde::Deserializer<'de>,
112    {
113        use serde::de::{self, MapAccess, Visitor};
114        use std::fmt;
115
116        struct BodyVisitor;
117
118        impl<'de> Visitor<'de> for BodyVisitor {
119            type Value = Body;
120
121            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
122                formatter.write_str("a body object")
123            }
124
125            fn visit_map<V>(self, mut map: V) -> Result<Body, V::Error>
126            where
127                V: MapAccess<'de>,
128            {
129                let entry = map.next_entry::<String, Value>()?;
130                let (key, value) = match entry {
131                    Some((k, v)) => (k, v),
132                    None => return Err(de::Error::custom("Expected a body variant")),
133                };
134
135                match key.as_str() {
136                    "Json" => Ok(Body::Json(value)),
137                    "Form" => {
138                        let form_data: HashMap<String, String> =
139                            serde_json::from_value(value).map_err(de::Error::custom)?;
140                        let dashmap = DashMap::new();
141                        for (k, v) in form_data {
142                            dashmap.insert(k, v);
143                        }
144                        Ok(Body::Form(dashmap))
145                    }
146                    "Bytes" => {
147                        let bytes: Bytes =
148                            serde_json::from_value(value).map_err(de::Error::custom)?;
149                        Ok(Body::Bytes(bytes))
150                    }
151                    _ => Err(de::Error::custom(format!("Unknown body variant: {}", key))),
152                }
153            }
154        }
155
156        deserializer.deserialize_map(BodyVisitor)
157    }
158}
159
160/// An HTTP request to be processed by the crawler.
161///
162/// [`Request`] is the primary data structure for representing outgoing HTTP
163/// requests in the spider framework. It contains all information needed to
164/// execute an HTTP request, including the URL, method, headers, body, and
165/// optional metadata.
166///
167/// ## Memory Efficiency
168///
169/// The `meta` field uses lazy initialization - the metadata map is only
170/// allocated when actually used. This reduces memory overhead for simple
171/// requests that don't need metadata.
172///
173/// ## Example
174///
175/// ```rust
176/// use spider_util::request::Request;
177/// use url::Url;
178///
179/// // Create a basic GET request
180/// let request = Request::new(Url::parse("https://example.com").unwrap());
181///
182/// // Build a request with headers and method
183/// let post_request = Request::new(Url::parse("https://api.example.com").unwrap())
184///     .with_method(reqwest::Method::POST)
185///     .with_header("Accept", "application/json")
186///     .unwrap();
187/// ```
188#[derive(Debug, Clone)]
189pub struct Request {
190    /// The target URL for this request.
191    pub url: Url,
192    /// The HTTP method (GET, POST, etc.).
193    pub method: reqwest::Method,
194    /// HTTP headers for the request.
195    pub headers: http::header::HeaderMap,
196    /// Optional request body.
197    pub body: Option<Body>,
198    /// Lazy-initialized metadata - only allocated when actually used.
199    /// This reduces memory allocation for simple requests without metadata.
200    meta: Option<Arc<DashMap<String, Value>>>,
201}
202
203// Custom serialization for Request struct
204impl Serialize for Request {
205    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
206    where
207        S: serde::Serializer,
208    {
209        use serde::ser::SerializeStruct;
210        // Convert HeaderMap to a serializable format
211        let headers_vec: Vec<(String, String)> = self
212            .headers
213            .iter()
214            .filter_map(|(name, value)| {
215                value
216                    .to_str()
217                    .ok()
218                    .map(|val_str| (name.as_str().to_string(), val_str.to_string()))
219            })
220            .collect();
221
222        let mut s = serializer.serialize_struct("Request", 5)?;
223        s.serialize_field("url", &self.url.as_str())?;
224        s.serialize_field("method", &self.method.as_str())?;
225        s.serialize_field("headers", &headers_vec)?;
226        s.serialize_field("body", &self.body)?;
227        // Serialize meta as empty HashMap if None (for backward compatibility)
228        let meta_map: HashMap<String, Value> = self
229            .meta
230            .as_ref()
231            .map(|m| m.iter().map(|e| (e.key().clone(), e.value().clone())).collect())
232            .unwrap_or_default();
233        s.serialize_field("meta", &meta_map)?;
234        s.end()
235    }
236}
237
238impl<'de> Deserialize<'de> for Request {
239    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
240    where
241        D: serde::Deserializer<'de>,
242    {
243        use serde::de::{self, MapAccess, Visitor};
244        use std::fmt;
245
246        #[derive(Deserialize)]
247        #[serde(field_identifier, rename_all = "lowercase")]
248        enum Field {
249            Url,
250            Method,
251            Headers,
252            Body,
253            Meta,
254        }
255
256        struct RequestVisitor;
257
258        impl<'de> Visitor<'de> for RequestVisitor {
259            type Value = Request;
260
261            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
262                formatter.write_str("struct Request")
263            }
264
265            fn visit_map<V>(self, mut map: V) -> Result<Request, V::Error>
266            where
267                V: MapAccess<'de>,
268            {
269                let mut url = None;
270                let mut method = None;
271                let mut headers = None;
272                let mut body = None;
273                let mut meta = None;
274
275                while let Some(key) = map.next_key()? {
276                    match key {
277                        Field::Url => {
278                            if url.is_some() {
279                                return Err(de::Error::duplicate_field("url"));
280                            }
281                            let url_str: String = map.next_value()?;
282                            let parsed_url = Url::parse(&url_str).map_err(de::Error::custom)?;
283                            url = Some(parsed_url);
284                        }
285                        Field::Method => {
286                            if method.is_some() {
287                                return Err(de::Error::duplicate_field("method"));
288                            }
289                            let method_str: String = map.next_value()?;
290                            let parsed_method =
291                                Method::from_str(&method_str).map_err(de::Error::custom)?;
292                            method = Some(parsed_method);
293                        }
294                        Field::Headers => {
295                            if headers.is_some() {
296                                return Err(de::Error::duplicate_field("headers"));
297                            }
298                            // Deserialize headers vector and convert back to HeaderMap
299                            let headers_vec: Vec<(String, String)> = map.next_value()?;
300                            let mut header_map = HeaderMap::new();
301                            for (name, value) in headers_vec {
302                                if let Ok(header_name) =
303                                    http::header::HeaderName::from_bytes(name.as_bytes())
304                                    && let Ok(header_value) =
305                                        http::header::HeaderValue::from_str(&value)
306                                {
307                                    header_map.insert(header_name, header_value);
308                                }
309                            }
310                            headers = Some(header_map);
311                        }
312                        Field::Body => {
313                            if body.is_some() {
314                                return Err(de::Error::duplicate_field("body"));
315                            }
316                            body = Some(map.next_value()?);
317                        }
318                        Field::Meta => {
319                            // Deserialize meta HashMap and convert to DashMap
320                            let meta_map: HashMap<String, Value> = map.next_value()?;
321                            if !meta_map.is_empty() {
322                                let dashmap = DashMap::new();
323                                for (k, v) in meta_map {
324                                    dashmap.insert(k, v);
325                                }
326                                meta = Some(Arc::new(dashmap));
327                            }
328                        }
329                    }
330                }
331
332                let url = url.ok_or_else(|| de::Error::missing_field("url"))?;
333                let method = method.ok_or_else(|| de::Error::missing_field("method"))?;
334                let headers = headers.ok_or_else(|| de::Error::missing_field("headers"))?;
335                let body = body; // Optional field
336
337                Ok(Request {
338                    url,
339                    method,
340                    headers,
341                    body,
342                    meta, // May be None if no meta was serialized
343                })
344            }
345        }
346
347        const FIELDS: &[&str] = &["url", "method", "headers", "body", "meta"];
348        deserializer.deserialize_struct("Request", FIELDS, RequestVisitor)
349    }
350}
351
352impl Default for Request {
353    fn default() -> Self {
354        Self {
355            url: Url::parse("http://default.invalid").unwrap(),
356            method: reqwest::Method::GET,
357            headers: http::header::HeaderMap::new(),
358            body: None,
359            meta: None, // Lazy initialization - no allocation until needed
360        }
361    }
362}
363
364impl Request {
365    /// Creates a new [`Request`] with the given URL.
366    ///
367    /// Does not allocate memory for metadata unless [`with_meta`](Request::with_meta) is called.
368    ///
369    /// ## Example
370    ///
371    /// ```rust
372    /// use spider_util::request::Request;
373    /// use url::Url;
374    ///
375    /// let request = Request::new(Url::parse("https://example.com").unwrap());
376    /// ```
377    pub fn new(url: Url) -> Self {
378        Request {
379            url,
380            method: reqwest::Method::GET,
381            headers: http::header::HeaderMap::new(),
382            body: None,
383            meta: None,
384        }
385    }
386
387    /// Sets the HTTP method for the request.
388    ///
389    /// ## Example
390    ///
391    /// ```rust
392    /// use spider_util::request::Request;
393    /// use url::Url;
394    ///
395    /// let request = Request::new(Url::parse("https://example.com").unwrap())
396    ///     .with_method(reqwest::Method::POST);
397    /// ```
398    pub fn with_method(mut self, method: reqwest::Method) -> Self {
399        self.method = method;
400        self
401    }
402
403    /// Adds a header to the request.
404    ///
405    /// # Errors
406    ///
407    /// Returns a [`SpiderError::HeaderValueError`] if the header name or value is invalid.
408    ///
409    /// ## Example
410    ///
411    /// ```rust
412    /// use spider_util::request::Request;
413    /// use url::Url;
414    ///
415    /// let request = Request::new(Url::parse("https://example.com").unwrap())
416    ///     .with_header("Accept", "application/json")
417    ///     .unwrap();
418    /// ```
419    pub fn with_header(mut self, name: &str, value: &str) -> Result<Self, SpiderError> {
420        let header_name =
421            reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|e| {
422                SpiderError::HeaderValueError(format!("Invalid header name '{}': {}", name, e))
423            })?;
424        let header_value = reqwest::header::HeaderValue::from_str(value).map_err(|e| {
425            SpiderError::HeaderValueError(format!("Invalid header value '{}': {}", value, e))
426        })?;
427
428        self.headers.insert(header_name, header_value);
429        Ok(self)
430    }
431
432    /// Sets the body of the request and defaults the method to POST.
433    ///
434    /// ## Example
435    ///
436    /// ```rust
437    /// use spider_util::request::{Request, Body};
438    /// use url::Url;
439    /// use serde_json::json;
440    ///
441    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
442    ///     .with_body(Body::Json(json!({"key": "value"})));
443    /// ```
444    pub fn with_body(mut self, body: Body) -> Self {
445        self.body = Some(body);
446        self.with_method(reqwest::Method::POST)
447    }
448
449    /// Sets the body of the request to a JSON value and defaults the method to POST.
450    ///
451    /// ## Example
452    ///
453    /// ```rust
454    /// use spider_util::request::Request;
455    /// use url::Url;
456    /// use serde_json::json;
457    ///
458    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
459    ///     .with_json(json!({"name": "test"}));
460    /// ```
461    pub fn with_json(self, json: serde_json::Value) -> Self {
462        self.with_body(Body::Json(json))
463    }
464
465    /// Sets the body of the request to form data and defaults the method to POST.
466    ///
467    /// ## Example
468    ///
469    /// ```rust
470    /// use spider_util::request::Request;
471    /// use url::Url;
472    /// use dashmap::DashMap;
473    ///
474    /// let mut form = DashMap::new();
475    /// form.insert("key".to_string(), "value".to_string());
476    ///
477    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
478    ///     .with_form(form);
479    /// ```
480    pub fn with_form(self, form: DashMap<String, String>) -> Self {
481        self.with_body(Body::Form(form))
482    }
483
484    /// Sets the body of the request to raw bytes and defaults the method to POST.
485    ///
486    /// ## Example
487    ///
488    /// ```rust
489    /// use spider_util::request::Request;
490    /// use url::Url;
491    /// use bytes::Bytes;
492    ///
493    /// let data = Bytes::from("binary data");
494    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
495    ///     .with_bytes(data);
496    /// ```
497    pub fn with_bytes(self, bytes: bytes::Bytes) -> Self {
498        self.with_body(Body::Bytes(bytes))
499    }
500
501    /// Adds a value to the request's metadata.
502    ///
503    /// Lazily allocates the metadata map on first use.
504    ///
505    /// ## Example
506    ///
507    /// ```rust
508    /// use spider_util::request::Request;
509    /// use url::Url;
510    /// use serde_json::json;
511    ///
512    /// let request = Request::new(Url::parse("https://example.com").unwrap())
513    ///     .with_meta("priority", json!(1))
514    ///     .with_meta("source", json!("manual"));
515    /// ```
516    pub fn with_meta(mut self, key: &str, value: serde_json::Value) -> Self {
517        self.meta
518            .get_or_insert_with(|| Arc::new(DashMap::new()))
519            .insert(key.to_string(), value);
520        self
521    }
522
523    /// Gets a reference to a metadata value, if it exists.
524    ///
525    /// Returns `None` if the key doesn't exist or if metadata hasn't been set.
526    pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
527        self.meta
528            .as_ref()
529            .and_then(|m| m.get(key).map(|e| e.value().clone()))
530    }
531
532    /// Returns `true` if the request has metadata.
533    pub fn has_meta(&self) -> bool {
534        self.meta.as_ref().is_some_and(|m| !m.is_empty())
535    }
536
537    /// Returns a reference to the internal metadata map, if it exists.
538    pub fn meta_map(&self) -> Option<&Arc<DashMap<String, serde_json::Value>>> {
539        self.meta.as_ref()
540    }
541
542    /// Inserts a value into metadata, creating the map if needed.
543    ///
544    /// This is intended for internal framework use.
545    pub fn insert_meta(&mut self, key: String, value: serde_json::Value) {
546        self.meta
547            .get_or_insert_with(|| Arc::new(DashMap::new()))
548            .insert(key, value);
549    }
550
551    /// Gets a value from metadata using DashMap's API.
552    ///
553    /// This is intended for internal framework use where direct access is needed.
554    pub fn get_meta_ref(&self, key: &str) -> Option<dashmap::mapref::one::Ref<'_, String, serde_json::Value>> {
555        self.meta.as_ref().and_then(|m| m.get(key))
556    }
557
558    /// Sets the metadata map directly.
559    ///
560    /// Used for internal framework operations.
561    pub fn set_meta_from_option(&mut self, meta: Option<Arc<DashMap<String, serde_json::Value>>>) {
562        self.meta = meta;
563    }
564
565    /// Clones the metadata map.
566    ///
567    /// Used for internal framework operations where metadata needs to be copied.
568    pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
569        self.meta.clone()
570    }
571
572    /// Takes the metadata map, leaving `None` in its place.
573    ///
574    /// Used for internal framework operations.
575    pub fn take_meta(&mut self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
576        self.meta.take()
577    }
578
579    /// Returns a reference to the metadata Arc for internal framework use.
580    pub fn meta_inner(&self) -> &Option<Arc<DashMap<String, serde_json::Value>>> {
581        &self.meta
582    }
583
584    const RETRY_ATTEMPTS_KEY: &str = "retry_attempts";
585
586    /// Gets the number of times the request has been retried.
587    ///
588    /// Returns `0` if no retry attempts have been recorded.
589    pub fn get_retry_attempts(&self) -> u32 {
590        self.meta
591            .as_ref()
592            .and_then(|m| m.get(Self::RETRY_ATTEMPTS_KEY))
593            .and_then(|v| v.value().as_u64())
594            .unwrap_or(0) as u32
595    }
596
597    /// Increments the retry count for the request.
598    ///
599    /// Lazily allocates the metadata map if not already present.
600    pub fn increment_retry_attempts(&mut self) {
601        let current_attempts = self.get_retry_attempts();
602        self.meta
603            .get_or_insert_with(|| Arc::new(DashMap::new()))
604            .insert(Self::RETRY_ATTEMPTS_KEY.to_string(), serde_json::Value::from(current_attempts + 1));
605    }
606
607    /// Generates a unique fingerprint for the request based on its URL, method, and body.
608    ///
609    /// The fingerprint is used for duplicate detection and caching. It combines:
610    /// - The request URL
611    /// - The HTTP method
612    /// - The request body (if present)
613    ///
614    /// ## Example
615    ///
616    /// ```rust
617    /// use spider_util::request::Request;
618    /// use url::Url;
619    ///
620    /// let request = Request::new(Url::parse("https://example.com").unwrap());
621    /// let fingerprint = request.fingerprint();
622    /// ```
623    pub fn fingerprint(&self) -> String {
624        let mut hasher = XxHash64::default();
625        hasher.write(self.url.as_str().as_bytes());
626        hasher.write(self.method.as_str().as_bytes());
627
628        if let Some(ref body) = self.body {
629            match body {
630                Body::Json(json_val) => {
631                    if let Ok(serialized) = serde_json::to_string(json_val) {
632                        hasher.write(serialized.as_bytes());
633                    }
634                }
635                Body::Form(form_val) => {
636                    // Optimized: hash components directly without building intermediate String
637                    for r in form_val.iter() {
638                        hasher.write(r.key().as_bytes());
639                        hasher.write(r.value().as_bytes());
640                    }
641                }
642                Body::Bytes(bytes_val) => {
643                    hasher.write(bytes_val);
644                }
645            }
646        }
647        format!("{:x}", hasher.finish())
648    }
649}