Skip to main content

spider_util/
request.rs

1//! Data structures for representing HTTP requests in `spider-lib`.
2//!
3//! This module defines the [`Request`] struct, which is a central component
4//! for constructing and managing outgoing HTTP requests within the
5//! `spider-lib` framework. It encapsulates all necessary details of an
6//! HTTP request, including:
7//! - The target URL and HTTP method
8//! - Request headers and an optional request body (supporting JSON, form data, or raw bytes)
9//! - Metadata for tracking retry attempts or other custom information
10//!
11//! Additionally, the module provides methods for building requests,
12//! incrementing retry counters, and generating unique fingerprints
13//! for request deduplication and caching.
14//!
15//! ## Example
16//!
17//! ```rust
18//! use spider_util::request::{Request, Body};
19//! use url::Url;
20//! use serde_json::json;
21//!
22//! // Create a simple GET request
23//! let url = Url::parse("https://example.com").unwrap();
24//! let request = Request::new(url);
25//!
26//! // Create a POST request with JSON body
27//! let post_request = Request::new(Url::parse("https://api.example.com/data").unwrap())
28//!     .with_method(reqwest::Method::POST)
29//!     .with_json(json!({"key": "value"}));
30//! ```
31
32use bytes::Bytes;
33use dashmap::DashMap;
34use http::header::HeaderMap;
35use reqwest::{Method, Url};
36use serde::{Deserialize, Serialize};
37use serde_json::Value;
38use std::collections::HashMap;
39use std::hash::Hasher;
40use std::str::FromStr;
41use std::sync::Arc;
42use twox_hash::XxHash64;
43
44use crate::error::SpiderError;
45
46/// The body of an HTTP request.
47///
48/// [`Body`] encapsulates the different types of request bodies that can be sent
49/// with an HTTP request. It supports JSON payloads, form data, and raw bytes.
50///
51/// ## Variants
52///
53/// - `Json`: A JSON value (typically an object or array)
54/// - `Form`: Key-value form data encoded as `application/x-www-form-urlencoded`
55/// - `Bytes`: Raw binary data
56///
57/// ## Example
58///
59/// ```rust
60/// use spider_util::request::Body;
61/// use serde_json::json;
62/// use dashmap::DashMap;
63/// use bytes::Bytes;
64///
65/// // JSON body
66/// let json_body = Body::Json(json!({"name": "test"}));
67///
68/// // Form data
69/// let mut form = DashMap::new();
70/// form.insert("key".to_string(), "value".to_string());
71/// let form_body = Body::Form(form);
72///
73/// // Raw bytes
74/// let bytes_body = Body::Bytes(Bytes::from("raw data"));
75/// ```
76#[derive(Debug, Clone)]
77pub enum Body {
78    /// JSON payload.
79    Json(serde_json::Value),
80    /// Form data (key-value pairs).
81    Form(DashMap<String, String>),
82    /// Raw binary data.
83    Bytes(Bytes),
84}
85
86// Custom serialization for Body enum
87impl Serialize for Body {
88    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
89    where
90        S: serde::Serializer,
91    {
92        use serde::ser::SerializeMap;
93        let mut map = serializer.serialize_map(Some(1))?;
94
95        match self {
96            Body::Json(value) => map.serialize_entry("Json", value)?,
97            Body::Form(dashmap) => {
98                let hmap: HashMap<String, String> = dashmap.clone().into_iter().collect();
99                map.serialize_entry("Form", &hmap)?
100            }
101            Body::Bytes(bytes) => map.serialize_entry("Bytes", bytes)?,
102        }
103
104        map.end()
105    }
106}
107
108impl<'de> Deserialize<'de> for Body {
109    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
110    where
111        D: serde::Deserializer<'de>,
112    {
113        use serde::de::{self, MapAccess, Visitor};
114        use std::fmt;
115
116        struct BodyVisitor;
117
118        impl<'de> Visitor<'de> for BodyVisitor {
119            type Value = Body;
120
121            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
122                formatter.write_str("a body object")
123            }
124
125            fn visit_map<V>(self, mut map: V) -> Result<Body, V::Error>
126            where
127                V: MapAccess<'de>,
128            {
129                let entry = map.next_entry::<String, Value>()?;
130                let (key, value) = match entry {
131                    Some((k, v)) => (k, v),
132                    None => return Err(de::Error::custom("Expected a body variant")),
133                };
134
135                match key.as_str() {
136                    "Json" => Ok(Body::Json(value)),
137                    "Form" => {
138                        let form_data: HashMap<String, String> =
139                            serde_json::from_value(value).map_err(de::Error::custom)?;
140                        let dashmap = DashMap::new();
141                        for (k, v) in form_data {
142                            dashmap.insert(k, v);
143                        }
144                        Ok(Body::Form(dashmap))
145                    }
146                    "Bytes" => {
147                        let bytes: Bytes =
148                            serde_json::from_value(value).map_err(de::Error::custom)?;
149                        Ok(Body::Bytes(bytes))
150                    }
151                    _ => Err(de::Error::custom(format!("Unknown body variant: {}", key))),
152                }
153            }
154        }
155
156        deserializer.deserialize_map(BodyVisitor)
157    }
158}
159
160/// An HTTP request to be processed by the crawler.
161///
162/// [`Request`] is the primary data structure for representing outgoing HTTP
163/// requests in the spider framework. It contains all information needed to
164/// execute an HTTP request, including the URL, method, headers, body, and
165/// optional metadata.
166///
167/// ## Memory Efficiency
168///
169/// The `meta` field uses lazy initialization - the metadata map is only
170/// allocated when actually used. This reduces memory overhead for simple
171/// requests that don't need metadata.
172///
173/// ## Example
174///
175/// ```rust
176/// use spider_util::request::Request;
177/// use url::Url;
178///
179/// // Create a basic GET request
180/// let request = Request::new(Url::parse("https://example.com").unwrap());
181///
182/// // Build a request with headers and method
183/// let post_request = Request::new(Url::parse("https://api.example.com").unwrap())
184///     .with_method(reqwest::Method::POST)
185///     .with_header("Accept", "application/json")
186///     .unwrap();
187/// ```
188#[derive(Debug, Clone)]
189pub struct Request {
190    /// The target URL for this request.
191    pub url: Url,
192    /// The HTTP method (GET, POST, etc.).
193    pub method: reqwest::Method,
194    /// HTTP headers for the request.
195    pub headers: http::header::HeaderMap,
196    /// Optional request body.
197    pub body: Option<Body>,
198    /// Lazy-initialized metadata - only allocated when actually used.
199    /// This reduces memory allocation for simple requests without metadata.
200    meta: Option<Arc<DashMap<String, Value>>>,
201}
202
203// Custom serialization for Request struct
204impl Serialize for Request {
205    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
206    where
207        S: serde::Serializer,
208    {
209        use serde::ser::SerializeStruct;
210        // Convert HeaderMap to a serializable format
211        let headers_vec: Vec<(String, String)> = self
212            .headers
213            .iter()
214            .filter_map(|(name, value)| {
215                value
216                    .to_str()
217                    .ok()
218                    .map(|val_str| (name.as_str().to_string(), val_str.to_string()))
219            })
220            .collect();
221
222        let mut s = serializer.serialize_struct("Request", 5)?;
223        s.serialize_field("url", &self.url.as_str())?;
224        s.serialize_field("method", &self.method.as_str())?;
225        s.serialize_field("headers", &headers_vec)?;
226        s.serialize_field("body", &self.body)?;
227        // Serialize meta as empty HashMap if None (for backward compatibility)
228        let meta_map: HashMap<String, Value> = self
229            .meta
230            .as_ref()
231            .map(|m| {
232                m.iter()
233                    .map(|e| (e.key().clone(), e.value().clone()))
234                    .collect()
235            })
236            .unwrap_or_default();
237        s.serialize_field("meta", &meta_map)?;
238        s.end()
239    }
240}
241
242impl<'de> Deserialize<'de> for Request {
243    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
244    where
245        D: serde::Deserializer<'de>,
246    {
247        use serde::de::{self, MapAccess, Visitor};
248        use std::fmt;
249
250        #[derive(Deserialize)]
251        #[serde(field_identifier, rename_all = "lowercase")]
252        enum Field {
253            Url,
254            Method,
255            Headers,
256            Body,
257            Meta,
258        }
259
260        struct RequestVisitor;
261
262        impl<'de> Visitor<'de> for RequestVisitor {
263            type Value = Request;
264
265            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
266                formatter.write_str("struct Request")
267            }
268
269            fn visit_map<V>(self, mut map: V) -> Result<Request, V::Error>
270            where
271                V: MapAccess<'de>,
272            {
273                let mut url = None;
274                let mut method = None;
275                let mut headers = None;
276                let mut body = None;
277                let mut meta = None;
278
279                while let Some(key) = map.next_key()? {
280                    match key {
281                        Field::Url => {
282                            if url.is_some() {
283                                return Err(de::Error::duplicate_field("url"));
284                            }
285                            let url_str: String = map.next_value()?;
286                            let parsed_url = Url::parse(&url_str).map_err(de::Error::custom)?;
287                            url = Some(parsed_url);
288                        }
289                        Field::Method => {
290                            if method.is_some() {
291                                return Err(de::Error::duplicate_field("method"));
292                            }
293                            let method_str: String = map.next_value()?;
294                            let parsed_method =
295                                Method::from_str(&method_str).map_err(de::Error::custom)?;
296                            method = Some(parsed_method);
297                        }
298                        Field::Headers => {
299                            if headers.is_some() {
300                                return Err(de::Error::duplicate_field("headers"));
301                            }
302                            // Deserialize headers vector and convert back to HeaderMap
303                            let headers_vec: Vec<(String, String)> = map.next_value()?;
304                            let mut header_map = HeaderMap::new();
305                            for (name, value) in headers_vec {
306                                if let Ok(header_name) =
307                                    http::header::HeaderName::from_bytes(name.as_bytes())
308                                    && let Ok(header_value) =
309                                        http::header::HeaderValue::from_str(&value)
310                                {
311                                    header_map.insert(header_name, header_value);
312                                }
313                            }
314                            headers = Some(header_map);
315                        }
316                        Field::Body => {
317                            if body.is_some() {
318                                return Err(de::Error::duplicate_field("body"));
319                            }
320                            body = Some(map.next_value()?);
321                        }
322                        Field::Meta => {
323                            // Deserialize meta HashMap and convert to DashMap
324                            let meta_map: HashMap<String, Value> = map.next_value()?;
325                            if !meta_map.is_empty() {
326                                let dashmap = DashMap::new();
327                                for (k, v) in meta_map {
328                                    dashmap.insert(k, v);
329                                }
330                                meta = Some(Arc::new(dashmap));
331                            }
332                        }
333                    }
334                }
335
336                let url = url.ok_or_else(|| de::Error::missing_field("url"))?;
337                let method = method.ok_or_else(|| de::Error::missing_field("method"))?;
338                let headers = headers.ok_or_else(|| de::Error::missing_field("headers"))?;
339                let body = body; // Optional field
340
341                Ok(Request {
342                    url,
343                    method,
344                    headers,
345                    body,
346                    meta, // May be None if no meta was serialized
347                })
348            }
349        }
350
351        const FIELDS: &[&str] = &["url", "method", "headers", "body", "meta"];
352        deserializer.deserialize_struct("Request", FIELDS, RequestVisitor)
353    }
354}
355
356impl Default for Request {
357    fn default() -> Self {
358        let default_url = match Url::parse("http://default.invalid") {
359            Ok(url) => url,
360            Err(err) => panic!("invalid hardcoded default URL: {}", err),
361        };
362        Self {
363            url: default_url,
364            method: reqwest::Method::GET,
365            headers: http::header::HeaderMap::new(),
366            body: None,
367            meta: None, // Lazy initialization - no allocation until needed
368        }
369    }
370}
371
372impl Request {
373    /// Creates a new [`Request`] with the given URL.
374    ///
375    /// Does not allocate memory for metadata unless [`with_meta`](Request::with_meta) is called.
376    ///
377    /// ## Example
378    ///
379    /// ```rust
380    /// use spider_util::request::Request;
381    /// use url::Url;
382    ///
383    /// let request = Request::new(Url::parse("https://example.com").unwrap());
384    /// ```
385    pub fn new(url: Url) -> Self {
386        Request {
387            url,
388            method: reqwest::Method::GET,
389            headers: http::header::HeaderMap::new(),
390            body: None,
391            meta: None,
392        }
393    }
394
395    /// Sets the HTTP method for the request.
396    ///
397    /// ## Example
398    ///
399    /// ```rust
400    /// use spider_util::request::Request;
401    /// use url::Url;
402    ///
403    /// let request = Request::new(Url::parse("https://example.com").unwrap())
404    ///     .with_method(reqwest::Method::POST);
405    /// ```
406    pub fn with_method(mut self, method: reqwest::Method) -> Self {
407        self.method = method;
408        self
409    }
410
411    /// Adds a header to the request.
412    ///
413    /// # Errors
414    ///
415    /// Returns a [`SpiderError::HeaderValueError`] if the header name or value is invalid.
416    ///
417    /// ## Example
418    ///
419    /// ```rust
420    /// use spider_util::request::Request;
421    /// use url::Url;
422    ///
423    /// let request = Request::new(Url::parse("https://example.com").unwrap())
424    ///     .with_header("Accept", "application/json")
425    ///     .unwrap();
426    /// ```
427    pub fn with_header(mut self, name: &str, value: &str) -> Result<Self, SpiderError> {
428        let header_name =
429            reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|e| {
430                SpiderError::HeaderValueError(format!("Invalid header name '{}': {}", name, e))
431            })?;
432        let header_value = reqwest::header::HeaderValue::from_str(value).map_err(|e| {
433            SpiderError::HeaderValueError(format!("Invalid header value '{}': {}", value, e))
434        })?;
435
436        self.headers.insert(header_name, header_value);
437        Ok(self)
438    }
439
440    /// Sets the body of the request and defaults the method to POST.
441    ///
442    /// ## Example
443    ///
444    /// ```rust
445    /// use spider_util::request::{Request, Body};
446    /// use url::Url;
447    /// use serde_json::json;
448    ///
449    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
450    ///     .with_body(Body::Json(json!({"key": "value"})));
451    /// ```
452    pub fn with_body(mut self, body: Body) -> Self {
453        self.body = Some(body);
454        self.with_method(reqwest::Method::POST)
455    }
456
457    /// Sets the body of the request to a JSON value and defaults the method to POST.
458    ///
459    /// ## Example
460    ///
461    /// ```rust
462    /// use spider_util::request::Request;
463    /// use url::Url;
464    /// use serde_json::json;
465    ///
466    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
467    ///     .with_json(json!({"name": "test"}));
468    /// ```
469    pub fn with_json(self, json: serde_json::Value) -> Self {
470        self.with_body(Body::Json(json))
471    }
472
473    /// Sets the body of the request to form data and defaults the method to POST.
474    ///
475    /// ## Example
476    ///
477    /// ```rust
478    /// use spider_util::request::Request;
479    /// use url::Url;
480    /// use dashmap::DashMap;
481    ///
482    /// let mut form = DashMap::new();
483    /// form.insert("key".to_string(), "value".to_string());
484    ///
485    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
486    ///     .with_form(form);
487    /// ```
488    pub fn with_form(self, form: DashMap<String, String>) -> Self {
489        self.with_body(Body::Form(form))
490    }
491
492    /// Sets the body of the request to raw bytes and defaults the method to POST.
493    ///
494    /// ## Example
495    ///
496    /// ```rust
497    /// use spider_util::request::Request;
498    /// use url::Url;
499    /// use bytes::Bytes;
500    ///
501    /// let data = Bytes::from("binary data");
502    /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
503    ///     .with_bytes(data);
504    /// ```
505    pub fn with_bytes(self, bytes: bytes::Bytes) -> Self {
506        self.with_body(Body::Bytes(bytes))
507    }
508
509    /// Adds a value to the request's metadata.
510    ///
511    /// Lazily allocates the metadata map on first use.
512    ///
513    /// ## Example
514    ///
515    /// ```rust
516    /// use spider_util::request::Request;
517    /// use url::Url;
518    /// use serde_json::json;
519    ///
520    /// let request = Request::new(Url::parse("https://example.com").unwrap())
521    ///     .with_meta("priority", json!(1))
522    ///     .with_meta("source", json!("manual"));
523    /// ```
524    pub fn with_meta(mut self, key: &str, value: serde_json::Value) -> Self {
525        self.meta
526            .get_or_insert_with(|| Arc::new(DashMap::new()))
527            .insert(key.to_string(), value);
528        self
529    }
530
531    /// Gets a reference to a metadata value, if it exists.
532    ///
533    /// Returns `None` if the key doesn't exist or if metadata hasn't been set.
534    pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
535        self.meta
536            .as_ref()
537            .and_then(|m| m.get(key).map(|e| e.value().clone()))
538    }
539
540    /// Returns `true` if the request has metadata.
541    pub fn has_meta(&self) -> bool {
542        self.meta.as_ref().is_some_and(|m| !m.is_empty())
543    }
544
545    /// Returns a reference to the internal metadata map, if it exists.
546    pub fn meta_map(&self) -> Option<&Arc<DashMap<String, serde_json::Value>>> {
547        self.meta.as_ref()
548    }
549
550    /// Inserts a value into metadata, creating the map if needed.
551    ///
552    /// This is intended for internal framework use.
553    pub fn insert_meta(&mut self, key: String, value: serde_json::Value) {
554        self.meta
555            .get_or_insert_with(|| Arc::new(DashMap::new()))
556            .insert(key, value);
557    }
558
559    /// Gets a value from metadata using DashMap's API.
560    ///
561    /// This is intended for internal framework use where direct access is needed.
562    pub fn get_meta_ref(
563        &self,
564        key: &str,
565    ) -> Option<dashmap::mapref::one::Ref<'_, String, serde_json::Value>> {
566        self.meta.as_ref().and_then(|m| m.get(key))
567    }
568
569    /// Sets the metadata map directly.
570    ///
571    /// Used for internal framework operations.
572    pub fn set_meta_from_option(&mut self, meta: Option<Arc<DashMap<String, serde_json::Value>>>) {
573        self.meta = meta;
574    }
575
576    /// Clones the metadata map.
577    ///
578    /// Used for internal framework operations where metadata needs to be copied.
579    pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
580        self.meta.clone()
581    }
582
583    /// Takes the metadata map, leaving `None` in its place.
584    ///
585    /// Used for internal framework operations.
586    pub fn take_meta(&mut self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
587        self.meta.take()
588    }
589
590    /// Returns a reference to the metadata Arc for internal framework use.
591    pub fn meta_inner(&self) -> &Option<Arc<DashMap<String, serde_json::Value>>> {
592        &self.meta
593    }
594
595    const RETRY_ATTEMPTS_KEY: &str = "retry_attempts";
596
597    /// Gets the number of times the request has been retried.
598    ///
599    /// Returns `0` if no retry attempts have been recorded.
600    pub fn get_retry_attempts(&self) -> u32 {
601        self.meta
602            .as_ref()
603            .and_then(|m| m.get(Self::RETRY_ATTEMPTS_KEY))
604            .and_then(|v| v.value().as_u64())
605            .unwrap_or(0) as u32
606    }
607
608    /// Increments the retry count for the request.
609    ///
610    /// Lazily allocates the metadata map if not already present.
611    pub fn increment_retry_attempts(&mut self) {
612        let current_attempts = self.get_retry_attempts();
613        self.meta
614            .get_or_insert_with(|| Arc::new(DashMap::new()))
615            .insert(
616                Self::RETRY_ATTEMPTS_KEY.to_string(),
617                serde_json::Value::from(current_attempts + 1),
618            );
619    }
620
621    /// Generates a unique fingerprint for the request based on its URL, method, and body.
622    ///
623    /// The fingerprint is used for duplicate detection and caching. It combines:
624    /// - The request URL
625    /// - The HTTP method
626    /// - The request body (if present)
627    ///
628    /// ## Example
629    ///
630    /// ```rust
631    /// use spider_util::request::Request;
632    /// use url::Url;
633    ///
634    /// let request = Request::new(Url::parse("https://example.com").unwrap());
635    /// let fingerprint = request.fingerprint();
636    /// ```
637    pub fn fingerprint(&self) -> String {
638        let mut hasher = XxHash64::default();
639        hasher.write(self.url.as_str().as_bytes());
640        hasher.write(self.method.as_str().as_bytes());
641
642        if let Some(ref body) = self.body {
643            match body {
644                Body::Json(json_val) => {
645                    if let Ok(serialized) = serde_json::to_string(json_val) {
646                        hasher.write(serialized.as_bytes());
647                    }
648                }
649                Body::Form(form_val) => {
650                    // Optimized: hash components directly without building intermediate String
651                    for r in form_val.iter() {
652                        hasher.write(r.key().as_bytes());
653                        hasher.write(r.value().as_bytes());
654                    }
655                }
656                Body::Bytes(bytes_val) => {
657                    hasher.write(bytes_val);
658                }
659            }
660        }
661        format!("{:x}", hasher.finish())
662    }
663}