Skip to main content

spider_lib/
request.rs

1//! Data structures for representing HTTP requests in `spider-lib`.
2//!
3//! This module defines the `Request` struct, which is a central component
4//! for constructing and managing outgoing HTTP requests within the
5//! `spider-lib` framework. It encapsulates all necessary details of an
6//! HTTP request, including:
7//! - The target URL and HTTP method.
8//! - Request headers and an optional request body (supporting JSON, form data, or raw bytes).
9//! - Metadata for tracking retry attempts or other custom information.
10//!
11//! Additionally, the module provides methods for building requests,
12//! incrementing retry counters, and generating unique fingerprints
13//! for request deduplication and caching.
14
15use crate::error::SpiderError;
16use bytes::Bytes;
17use dashmap::DashMap;
18use hex;
19use http::header::HeaderMap;
20use reqwest::{Method, Url};
21use serde::{Deserialize, Serialize};
22use serde_json;
23use serde_with::{DisplayFromStr, serde_as};
24use sha2::{Digest, Sha256};
25use std::borrow::Cow;
26
27#[derive(Debug, Clone, Serialize, Deserialize)]
28pub enum Body {
29    Json(serde_json::Value),
30    Form(DashMap<String, String>),
31    Bytes(Bytes),
32}
33
34#[serde_as]
35#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct Request {
37    pub url: Url,
38    #[serde_as(as = "DisplayFromStr")]
39    pub method: Method,
40    #[serde(
41        serialize_with = "header_serde::serialize_headers",
42        deserialize_with = "header_serde::deserialize_headers"
43    )]
44    pub headers: HeaderMap,
45    pub body: Option<Body>,
46    #[serde(skip)]
47    pub meta: DashMap<Cow<'static, str>, serde_json::Value>,
48}
49
50impl Default for Request {
51    fn default() -> Self {
52        Self {
53            url: Url::parse("http://default.invalid").unwrap(),
54            method: Method::GET,
55            headers: HeaderMap::new(),
56            body: None,
57            meta: DashMap::new(),
58        }
59    }
60}
61
62impl Request {
63    /// Creates a new `Request` with the given URL.
64    pub fn new(url: Url) -> Self {
65        Request {
66            url,
67            method: Method::GET,
68            headers: HeaderMap::new(),
69            body: None,
70            meta: DashMap::new(),
71        }
72    }
73
74    /// Sets the HTTP method for the request.
75    pub fn with_method(mut self, method: Method) -> Self {
76        self.method = method;
77        self
78    }
79
80    /// Adds a header to the request.
81    pub fn with_header(mut self, name: &str, value: &str) -> Result<Self, SpiderError> {
82        let header_name =
83            reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|e| {
84                SpiderError::HeaderValueError(format!("Invalid header name '{}': {}", name, e))
85            })?;
86        let header_value = reqwest::header::HeaderValue::from_str(value).map_err(|e| {
87            SpiderError::HeaderValueError(format!("Invalid header value '{}': {}", value, e))
88        })?;
89
90        self.headers.insert(header_name, header_value);
91        Ok(self)
92    }
93
94    /// Sets the body of the request and defaults the method to POST.
95    pub fn with_body(mut self, body: Body) -> Self {
96        self.body = Some(body);
97        self.with_method(Method::POST)
98    }
99
100    /// Sets the body of the request to a JSON value.
101    pub fn with_json(self, json: serde_json::Value) -> Self {
102        self.with_body(Body::Json(json))
103    }
104
105    /// Sets the body of the request to a form.
106    pub fn with_form(self, form: DashMap<String, String>) -> Self {
107        self.with_body(Body::Form(form))
108    }
109
110    /// Sets the body of the request to a byte slice.
111    pub fn with_bytes(self, bytes: Bytes) -> Self {
112        self.with_body(Body::Bytes(bytes))
113    }
114
115    /// Adds a value to the request's metadata.
116    pub fn with_meta(self, key: &str, value: serde_json::Value) -> Self {
117        self.meta.insert(Cow::Owned(key.to_owned()), value);
118        self
119    }
120
121    const RETRY_ATTEMPTS_KEY: &str = "retry_attempts";
122
123    /// Gets the number of times the request has been retried.
124    pub fn get_retry_attempts(&self) -> u32 {
125        self.meta
126            .get(Self::RETRY_ATTEMPTS_KEY)
127            .and_then(|v| v.value().as_u64())
128            .unwrap_or(0) as u32
129    }
130
131    /// Increments the retry count for the request.
132    pub fn increment_retry_attempts(&mut self) {
133        let current_attempts = self.get_retry_attempts();
134        self.meta.insert(
135            Cow::Borrowed(Self::RETRY_ATTEMPTS_KEY),
136            serde_json::to_value(current_attempts + 1).unwrap(),
137        );
138    }
139
140    /// Generates a unique fingerprint for the request based on its URL, method, and body.
141    pub fn fingerprint(&self) -> String {
142        let mut hasher = Sha256::new();
143        hasher.update(self.url.as_str().as_bytes());
144        hasher.update(self.method.as_str().as_bytes());
145
146        if let Some(ref body) = self.body {
147            match body {
148                Body::Json(json_val) => {
149                    if let Ok(serialized) = serde_json::to_string(json_val) {
150                        hasher.update(serialized.as_bytes());
151                    }
152                }
153                Body::Form(form_val) => {
154                    let mut form_string = String::new();
155                    for r in form_val.iter() {
156                        form_string.push_str(r.key());
157                        form_string.push_str(r.value());
158                    }
159                    hasher.update(form_string.as_bytes());
160                }
161                Body::Bytes(bytes_val) => {
162                    hasher.update(bytes_val);
163                }
164            }
165        }
166        hex::encode(hasher.finalize())
167    }
168}
169
170mod header_serde {
171    use super::*;
172    use reqwest::header::{HeaderName, HeaderValue};
173    use serde::{Deserializer, Serializer};
174    use std::str::FromStr;
175
176    pub fn serialize_headers<S>(headers: &HeaderMap, serializer: S) -> Result<S::Ok, S::Error>
177    where
178        S: Serializer,
179    {
180        let map: Vec<(&str, &str)> = headers
181            .iter()
182            .filter_map(|(name, value)| {
183                value
184                    .to_str()
185                    .ok()
186                    .map(|value_str| (name.as_str(), value_str))
187            })
188            .collect();
189        map.serialize(serializer)
190    }
191
192    pub fn deserialize_headers<'de, D>(deserializer: D) -> Result<HeaderMap, D::Error>
193    where
194        D: Deserializer<'de>,
195    {
196        let vec = Vec::<(&str, &str)>::deserialize(deserializer)?;
197        let mut headers = HeaderMap::new();
198        for (name, value) in vec {
199            let header_name = HeaderName::from_str(name).map_err(serde::de::Error::custom)?;
200            let header_value = HeaderValue::from_str(value).map_err(serde::de::Error::custom)?;
201            headers.insert(header_name, header_value);
202        }
203        Ok(headers)
204    }
205}