spider_util/request.rs
1//! Data structures for representing HTTP requests in `spider-lib`.
2//!
3//! This module defines the [`Request`] struct, which is a central component
4//! for constructing and managing outgoing HTTP requests within the
5//! `spider-lib` framework. It encapsulates all necessary details of an
6//! HTTP request, including:
7//! - The target URL and HTTP method
8//! - Request headers and an optional request body (supporting JSON, form data, or raw bytes)
9//! - Metadata for tracking retry attempts or other custom information
10//!
11//! Additionally, the module provides methods for building requests,
12//! incrementing retry counters, and generating unique fingerprints
13//! for request deduplication and caching.
14//!
15//! ## Example
16//!
17//! ```rust
18//! use spider_util::request::{Request, Body};
19//! use url::Url;
20//! use serde_json::json;
21//!
22//! // Create a simple GET request
23//! let url = Url::parse("https://example.com").unwrap();
24//! let request = Request::new(url);
25//!
26//! // Create a POST request with JSON body
27//! let post_request = Request::new(Url::parse("https://api.example.com/data").unwrap())
28//! .with_method(reqwest::Method::POST)
29//! .with_json(json!({"key": "value"}));
30//! ```
31
32use bytes::Bytes;
33use dashmap::DashMap;
34use http::header::HeaderMap;
35use reqwest::{Method, Url};
36use serde::{Deserialize, Serialize};
37use serde_json::Value;
38use std::collections::HashMap;
39use std::hash::Hasher;
40use std::str::FromStr;
41use std::sync::Arc;
42use twox_hash::XxHash64;
43
44use crate::error::SpiderError;
45
46/// The body of an HTTP request.
47///
48/// [`Body`] encapsulates the different types of request bodies that can be sent
49/// with an HTTP request. It supports JSON payloads, form data, and raw bytes.
50///
51/// ## Variants
52///
53/// - `Json`: A JSON value (typically an object or array)
54/// - `Form`: Key-value form data encoded as `application/x-www-form-urlencoded`
55/// - `Bytes`: Raw binary data
56///
57/// ## Example
58///
59/// ```rust
60/// use spider_util::request::Body;
61/// use serde_json::json;
62/// use dashmap::DashMap;
63/// use bytes::Bytes;
64///
65/// // JSON body
66/// let json_body = Body::Json(json!({"name": "test"}));
67///
68/// // Form data
69/// let mut form = DashMap::new();
70/// form.insert("key".to_string(), "value".to_string());
71/// let form_body = Body::Form(form);
72///
73/// // Raw bytes
74/// let bytes_body = Body::Bytes(Bytes::from("raw data"));
75/// ```
76#[derive(Debug, Clone)]
77pub enum Body {
78 /// JSON payload.
79 Json(serde_json::Value),
80 /// Form data (key-value pairs).
81 Form(DashMap<String, String>),
82 /// Raw binary data.
83 Bytes(Bytes),
84}
85
86// Custom serialization for Body enum
87impl Serialize for Body {
88 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
89 where
90 S: serde::Serializer,
91 {
92 use serde::ser::SerializeMap;
93 let mut map = serializer.serialize_map(Some(1))?;
94
95 match self {
96 Body::Json(value) => map.serialize_entry("Json", value)?,
97 Body::Form(dashmap) => {
98 let hmap: HashMap<String, String> = dashmap.clone().into_iter().collect();
99 map.serialize_entry("Form", &hmap)?
100 }
101 Body::Bytes(bytes) => map.serialize_entry("Bytes", bytes)?,
102 }
103
104 map.end()
105 }
106}
107
108impl<'de> Deserialize<'de> for Body {
109 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
110 where
111 D: serde::Deserializer<'de>,
112 {
113 use serde::de::{self, MapAccess, Visitor};
114 use std::fmt;
115
116 struct BodyVisitor;
117
118 impl<'de> Visitor<'de> for BodyVisitor {
119 type Value = Body;
120
121 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
122 formatter.write_str("a body object")
123 }
124
125 fn visit_map<V>(self, mut map: V) -> Result<Body, V::Error>
126 where
127 V: MapAccess<'de>,
128 {
129 let entry = map.next_entry::<String, Value>()?;
130 let (key, value) = match entry {
131 Some((k, v)) => (k, v),
132 None => return Err(de::Error::custom("Expected a body variant")),
133 };
134
135 match key.as_str() {
136 "Json" => Ok(Body::Json(value)),
137 "Form" => {
138 let form_data: HashMap<String, String> =
139 serde_json::from_value(value).map_err(de::Error::custom)?;
140 let dashmap = DashMap::new();
141 for (k, v) in form_data {
142 dashmap.insert(k, v);
143 }
144 Ok(Body::Form(dashmap))
145 }
146 "Bytes" => {
147 let bytes: Bytes =
148 serde_json::from_value(value).map_err(de::Error::custom)?;
149 Ok(Body::Bytes(bytes))
150 }
151 _ => Err(de::Error::custom(format!("Unknown body variant: {}", key))),
152 }
153 }
154 }
155
156 deserializer.deserialize_map(BodyVisitor)
157 }
158}
159
160/// An HTTP request to be processed by the crawler.
161///
162/// [`Request`] is the primary data structure for representing outgoing HTTP
163/// requests in the spider framework. It contains all information needed to
164/// execute an HTTP request, including the URL, method, headers, body, and
165/// optional metadata.
166///
167/// ## Memory Efficiency
168///
169/// The `meta` field uses lazy initialization - the metadata map is only
170/// allocated when actually used. This reduces memory overhead for simple
171/// requests that don't need metadata.
172///
173/// ## Example
174///
175/// ```rust
176/// use spider_util::request::Request;
177/// use url::Url;
178///
179/// // Create a basic GET request
180/// let request = Request::new(Url::parse("https://example.com").unwrap());
181///
182/// // Build a request with headers and method
183/// let post_request = Request::new(Url::parse("https://api.example.com").unwrap())
184/// .with_method(reqwest::Method::POST)
185/// .with_header("Accept", "application/json")
186/// .unwrap();
187/// ```
188#[derive(Debug, Clone)]
189pub struct Request {
190 /// The target URL for this request.
191 pub url: Url,
192 /// The HTTP method (GET, POST, etc.).
193 pub method: reqwest::Method,
194 /// HTTP headers for the request.
195 pub headers: http::header::HeaderMap,
196 /// Optional request body.
197 pub body: Option<Body>,
198 /// Lazy-initialized metadata - only allocated when actually used.
199 /// This reduces memory allocation for simple requests without metadata.
200 meta: Option<Arc<DashMap<String, Value>>>,
201}
202
203// Custom serialization for Request struct
204impl Serialize for Request {
205 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
206 where
207 S: serde::Serializer,
208 {
209 use serde::ser::SerializeStruct;
210 // Convert HeaderMap to a serializable format
211 let headers_vec: Vec<(String, String)> = self
212 .headers
213 .iter()
214 .filter_map(|(name, value)| {
215 value
216 .to_str()
217 .ok()
218 .map(|val_str| (name.as_str().to_string(), val_str.to_string()))
219 })
220 .collect();
221
222 let mut s = serializer.serialize_struct("Request", 5)?;
223 s.serialize_field("url", &self.url.as_str())?;
224 s.serialize_field("method", &self.method.as_str())?;
225 s.serialize_field("headers", &headers_vec)?;
226 s.serialize_field("body", &self.body)?;
227 // Serialize meta as empty HashMap if None (for backward compatibility)
228 let meta_map: HashMap<String, Value> = self
229 .meta
230 .as_ref()
231 .map(|m| m.iter().map(|e| (e.key().clone(), e.value().clone())).collect())
232 .unwrap_or_default();
233 s.serialize_field("meta", &meta_map)?;
234 s.end()
235 }
236}
237
238impl<'de> Deserialize<'de> for Request {
239 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
240 where
241 D: serde::Deserializer<'de>,
242 {
243 use serde::de::{self, MapAccess, Visitor};
244 use std::fmt;
245
246 #[derive(Deserialize)]
247 #[serde(field_identifier, rename_all = "lowercase")]
248 enum Field {
249 Url,
250 Method,
251 Headers,
252 Body,
253 Meta,
254 }
255
256 struct RequestVisitor;
257
258 impl<'de> Visitor<'de> for RequestVisitor {
259 type Value = Request;
260
261 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
262 formatter.write_str("struct Request")
263 }
264
265 fn visit_map<V>(self, mut map: V) -> Result<Request, V::Error>
266 where
267 V: MapAccess<'de>,
268 {
269 let mut url = None;
270 let mut method = None;
271 let mut headers = None;
272 let mut body = None;
273 let mut meta = None;
274
275 while let Some(key) = map.next_key()? {
276 match key {
277 Field::Url => {
278 if url.is_some() {
279 return Err(de::Error::duplicate_field("url"));
280 }
281 let url_str: String = map.next_value()?;
282 let parsed_url = Url::parse(&url_str).map_err(de::Error::custom)?;
283 url = Some(parsed_url);
284 }
285 Field::Method => {
286 if method.is_some() {
287 return Err(de::Error::duplicate_field("method"));
288 }
289 let method_str: String = map.next_value()?;
290 let parsed_method =
291 Method::from_str(&method_str).map_err(de::Error::custom)?;
292 method = Some(parsed_method);
293 }
294 Field::Headers => {
295 if headers.is_some() {
296 return Err(de::Error::duplicate_field("headers"));
297 }
298 // Deserialize headers vector and convert back to HeaderMap
299 let headers_vec: Vec<(String, String)> = map.next_value()?;
300 let mut header_map = HeaderMap::new();
301 for (name, value) in headers_vec {
302 if let Ok(header_name) =
303 http::header::HeaderName::from_bytes(name.as_bytes())
304 && let Ok(header_value) =
305 http::header::HeaderValue::from_str(&value)
306 {
307 header_map.insert(header_name, header_value);
308 }
309 }
310 headers = Some(header_map);
311 }
312 Field::Body => {
313 if body.is_some() {
314 return Err(de::Error::duplicate_field("body"));
315 }
316 body = Some(map.next_value()?);
317 }
318 Field::Meta => {
319 // Deserialize meta HashMap and convert to DashMap
320 let meta_map: HashMap<String, Value> = map.next_value()?;
321 if !meta_map.is_empty() {
322 let dashmap = DashMap::new();
323 for (k, v) in meta_map {
324 dashmap.insert(k, v);
325 }
326 meta = Some(Arc::new(dashmap));
327 }
328 }
329 }
330 }
331
332 let url = url.ok_or_else(|| de::Error::missing_field("url"))?;
333 let method = method.ok_or_else(|| de::Error::missing_field("method"))?;
334 let headers = headers.ok_or_else(|| de::Error::missing_field("headers"))?;
335 let body = body; // Optional field
336
337 Ok(Request {
338 url,
339 method,
340 headers,
341 body,
342 meta, // May be None if no meta was serialized
343 })
344 }
345 }
346
347 const FIELDS: &[&str] = &["url", "method", "headers", "body", "meta"];
348 deserializer.deserialize_struct("Request", FIELDS, RequestVisitor)
349 }
350}
351
352impl Default for Request {
353 fn default() -> Self {
354 Self {
355 url: Url::parse("http://default.invalid").unwrap(),
356 method: reqwest::Method::GET,
357 headers: http::header::HeaderMap::new(),
358 body: None,
359 meta: None, // Lazy initialization - no allocation until needed
360 }
361 }
362}
363
364impl Request {
365 /// Creates a new [`Request`] with the given URL.
366 ///
367 /// Does not allocate memory for metadata unless [`with_meta`](Request::with_meta) is called.
368 ///
369 /// ## Example
370 ///
371 /// ```rust
372 /// use spider_util::request::Request;
373 /// use url::Url;
374 ///
375 /// let request = Request::new(Url::parse("https://example.com").unwrap());
376 /// ```
377 pub fn new(url: Url) -> Self {
378 Request {
379 url,
380 method: reqwest::Method::GET,
381 headers: http::header::HeaderMap::new(),
382 body: None,
383 meta: None,
384 }
385 }
386
387 /// Sets the HTTP method for the request.
388 ///
389 /// ## Example
390 ///
391 /// ```rust
392 /// use spider_util::request::Request;
393 /// use url::Url;
394 ///
395 /// let request = Request::new(Url::parse("https://example.com").unwrap())
396 /// .with_method(reqwest::Method::POST);
397 /// ```
398 pub fn with_method(mut self, method: reqwest::Method) -> Self {
399 self.method = method;
400 self
401 }
402
403 /// Adds a header to the request.
404 ///
405 /// # Errors
406 ///
407 /// Returns a [`SpiderError::HeaderValueError`] if the header name or value is invalid.
408 ///
409 /// ## Example
410 ///
411 /// ```rust
412 /// use spider_util::request::Request;
413 /// use url::Url;
414 ///
415 /// let request = Request::new(Url::parse("https://example.com").unwrap())
416 /// .with_header("Accept", "application/json")
417 /// .unwrap();
418 /// ```
419 pub fn with_header(mut self, name: &str, value: &str) -> Result<Self, SpiderError> {
420 let header_name =
421 reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|e| {
422 SpiderError::HeaderValueError(format!("Invalid header name '{}': {}", name, e))
423 })?;
424 let header_value = reqwest::header::HeaderValue::from_str(value).map_err(|e| {
425 SpiderError::HeaderValueError(format!("Invalid header value '{}': {}", value, e))
426 })?;
427
428 self.headers.insert(header_name, header_value);
429 Ok(self)
430 }
431
432 /// Sets the body of the request and defaults the method to POST.
433 ///
434 /// ## Example
435 ///
436 /// ```rust
437 /// use spider_util::request::{Request, Body};
438 /// use url::Url;
439 /// use serde_json::json;
440 ///
441 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
442 /// .with_body(Body::Json(json!({"key": "value"})));
443 /// ```
444 pub fn with_body(mut self, body: Body) -> Self {
445 self.body = Some(body);
446 self.with_method(reqwest::Method::POST)
447 }
448
449 /// Sets the body of the request to a JSON value and defaults the method to POST.
450 ///
451 /// ## Example
452 ///
453 /// ```rust
454 /// use spider_util::request::Request;
455 /// use url::Url;
456 /// use serde_json::json;
457 ///
458 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
459 /// .with_json(json!({"name": "test"}));
460 /// ```
461 pub fn with_json(self, json: serde_json::Value) -> Self {
462 self.with_body(Body::Json(json))
463 }
464
465 /// Sets the body of the request to form data and defaults the method to POST.
466 ///
467 /// ## Example
468 ///
469 /// ```rust
470 /// use spider_util::request::Request;
471 /// use url::Url;
472 /// use dashmap::DashMap;
473 ///
474 /// let mut form = DashMap::new();
475 /// form.insert("key".to_string(), "value".to_string());
476 ///
477 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
478 /// .with_form(form);
479 /// ```
480 pub fn with_form(self, form: DashMap<String, String>) -> Self {
481 self.with_body(Body::Form(form))
482 }
483
484 /// Sets the body of the request to raw bytes and defaults the method to POST.
485 ///
486 /// ## Example
487 ///
488 /// ```rust
489 /// use spider_util::request::Request;
490 /// use url::Url;
491 /// use bytes::Bytes;
492 ///
493 /// let data = Bytes::from("binary data");
494 /// let request = Request::new(Url::parse("https://api.example.com").unwrap())
495 /// .with_bytes(data);
496 /// ```
497 pub fn with_bytes(self, bytes: bytes::Bytes) -> Self {
498 self.with_body(Body::Bytes(bytes))
499 }
500
501 /// Adds a value to the request's metadata.
502 ///
503 /// Lazily allocates the metadata map on first use.
504 ///
505 /// ## Example
506 ///
507 /// ```rust
508 /// use spider_util::request::Request;
509 /// use url::Url;
510 /// use serde_json::json;
511 ///
512 /// let request = Request::new(Url::parse("https://example.com").unwrap())
513 /// .with_meta("priority", json!(1))
514 /// .with_meta("source", json!("manual"));
515 /// ```
516 pub fn with_meta(mut self, key: &str, value: serde_json::Value) -> Self {
517 self.meta
518 .get_or_insert_with(|| Arc::new(DashMap::new()))
519 .insert(key.to_string(), value);
520 self
521 }
522
523 /// Gets a reference to a metadata value, if it exists.
524 ///
525 /// Returns `None` if the key doesn't exist or if metadata hasn't been set.
526 pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
527 self.meta
528 .as_ref()
529 .and_then(|m| m.get(key).map(|e| e.value().clone()))
530 }
531
532 /// Returns `true` if the request has metadata.
533 pub fn has_meta(&self) -> bool {
534 self.meta.as_ref().is_some_and(|m| !m.is_empty())
535 }
536
537 /// Returns a reference to the internal metadata map, if it exists.
538 pub fn meta_map(&self) -> Option<&Arc<DashMap<String, serde_json::Value>>> {
539 self.meta.as_ref()
540 }
541
542 /// Inserts a value into metadata, creating the map if needed.
543 ///
544 /// This is intended for internal framework use.
545 pub fn insert_meta(&mut self, key: String, value: serde_json::Value) {
546 self.meta
547 .get_or_insert_with(|| Arc::new(DashMap::new()))
548 .insert(key, value);
549 }
550
551 /// Gets a value from metadata using DashMap's API.
552 ///
553 /// This is intended for internal framework use where direct access is needed.
554 pub fn get_meta_ref(&self, key: &str) -> Option<dashmap::mapref::one::Ref<'_, String, serde_json::Value>> {
555 self.meta.as_ref().and_then(|m| m.get(key))
556 }
557
558 /// Sets the metadata map directly.
559 ///
560 /// Used for internal framework operations.
561 pub fn set_meta_from_option(&mut self, meta: Option<Arc<DashMap<String, serde_json::Value>>>) {
562 self.meta = meta;
563 }
564
565 /// Clones the metadata map.
566 ///
567 /// Used for internal framework operations where metadata needs to be copied.
568 pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
569 self.meta.clone()
570 }
571
572 /// Takes the metadata map, leaving `None` in its place.
573 ///
574 /// Used for internal framework operations.
575 pub fn take_meta(&mut self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
576 self.meta.take()
577 }
578
579 /// Returns a reference to the metadata Arc for internal framework use.
580 pub fn meta_inner(&self) -> &Option<Arc<DashMap<String, serde_json::Value>>> {
581 &self.meta
582 }
583
584 const RETRY_ATTEMPTS_KEY: &str = "retry_attempts";
585
586 /// Gets the number of times the request has been retried.
587 ///
588 /// Returns `0` if no retry attempts have been recorded.
589 pub fn get_retry_attempts(&self) -> u32 {
590 self.meta
591 .as_ref()
592 .and_then(|m| m.get(Self::RETRY_ATTEMPTS_KEY))
593 .and_then(|v| v.value().as_u64())
594 .unwrap_or(0) as u32
595 }
596
597 /// Increments the retry count for the request.
598 ///
599 /// Lazily allocates the metadata map if not already present.
600 pub fn increment_retry_attempts(&mut self) {
601 let current_attempts = self.get_retry_attempts();
602 self.meta
603 .get_or_insert_with(|| Arc::new(DashMap::new()))
604 .insert(Self::RETRY_ATTEMPTS_KEY.to_string(), serde_json::Value::from(current_attempts + 1));
605 }
606
607 /// Generates a unique fingerprint for the request based on its URL, method, and body.
608 ///
609 /// The fingerprint is used for duplicate detection and caching. It combines:
610 /// - The request URL
611 /// - The HTTP method
612 /// - The request body (if present)
613 ///
614 /// ## Example
615 ///
616 /// ```rust
617 /// use spider_util::request::Request;
618 /// use url::Url;
619 ///
620 /// let request = Request::new(Url::parse("https://example.com").unwrap());
621 /// let fingerprint = request.fingerprint();
622 /// ```
623 pub fn fingerprint(&self) -> String {
624 let mut hasher = XxHash64::default();
625 hasher.write(self.url.as_str().as_bytes());
626 hasher.write(self.method.as_str().as_bytes());
627
628 if let Some(ref body) = self.body {
629 match body {
630 Body::Json(json_val) => {
631 if let Ok(serialized) = serde_json::to_string(json_val) {
632 hasher.write(serialized.as_bytes());
633 }
634 }
635 Body::Form(form_val) => {
636 // Optimized: hash components directly without building intermediate String
637 for r in form_val.iter() {
638 hasher.write(r.key().as_bytes());
639 hasher.write(r.value().as_bytes());
640 }
641 }
642 Body::Bytes(bytes_val) => {
643 hasher.write(bytes_val);
644 }
645 }
646 }
647 format!("{:x}", hasher.finish())
648 }
649}