spider_util/request.rs
1//! Request types used by the crawler runtime.
2//!
3//! [`Request`] is the runtime's transport-neutral request model. It stores the
4//! URL, method, headers, optional body, and a lazily allocated metadata map used
5//! by middleware and runtime internals.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::request::{Body, Method, Request};
11//! use url::Url;
12//! use serde_json::json;
13//!
14//! // Create a simple GET request
15//! let url = Url::parse("https://example.com")?;
16//! let request = Request::new(url);
17//!
18//! // Parse the URL as part of request construction
19//! let parsed_request = Request::try_new("https://example.com")?;
20//!
21//! // Create a POST request with JSON body
22//! let post_request = Request::new(Url::parse("https://api.example.com/data")?)
23//! .with_method(Method::Post)
24//! .with_json(json!({"key": "value"}));
25//! # Ok::<(), Box<dyn std::error::Error>>(())
26//! ```
27
28use bytes::Bytes;
29use dashmap::DashMap;
30use http::header::HeaderMap;
31use reqwest::{Method as ReqwestMethod, Url};
32use serde::de::DeserializeOwned;
33use serde::{Deserialize, Serialize};
34use serde_json::Value;
35use std::collections::HashMap;
36use std::hash::Hasher;
37use std::str::FromStr;
38use std::sync::Arc;
39use twox_hash::XxHash64;
40
41use crate::error::SpiderError;
42
43/// Request body variants supported by the default downloader.
44///
45/// ## Example
46///
47/// ```rust,ignore
48/// use spider_util::request::Body;
49/// use serde_json::json;
50/// use dashmap::DashMap;
51/// use bytes::Bytes;
52///
53/// // JSON body
54/// let json_body = Body::Json(json!({"name": "test"}));
55///
56/// // Form data
57/// let mut form = DashMap::new();
58/// form.insert("key".to_string(), "value".to_string());
59/// let form_body = Body::Form(form);
60///
61/// // Raw bytes
62/// let bytes_body = Body::Bytes(Bytes::from("raw data"));
63/// ```
64#[derive(Debug, Clone)]
65pub enum Body {
66 /// JSON payload.
67 Json(serde_json::Value),
68 /// Form data (key-value pairs).
69 Form(DashMap<String, String>),
70 /// Raw binary data.
71 Bytes(Bytes),
72}
73
74// Custom serialization for Body enum
75impl Serialize for Body {
76 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
77 where
78 S: serde::Serializer,
79 {
80 use serde::ser::SerializeMap;
81 let mut map = serializer.serialize_map(Some(1))?;
82
83 match self {
84 Body::Json(value) => map.serialize_entry("Json", value)?,
85 Body::Form(dashmap) => {
86 let hmap: HashMap<String, String> = dashmap.clone().into_iter().collect();
87 map.serialize_entry("Form", &hmap)?
88 }
89 Body::Bytes(bytes) => map.serialize_entry("Bytes", bytes)?,
90 }
91
92 map.end()
93 }
94}
95
96impl<'de> Deserialize<'de> for Body {
97 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
98 where
99 D: serde::Deserializer<'de>,
100 {
101 use serde::de::{self, MapAccess, Visitor};
102 use std::fmt;
103
104 struct BodyVisitor;
105
106 impl<'de> Visitor<'de> for BodyVisitor {
107 type Value = Body;
108
109 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
110 formatter.write_str("a body object")
111 }
112
113 fn visit_map<V>(self, mut map: V) -> Result<Body, V::Error>
114 where
115 V: MapAccess<'de>,
116 {
117 let entry = map.next_entry::<String, Value>()?;
118 let (key, value) = match entry {
119 Some((k, v)) => (k, v),
120 None => return Err(de::Error::custom("Expected a body variant")),
121 };
122
123 match key.as_str() {
124 "Json" => Ok(Body::Json(value)),
125 "Form" => {
126 let form_data: HashMap<String, String> =
127 serde_json::from_value(value).map_err(de::Error::custom)?;
128 let dashmap = DashMap::new();
129 for (k, v) in form_data {
130 dashmap.insert(k, v);
131 }
132 Ok(Body::Form(dashmap))
133 }
134 "Bytes" => {
135 let bytes: Bytes =
136 serde_json::from_value(value).map_err(de::Error::custom)?;
137 Ok(Body::Bytes(bytes))
138 }
139 _ => Err(de::Error::custom(format!("Unknown body variant: {}", key))),
140 }
141 }
142 }
143
144 deserializer.deserialize_map(BodyVisitor)
145 }
146}
147
/// HTTP method carried by a [`Request`], independent of any HTTP client type.
///
/// The nine standard verbs have dedicated variants; anything else is held as
/// text in [`Method::Custom`]. The default is [`Method::Get`].
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub enum Method {
    /// The `GET` verb (the default).
    #[default]
    Get,
    /// The `POST` verb.
    Post,
    /// The `PUT` verb.
    Put,
    /// The `PATCH` verb.
    Patch,
    /// The `DELETE` verb.
    Delete,
    /// The `HEAD` verb.
    Head,
    /// The `OPTIONS` verb.
    Options,
    /// The `TRACE` verb.
    Trace,
    /// The `CONNECT` verb.
    Connect,
    /// A non-standard method, stored as its token text.
    Custom(String),
}
173
174impl Method {
175 /// Returns the wire-format method string.
176 pub fn as_str(&self) -> &str {
177 match self {
178 Method::Get => "GET",
179 Method::Post => "POST",
180 Method::Put => "PUT",
181 Method::Patch => "PATCH",
182 Method::Delete => "DELETE",
183 Method::Head => "HEAD",
184 Method::Options => "OPTIONS",
185 Method::Trace => "TRACE",
186 Method::Connect => "CONNECT",
187 Method::Custom(method) => method.as_str(),
188 }
189 }
190}
191
192impl std::fmt::Display for Method {
193 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
194 f.write_str(self.as_str())
195 }
196}
197
198impl FromStr for Method {
199 type Err = String;
200
201 fn from_str(method: &str) -> Result<Self, Self::Err> {
202 let parsed = ReqwestMethod::from_bytes(method.as_bytes()).map_err(|err| err.to_string())?;
203
204 Ok(match parsed.as_str() {
205 "GET" => Method::Get,
206 "POST" => Method::Post,
207 "PUT" => Method::Put,
208 "PATCH" => Method::Patch,
209 "DELETE" => Method::Delete,
210 "HEAD" => Method::Head,
211 "OPTIONS" => Method::Options,
212 "TRACE" => Method::Trace,
213 "CONNECT" => Method::Connect,
214 other => Method::Custom(other.to_string()),
215 })
216 }
217}
218
219impl From<Method> for ReqwestMethod {
220 fn from(method: Method) -> Self {
221 match method {
222 Method::Get => ReqwestMethod::GET,
223 Method::Post => ReqwestMethod::POST,
224 Method::Put => ReqwestMethod::PUT,
225 Method::Patch => ReqwestMethod::PATCH,
226 Method::Delete => ReqwestMethod::DELETE,
227 Method::Head => ReqwestMethod::HEAD,
228 Method::Options => ReqwestMethod::OPTIONS,
229 Method::Trace => ReqwestMethod::TRACE,
230 Method::Connect => ReqwestMethod::CONNECT,
231 Method::Custom(method) => {
232 ReqwestMethod::from_bytes(method.as_bytes()).expect("custom method validated")
233 }
234 }
235 }
236}
237
238/// Outgoing HTTP request used by the crawler runtime.
239///
240/// [`Request`] is the handoff type between spiders, middleware, the scheduler,
241/// and the downloader. It is transport-neutral enough to be shared across the
242/// workspace, but expressive enough for custom methods, headers, bodies, and
243/// request-scoped metadata.
244///
245/// ## Example
246///
247/// ```rust,ignore
248/// use spider_util::request::{Method, Request};
249/// use url::Url;
250///
251/// // Create a basic GET request
252/// let request = Request::new(Url::parse("https://example.com")?);
253///
254/// // Or parse a string into a request directly
255/// let request = Request::try_new("https://example.com")?;
256///
257/// // Build a request with headers and method
258/// let post_request = Request::new(Url::parse("https://api.example.com")?)
259/// .with_method(Method::Post)
260/// .with_header("Accept", "application/json")
261/// ?;
262/// # Ok::<(), Box<dyn std::error::Error>>(())
263/// ```
264#[derive(Debug, Clone)]
265pub struct Request {
266 /// The target URL for this request.
267 pub url: Url,
268 /// Request scheduling priority.
269 ///
270 /// Higher values are dequeued before lower values.
271 pub priority: i32,
272 /// The HTTP method (GET, POST, etc.).
273 pub method: Method,
274 /// HTTP headers for the request.
275 pub headers: http::header::HeaderMap,
276 /// Optional request body.
277 pub body: Option<Body>,
278 /// Lazy-initialized metadata - only allocated when actually used.
279 /// This reduces memory allocation for simple requests without metadata.
280 meta: Option<Arc<DashMap<String, Value>>>,
281}
282
283// Custom serialization for Request struct
284impl Serialize for Request {
285 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
286 where
287 S: serde::Serializer,
288 {
289 use serde::ser::SerializeStruct;
290 // Convert HeaderMap to a serializable format
291 let headers_vec: Vec<(String, String)> = self
292 .headers
293 .iter()
294 .filter_map(|(name, value)| {
295 value
296 .to_str()
297 .ok()
298 .map(|val_str| (name.as_str().to_string(), val_str.to_string()))
299 })
300 .collect();
301
302 let mut s = serializer.serialize_struct("Request", 6)?;
303 s.serialize_field("url", &self.url.as_str())?;
304 s.serialize_field("priority", &self.priority)?;
305 s.serialize_field("method", &self.method.as_str())?;
306 s.serialize_field("headers", &headers_vec)?;
307 s.serialize_field("body", &self.body)?;
308 // Serialize meta as empty HashMap if None (for backward compatibility)
309 let meta_map: HashMap<String, Value> = self
310 .meta
311 .as_ref()
312 .map(|m| {
313 m.iter()
314 .map(|e| (e.key().clone(), e.value().clone()))
315 .collect()
316 })
317 .unwrap_or_default();
318 s.serialize_field("meta", &meta_map)?;
319 s.end()
320 }
321}
322
323impl<'de> Deserialize<'de> for Request {
324 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
325 where
326 D: serde::Deserializer<'de>,
327 {
328 use serde::de::{self, MapAccess, Visitor};
329 use std::fmt;
330
331 #[derive(Deserialize)]
332 #[serde(field_identifier, rename_all = "lowercase")]
333 enum Field {
334 Url,
335 Priority,
336 Method,
337 Headers,
338 Body,
339 Meta,
340 }
341
342 struct RequestVisitor;
343
344 impl<'de> Visitor<'de> for RequestVisitor {
345 type Value = Request;
346
347 fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
348 formatter.write_str("struct Request")
349 }
350
351 fn visit_map<V>(self, mut map: V) -> Result<Request, V::Error>
352 where
353 V: MapAccess<'de>,
354 {
355 let mut url = None;
356 let mut priority = None;
357 let mut method = None;
358 let mut headers = None;
359 let mut body = None;
360 let mut meta = None;
361
362 while let Some(key) = map.next_key()? {
363 match key {
364 Field::Url => {
365 if url.is_some() {
366 return Err(de::Error::duplicate_field("url"));
367 }
368 let url_str: String = map.next_value()?;
369 let parsed_url = Url::parse(&url_str).map_err(de::Error::custom)?;
370 url = Some(parsed_url);
371 }
372 Field::Priority => {
373 if priority.is_some() {
374 return Err(de::Error::duplicate_field("priority"));
375 }
376 priority = Some(map.next_value()?);
377 }
378 Field::Method => {
379 if method.is_some() {
380 return Err(de::Error::duplicate_field("method"));
381 }
382 let method_str: String = map.next_value()?;
383 let parsed_method =
384 Method::from_str(&method_str).map_err(de::Error::custom)?;
385 method = Some(parsed_method);
386 }
387 Field::Headers => {
388 if headers.is_some() {
389 return Err(de::Error::duplicate_field("headers"));
390 }
391 // Deserialize headers vector and convert back to HeaderMap
392 let headers_vec: Vec<(String, String)> = map.next_value()?;
393 let mut header_map = HeaderMap::new();
394 for (name, value) in headers_vec {
395 if let Ok(header_name) =
396 http::header::HeaderName::from_bytes(name.as_bytes())
397 && let Ok(header_value) =
398 http::header::HeaderValue::from_str(&value)
399 {
400 header_map.insert(header_name, header_value);
401 }
402 }
403 headers = Some(header_map);
404 }
405 Field::Body => {
406 if body.is_some() {
407 return Err(de::Error::duplicate_field("body"));
408 }
409 body = map.next_value()?;
410 }
411 Field::Meta => {
412 // Deserialize meta HashMap and convert to DashMap
413 let meta_map: HashMap<String, Value> = map.next_value()?;
414 if !meta_map.is_empty() {
415 let dashmap = DashMap::new();
416 for (k, v) in meta_map {
417 dashmap.insert(k, v);
418 }
419 meta = Some(Arc::new(dashmap));
420 }
421 }
422 }
423 }
424
425 let url = url.ok_or_else(|| de::Error::missing_field("url"))?;
426 let priority = priority.unwrap_or(0);
427 let method = method.ok_or_else(|| de::Error::missing_field("method"))?;
428 let headers = headers.ok_or_else(|| de::Error::missing_field("headers"))?;
429 let body = body; // Optional field
430
431 Ok(Request {
432 url,
433 priority,
434 method,
435 headers,
436 body,
437 meta, // May be None if no meta was serialized
438 })
439 }
440 }
441
442 const FIELDS: &[&str] = &["url", "priority", "method", "headers", "body", "meta"];
443 deserializer.deserialize_struct("Request", FIELDS, RequestVisitor)
444 }
445}
446
447impl Default for Request {
448 fn default() -> Self {
449 let default_url = match Url::parse("http://default.invalid") {
450 Ok(url) => url,
451 Err(err) => panic!("invalid hardcoded default URL: {}", err),
452 };
453 Self {
454 url: default_url,
455 priority: 0,
456 method: Method::Get,
457 headers: http::header::HeaderMap::new(),
458 body: None,
459 meta: None, // Lazy initialization - no allocation until needed
460 }
461 }
462}
463
464impl Request {
465 /// Creates a new [`Request`] with the given URL.
466 ///
467 /// This is the most common constructor used by spiders when enqueueing
468 /// follow-up pages. It does not allocate metadata storage unless
469 /// [`with_meta`](Request::with_meta) is called.
470 ///
471 /// ## Example
472 ///
473 /// ```rust,ignore
474 /// use spider_util::request::{Method, Request};
475 /// use url::Url;
476 ///
477 /// let request = Request::new(Url::parse("https://example.com")?);
478 /// # Ok::<(), Box<dyn std::error::Error>>(())
479 /// ```
480 pub fn new(url: Url) -> Self {
481 Request {
482 url,
483 priority: 0,
484 method: Method::Get,
485 headers: http::header::HeaderMap::new(),
486 body: None,
487 meta: None,
488 }
489 }
490
491 /// Creates a new [`Request`] from any value that can be converted into a [`Url`].
492 ///
493 /// This is a fallible companion to [`Request::new`] for callers that want to
494 /// pass URL strings directly.
495 ///
496 /// ## Example
497 ///
498 /// ```rust,ignore
499 /// use spider_util::request::Request;
500 ///
501 /// let request = Request::try_new("https://example.com")?;
502 /// # Ok::<(), Box<dyn std::error::Error>>(())
503 /// ```
504 pub fn try_new<U>(url: U) -> Result<Self, SpiderError>
505 where
506 U: TryInto<Url>,
507 U::Error: Into<url::ParseError>,
508 {
509 let url = url
510 .try_into()
511 .map_err(|e| SpiderError::UrlParseError(e.into()))?;
512 Ok(Self::new(url))
513 }
514
515 /// Sets the HTTP method for the request.
516 ///
517 /// Use this together with one of the body helpers for POST, PUT, or PATCH
518 /// workflows.
519 ///
520 /// ## Example
521 ///
522 /// ```rust,ignore
523 /// use spider_util::request::Request;
524 /// use url::Url;
525 ///
526 /// let request = Request::new(Url::parse("https://example.com")?)
527 /// .with_method(Method::Post);
528 /// # Ok::<(), Box<dyn std::error::Error>>(())
529 /// ```
530 pub fn with_method(mut self, method: Method) -> Self {
531 self.method = method;
532 self
533 }
534
535 /// Sets the scheduling priority for the request.
536 ///
537 /// Higher values are scheduled before lower values. Requests with the same
538 /// priority retain FIFO ordering.
539 pub fn with_priority(mut self, priority: i32) -> Self {
540 self.priority = priority;
541 self
542 }
543
544 /// Returns the scheduling priority for the request.
545 pub fn priority(&self) -> i32 {
546 self.priority
547 }
548
549 /// Adds a header to the request.
550 ///
551 /// Accepts any types that can be converted into [`reqwest::header::HeaderName`]
552 /// and [`reqwest::header::HeaderValue`], including `&str`, `String`, and
553 /// standard header constants such as [`http::header::CONTENT_TYPE`].
554 ///
555 /// Returns an error if the header name or value is invalid.
556 ///
557 /// # Errors
558 ///
559 /// Returns a [`SpiderError::HeaderValueError`] if the header name or value is invalid.
560 ///
561 /// ## Example
562 ///
563 /// ```rust,ignore
564 /// use spider_util::request::Request;
565 /// use url::Url;
566 ///
567 /// let request = Request::new(Url::parse("https://example.com")?)
568 /// .with_header(http::header::ACCEPT, "application/json")
569 /// ?
570 /// .with_header("X-Trace-Id".to_string(), "abc-123".to_string())?;
571 /// # Ok::<(), Box<dyn std::error::Error>>(())
572 /// ```
573 pub fn with_header<N, V>(mut self, name: N, value: V) -> Result<Self, SpiderError>
574 where
575 N: TryInto<reqwest::header::HeaderName>,
576 N::Error: std::fmt::Display,
577 V: TryInto<reqwest::header::HeaderValue>,
578 V::Error: std::fmt::Display,
579 {
580 let header_name = name
581 .try_into()
582 .map_err(|e| SpiderError::HeaderValueError(format!("Invalid header name: {}", e)))?;
583 let header_value = value
584 .try_into()
585 .map_err(|e| SpiderError::HeaderValueError(format!("Invalid header value: {}", e)))?;
586
587 self.headers.insert(header_name, header_value);
588 Ok(self)
589 }
590
591 /// Sets the body of the request and defaults the method to POST.
592 ///
593 /// ## Example
594 ///
595 /// ```rust,ignore
596 /// use spider_util::request::{Request, Body};
597 /// use url::Url;
598 /// use serde_json::json;
599 ///
600 /// let request = Request::new(Url::parse("https://api.example.com")?)
601 /// .with_body(Body::Json(json!({"key": "value"})));
602 /// # Ok::<(), Box<dyn std::error::Error>>(())
603 /// ```
604 pub fn with_body(mut self, body: Body) -> Self {
605 self.body = Some(body);
606 self.with_method(Method::Post)
607 }
608
609 /// Sets the body of the request to a JSON value and defaults the method to POST.
610 ///
611 /// This helper stores the payload body only. Add content-type headers
612 /// explicitly when the target service expects them.
613 ///
614 /// ## Example
615 ///
616 /// ```rust,ignore
617 /// use spider_util::request::Request;
618 /// use url::Url;
619 /// use serde_json::json;
620 ///
621 /// let request = Request::new(Url::parse("https://api.example.com")?)
622 /// .with_json(json!({"name": "test"}));
623 /// # Ok::<(), Box<dyn std::error::Error>>(())
624 /// ```
625 pub fn with_json(self, json: serde_json::Value) -> Self {
626 self.with_body(Body::Json(json))
627 }
628
629 /// Sets the body of the request to form data and defaults the method to POST.
630 ///
631 /// ## Example
632 ///
633 /// ```rust,ignore
634 /// use spider_util::request::Request;
635 /// use url::Url;
636 /// use dashmap::DashMap;
637 ///
638 /// let mut form = DashMap::new();
639 /// form.insert("key".to_string(), "value".to_string());
640 ///
641 /// let request = Request::new(Url::parse("https://api.example.com")?)
642 /// .with_form(form);
643 /// # Ok::<(), Box<dyn std::error::Error>>(())
644 /// ```
645 pub fn with_form(self, form: DashMap<String, String>) -> Self {
646 self.with_body(Body::Form(form))
647 }
648
649 /// Sets the body of the request to raw bytes and defaults the method to POST.
650 ///
651 /// ## Example
652 ///
653 /// ```rust,ignore
654 /// use spider_util::request::Request;
655 /// use url::Url;
656 /// use bytes::Bytes;
657 ///
658 /// let data = Bytes::from("binary data");
659 /// let request = Request::new(Url::parse("https://api.example.com")?)
660 /// .with_bytes(data);
661 /// # Ok::<(), Box<dyn std::error::Error>>(())
662 /// ```
663 pub fn with_bytes(self, bytes: bytes::Bytes) -> Self {
664 self.with_body(Body::Bytes(bytes))
665 }
666
667 /// Adds a value to the request's metadata.
668 ///
669 /// Lazily allocates the metadata map on first use. Metadata is commonly
670 /// used to carry crawl context such as pagination state, source URLs, or
671 /// retry bookkeeping across middleware and parsing stages.
672 ///
673 /// ## Example
674 ///
675 /// ```rust,ignore
676 /// use spider_util::request::Request;
677 /// use url::Url;
678 ///
679 /// let request = Request::new(Url::parse("https://example.com")?)
680 /// .with_priority(10)
681 /// .with_meta("source", serde_json::json!("manual"));
682 /// # Ok::<(), Box<dyn std::error::Error>>(())
683 /// ```
684 pub fn with_meta(mut self, key: impl Into<String>, value: serde_json::Value) -> Self {
685 self.meta
686 .get_or_insert_with(|| Arc::new(DashMap::new()))
687 .insert(key.into(), value);
688 self
689 }
690
691 /// Serializes and stores a metadata value under the provided key.
692 ///
693 /// This is a convenient typed companion to [`Request::with_meta`] that
694 /// avoids manual `serde_json::json!(...)` calls for structured metadata.
695 pub fn with_meta_value<T>(self, key: impl Into<String>, value: T) -> Result<Self, SpiderError>
696 where
697 T: Serialize,
698 {
699 Ok(self.with_meta(key, serde_json::to_value(value)?))
700 }
701
702 /// Serializes and stores a metadata value only when it is present.
703 pub fn with_optional_meta_value<T>(
704 self,
705 key: impl Into<String>,
706 value: Option<T>,
707 ) -> Result<Self, SpiderError>
708 where
709 T: Serialize,
710 {
711 match value {
712 Some(value) => self.with_meta_value(key, value),
713 None => Ok(self),
714 }
715 }
716
717 /// Gets a reference to a metadata value, if it exists.
718 ///
719 /// Returns a cloned JSON value because metadata is stored in a shared
720 /// concurrent map. Returns `None` if the key doesn't exist or if metadata
721 /// hasn't been set.
722 pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
723 self.meta
724 .as_ref()
725 .and_then(|m| m.get(key).map(|e| e.value().clone()))
726 }
727
728 /// Deserializes a metadata value into the requested type.
729 pub fn meta_value<T>(&self, key: &str) -> Result<Option<T>, SpiderError>
730 where
731 T: DeserializeOwned,
732 {
733 self.get_meta(key)
734 .map(serde_json::from_value)
735 .transpose()
736 .map_err(SpiderError::from)
737 }
738
739 /// Returns `true` if the request has metadata.
740 pub fn has_meta(&self) -> bool {
741 self.meta.as_ref().is_some_and(|m| !m.is_empty())
742 }
743
744 /// Returns a reference to the internal metadata map, if it exists.
745 pub fn meta_map(&self) -> Option<&Arc<DashMap<String, serde_json::Value>>> {
746 self.meta.as_ref()
747 }
748
749 /// Inserts a value into metadata, creating the map if needed.
750 ///
751 /// This is intended for internal framework use.
752 pub fn insert_meta(&mut self, key: impl Into<String>, value: serde_json::Value) {
753 self.meta
754 .get_or_insert_with(|| Arc::new(DashMap::new()))
755 .insert(key.into(), value);
756 }
757
758 /// Serializes and inserts a metadata value for internal or incremental use.
759 pub fn insert_meta_value<T>(
760 &mut self,
761 key: impl Into<String>,
762 value: T,
763 ) -> Result<(), SpiderError>
764 where
765 T: Serialize,
766 {
767 self.insert_meta(key, serde_json::to_value(value)?);
768 Ok(())
769 }
770
771 /// Removes a metadata entry by key, returning the stored JSON value if any.
772 pub fn remove_meta(&mut self, key: &str) -> Option<serde_json::Value> {
773 self.meta
774 .as_ref()
775 .and_then(|meta| meta.remove(key).map(|(_, value)| value))
776 }
777
778 /// Gets a value from metadata using DashMap's API.
779 ///
780 /// This is intended for internal framework use where direct access is needed.
781 pub fn get_meta_ref(
782 &self,
783 key: &str,
784 ) -> Option<dashmap::mapref::one::Ref<'_, String, serde_json::Value>> {
785 self.meta.as_ref().and_then(|m| m.get(key))
786 }
787
788 /// Sets the metadata map directly.
789 ///
790 /// Used for internal framework operations.
791 pub fn set_meta_from_option(&mut self, meta: Option<Arc<DashMap<String, serde_json::Value>>>) {
792 self.meta = meta;
793 }
794
795 /// Clones the metadata map.
796 ///
797 /// Used for internal framework operations where metadata needs to be copied.
798 pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
799 self.meta.clone()
800 }
801
802 /// Takes the metadata map, leaving `None` in its place.
803 ///
804 /// Used for internal framework operations.
805 pub fn take_meta(&mut self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
806 self.meta.take()
807 }
808
809 /// Returns a reference to the metadata Arc for internal framework use.
810 pub fn meta_inner(&self) -> &Option<Arc<DashMap<String, serde_json::Value>>> {
811 &self.meta
812 }
813
/// Metadata key under which the retry counter is stored.
const RETRY_ATTEMPTS_KEY: &str = "retry_attempts";
815
816 /// Gets the number of times the request has been retried.
817 ///
818 /// Returns `0` if no retry attempts have been recorded.
819 pub fn get_retry_attempts(&self) -> u32 {
820 self.meta
821 .as_ref()
822 .and_then(|m| m.get(Self::RETRY_ATTEMPTS_KEY))
823 .and_then(|v| v.value().as_u64())
824 .unwrap_or(0) as u32
825 }
826
827 /// Increments the retry count for the request.
828 ///
829 /// Lazily allocates the metadata map if not already present.
830 pub fn increment_retry_attempts(&mut self) {
831 let current_attempts = self.get_retry_attempts();
832 self.meta
833 .get_or_insert_with(|| Arc::new(DashMap::new()))
834 .insert(
835 Self::RETRY_ATTEMPTS_KEY.to_string(),
836 serde_json::Value::from(current_attempts + 1),
837 );
838 }
839
840 /// Generates a unique fingerprint for the request based on its URL, method, and body.
841 ///
842 /// This is the stable identity used by runtime deduplication and related
843 /// components that need to recognize equivalent requests.
844 ///
845 /// The fingerprint is used for duplicate detection and caching. It combines:
846 /// - The request URL
847 /// - The HTTP method
848 /// - The request body (if present)
849 ///
850 /// ## Example
851 ///
852 /// ```rust,ignore
853 /// use spider_util::request::Request;
854 /// use url::Url;
855 ///
856 /// let request = Request::new(Url::parse("https://example.com")?);
857 /// let fingerprint = request.fingerprint();
858 /// # Ok::<(), Box<dyn std::error::Error>>(())
859 /// ```
860 pub fn fingerprint(&self) -> String {
861 let mut hasher = XxHash64::default();
862 hasher.write(self.url.as_str().as_bytes());
863 hasher.write(self.method.as_str().as_bytes());
864
865 if let Some(ref body) = self.body {
866 match body {
867 Body::Json(json_val) => {
868 if let Ok(serialized) = serde_json::to_string(json_val) {
869 hasher.write(serialized.as_bytes());
870 }
871 }
872 Body::Form(form_val) => {
873 // Optimized: hash components directly without building intermediate String
874 for r in form_val.iter() {
875 hasher.write(r.key().as_bytes());
876 hasher.write(r.value().as_bytes());
877 }
878 }
879 Body::Bytes(bytes_val) => {
880 hasher.write(bytes_val);
881 }
882 }
883 }
884 format!("{:x}", hasher.finish())
885 }
886}