Skip to main content

spider_util/
response.rs

1//! Data structures and utilities for handling HTTP responses in `spider-lib`.
2//!
3//! This module defines the [`Response`] struct, which represents an HTTP response
4//! received from a web server. It encapsulates crucial information such as
5//! the URL, status code, headers, and body of the response, along with any
6//! associated metadata.
7//!
8//! Additionally, this module provides:
9//! - Helper methods for [`Response`] to facilitate common tasks like parsing
10//!   the body as HTML or JSON, and reconstructing the original [`Request`]
11//! - The [`Link`] struct and the [`LinkType`] enum for structured representation and
12//!   extraction of hyperlinks found within the response content
13//!
14//! ## Example
15//!
16//! ```rust
17//! use spider_util::response::Response;
18//! use reqwest::StatusCode;
19//! use bytes::Bytes;
20//! use url::Url;
21//!
22//! // Create a response (typically done internally by the downloader)
23//! let response = Response {
24//!     url: Url::parse("https://example.com").unwrap(),
25//!     status: StatusCode::OK,
26//!     headers: http::header::HeaderMap::new(),
27//!     body: Bytes::from("<html><body>Hello</body></html>"),
28//!     request_url: Url::parse("https://example.com").unwrap(),
29//!     meta: None,
30//!     cached: false,
31//! };
32//!
33//! // Parse as HTML
34//! let html = response.to_html().unwrap();
35//!
36//! // Extract links from the response
37//! let links = response.links();
38//! ```
39
40use crate::request::Request;
41use crate::selector::get_cached_selector;
42use crate::util;
43use dashmap::{DashMap, DashSet};
44use linkify::{LinkFinder, LinkKind};
45use reqwest::StatusCode;
46use scraper::Html;
47use serde::de::DeserializeOwned;
48use serde_json;
49use std::{str::Utf8Error, str::from_utf8, sync::Arc};
50use url::Url;
51
/// Classifies a link discovered in a response body.
///
/// The category lets callers treat a link differently depending on the kind
/// of resource it points at.
///
/// ## Variants
///
/// - `Page`: another web page (typically an `<a>` tag)
/// - `Script`: a JavaScript file (`<script>` tag)
/// - `Stylesheet`: a CSS stylesheet (`<link rel="stylesheet">`)
/// - `Image`: an image (`<img>` tag)
/// - `Media`: an audio/video resource (`<audio>`, `<video>`, `<source>`)
/// - `Other`: any other resource, tagged with a free-form identifier
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum LinkType {
    /// Points at another web page.
    Page,
    /// Points at a script file.
    Script,
    /// Points at a stylesheet.
    Stylesheet,
    /// Points at an image.
    Image,
    /// Points at an audio or video resource.
    Media,
    /// Points at some other kind of resource; the string identifies which.
    Other(String),
}
80
81/// Represents a link discovered on a web page.
82///
83/// [`Link`] encapsulates both the URL and the type of a discovered link,
84/// enabling type-aware link processing during crawling.
85///
86/// ## Example
87///
88/// ```rust
89/// use spider_util::response::{Link, LinkType};
90/// use url::Url;
91///
92/// let link = Link {
93///     url: Url::parse("https://example.com/page").unwrap(),
94///     link_type: LinkType::Page,
95/// };
96/// ```
97#[derive(Debug, Clone, PartialEq, Eq, Hash)]
98pub struct Link {
99    /// The URL of the discovered link.
100    pub url: Url,
101    /// The type of the discovered link.
102    pub link_type: LinkType,
103}
104
105/// Represents an HTTP response received from a server.
106///
107/// [`Response`] contains all information about an HTTP response, including
108/// the final URL (after redirects), status code, headers, body content,
109/// and metadata carried over from the original request.
110///
111/// ## Example
112///
113/// ```rust
114/// use spider_util::response::Response;
115/// use reqwest::StatusCode;
116/// use bytes::Bytes;
117/// use url::Url;
118///
119/// let response = Response {
120///     url: Url::parse("https://example.com").unwrap(),
121///     status: StatusCode::OK,
122///     headers: http::header::HeaderMap::new(),
123///     body: Bytes::from("<html><body>Hello</body></html>"),
124///     request_url: Url::parse("https://example.com").unwrap(),
125///     meta: None,
126///     cached: false,
127/// };
128///
129/// // Parse the response body as HTML
130/// if let Ok(html) = response.to_html() {
131///     // Process HTML...
132/// }
133/// ```
134#[derive(Debug)]
135pub struct Response {
136    /// The final URL of the response after any redirects.
137    pub url: Url,
138    /// The HTTP status code of the response.
139    pub status: StatusCode,
140    /// The headers of the response.
141    pub headers: http::header::HeaderMap,
142    /// The body of the response.
143    pub body: bytes::Bytes,
144    /// The original URL of the request that led to this response.
145    pub request_url: Url,
146    /// Metadata associated with the response, carried over from the request.
147    /// Uses Option to allow lazy initialization.
148    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
149    /// Indicates if the response was served from a cache.
150    pub cached: bool,
151}
152
153impl Response {
154    /// Reconstructs the original [`Request`] that led to this response.
155    ///
156    /// This method creates a new [`Request`] with the same URL and metadata
157    /// as the request that produced this response. Useful for retry scenarios
158    /// or when you need to re-request the same resource.
159    ///
160    /// ## Example
161    ///
162    /// ```rust
163    /// # use spider_util::response::Response;
164    /// # use reqwest::StatusCode;
165    /// # use bytes::Bytes;
166    /// # use url::Url;
167    /// # let response = Response {
168    /// #     url: Url::parse("https://example.com").unwrap(),
169    /// #     status: StatusCode::OK,
170    /// #     headers: http::header::HeaderMap::new(),
171    /// #     body: Bytes::from("hello"),
172    /// #     request_url: Url::parse("https://example.com").unwrap(),
173    /// #     meta: None,
174    /// #     cached: false,
175    /// # };
176    /// let original_request = response.request_from_response();
177    /// ```
178    pub fn request_from_response(&self) -> Request {
179        let mut request = Request::new(self.request_url.clone());
180        request.set_meta_from_option(self.meta.clone());
181        request
182    }
183
184    /// Deserializes the response body as JSON.
185    ///
186    /// # Type Parameters
187    ///
188    /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
189    ///
190    /// # Errors
191    ///
192    /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
193    /// or if it cannot be deserialized into type `T`.
194    ///
195    /// ## Example
196    ///
197    /// ```rust
198    /// # use spider_util::response::Response;
199    /// # use reqwest::StatusCode;
200    /// # use bytes::Bytes;
201    /// # use url::Url;
202    /// # use serde::Deserialize;
203    /// # #[derive(Deserialize)]
204    /// # struct Data { value: String }
205    /// # let response = Response {
206    /// #     url: Url::parse("https://api.example.com").unwrap(),
207    /// #     status: StatusCode::OK,
208    /// #     headers: http::header::HeaderMap::new(),
209    /// #     body: Bytes::from(r#"{"value": "test"}"#),
210    /// #     request_url: Url::parse("https://api.example.com").unwrap(),
211    /// #     meta: None,
212    /// #     cached: false,
213    /// # };
214    /// let data: Data = response.json()?;
215    /// # Ok::<(), serde_json::Error>(())
216    /// ```
217    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
218        serde_json::from_slice(&self.body)
219    }
220
221    /// Parses the response body as HTML.
222    ///
223    /// Returns a [`scraper::Html`] document that can be queried using CSS selectors.
224    ///
225    /// # Errors
226    ///
227    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
228    ///
229    /// ## Example
230    ///
231    /// ```rust
232    /// # use spider_util::response::Response;
233    /// # use reqwest::StatusCode;
234    /// # use bytes::Bytes;
235    /// # use url::Url;
236    /// # let response = Response {
237    /// #     url: Url::parse("https://example.com").unwrap(),
238    /// #     status: StatusCode::OK,
239    /// #     headers: http::header::HeaderMap::new(),
240    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
241    /// #     request_url: Url::parse("https://example.com").unwrap(),
242    /// #     meta: None,
243    /// #     cached: false,
244    /// # };
245    /// let html = response.to_html()?;
246    /// # Ok::<(), std::str::Utf8Error>(())
247    /// ```
248    pub fn to_html(&self) -> Result<Html, Utf8Error> {
249        let body_str = from_utf8(&self.body)?;
250        Ok(Html::parse_document(body_str))
251    }
252
253    /// Lazily parses the response body as HTML.
254    ///
255    /// Returns a closure that can be called when the HTML is actually needed.
256    /// This avoids parsing HTML for responses where it may not be used.
257    ///
258    /// # Errors
259    ///
260    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
261    ///
262    /// ## Example
263    ///
264    /// ```rust
265    /// # use spider_util::response::Response;
266    /// # use reqwest::StatusCode;
267    /// # use bytes::Bytes;
268    /// # use url::Url;
269    /// # let response = Response {
270    /// #     url: Url::parse("https://example.com").unwrap(),
271    /// #     status: StatusCode::OK,
272    /// #     headers: http::header::HeaderMap::new(),
273    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
274    /// #     request_url: Url::parse("https://example.com").unwrap(),
275    /// #     meta: None,
276    /// #     cached: false,
277    /// # };
278    /// let html_fn = response.lazy_html()?;
279    /// // Parse HTML only when needed
280    /// let html = html_fn()?;
281    /// # Ok::<(), std::str::Utf8Error>(())
282    /// ```
283    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
284        let body_bytes = &self.body;
285        Ok(move || {
286            let body_str = from_utf8(body_bytes)?;
287            Ok(Html::parse_document(body_str))
288        })
289    }
290
291    /// Extracts all unique, same-site links from the response body.
292    ///
293    /// This method discovers links from:
294    /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
295    /// - URLs found in text content (using link detection)
296    ///
297    /// Only links pointing to the same site (same registered domain) are included.
298    ///
299    /// ## Returns
300    ///
301    /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
302    ///
303    /// ## Example
304    ///
305    /// ```rust
306    /// # use spider_util::response::Response;
307    /// # use reqwest::StatusCode;
308    /// # use bytes::Bytes;
309    /// # use url::Url;
310    /// # let response = Response {
311    /// #     url: Url::parse("https://example.com").unwrap(),
312    /// #     status: StatusCode::OK,
313    /// #     headers: http::header::HeaderMap::new(),
314    /// #     body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
315    /// #     request_url: Url::parse("https://example.com").unwrap(),
316    /// #     meta: None,
317    /// #     cached: false,
318    /// # };
319    /// let links = response.links();
320    /// for link in links.iter() {
321    ///     println!("Found {:?} link: {}", link.link_type, link.url);
322    /// }
323    /// ```
324    pub fn links(&self) -> DashSet<Link> {
325        let links = DashSet::new();
326
327        if let Ok(html_fn) = self.lazy_html()
328            && let Ok(html) = html_fn()
329        {
330            let selectors = vec![
331                ("a[href]", "href"),
332                ("link[href]", "href"),
333                ("script[src]", "src"),
334                ("img[src]", "src"),
335                ("audio[src]", "src"),
336                ("video[src]", "src"),
337                ("source[src]", "src"),
338            ];
339
340            for (selector_str, attr_name) in selectors {
341                if let Some(selector) = get_cached_selector(selector_str) {
342                    for element in html.select(&selector) {
343                        if let Some(attr_value) = element.value().attr(attr_name)
344                            && let Ok(url) = self.url.join(attr_value)
345                            && util::is_same_site(&url, &self.url)
346                        {
347                            let link_type = match element.value().name() {
348                                "a" => LinkType::Page,
349                                "link" => {
350                                    if let Some(rel) = element.value().attr("rel") {
351                                        if rel.eq_ignore_ascii_case("stylesheet") {
352                                            LinkType::Stylesheet
353                                        } else {
354                                            LinkType::Other(rel.to_string())
355                                        }
356                                    } else {
357                                        LinkType::Other("link".to_string())
358                                    }
359                                }
360                                "script" => LinkType::Script,
361                                "img" => LinkType::Image,
362                                "audio" | "video" | "source" => LinkType::Media,
363                                _ => LinkType::Other(element.value().name().to_string()),
364                            };
365                            links.insert(Link { url, link_type });
366                        }
367                    }
368                }
369            }
370
371            let finder = LinkFinder::new();
372            for text_node in html.tree.values().filter_map(|node| node.as_text()) {
373                for link in finder.links(text_node) {
374                    if link.kind() == &LinkKind::Url
375                        && let Ok(url) = self.url.join(link.as_str())
376                        && util::is_same_site(&url, &self.url)
377                    {
378                        links.insert(Link {
379                            url,
380                            link_type: LinkType::Page,
381                        });
382                    }
383                }
384            }
385        }
386
387        links
388    }
389}
390
391impl Clone for Response {
392    fn clone(&self) -> Self {
393        Response {
394            url: self.url.clone(),
395            status: self.status,
396            headers: self.headers.clone(),
397            body: self.body.clone(),
398            request_url: self.request_url.clone(),
399            meta: self.meta.clone(),
400            cached: self.cached,
401        }
402    }
403}