Skip to main content

spider_util/
response.rs

1//! Data structures and utilities for handling HTTP responses in `spider-lib`.
2//!
3//! This module defines the [`Response`] struct, which represents an HTTP response
4//! received from a web server. It encapsulates crucial information such as
5//! the URL, status code, headers, and body of the response, along with any
6//! associated metadata.
7//!
8//! Additionally, this module provides:
9//! - Helper methods for [`Response`] to facilitate common tasks like parsing
10//!   the body as HTML or JSON, and reconstructing the original [`Request`]
//! - The [`Link`] struct and [`LinkType`] enum for structured representation and
//!   extraction of hyperlinks found within the response content
13//!
14//! ## Example
15//!
16//! ```rust
17//! use spider_util::response::Response;
18//! use reqwest::StatusCode;
19//! use bytes::Bytes;
20//! use url::Url;
21//!
22//! // Create a response (typically done internally by the downloader)
23//! let response = Response {
24//!     url: Url::parse("https://example.com").unwrap(),
25//!     status: StatusCode::OK,
26//!     headers: http::header::HeaderMap::new(),
27//!     body: Bytes::from("<html><body>Hello</body></html>"),
28//!     request_url: Url::parse("https://example.com").unwrap(),
29//!     meta: None,
30//!     cached: false,
31//! };
32//!
33//! // Parse as HTML
34//! let html = response.to_html().unwrap();
35//!
36//! // Extract links from the response
37//! let links = response.links();
38//! ```
39
40use crate::request::Request;
41use crate::selector::get_cached_selector;
42use crate::util;
43use dashmap::{DashMap, DashSet};
44use linkify::{LinkFinder, LinkKind};
45use reqwest::StatusCode;
46use scraper::Html;
47use serde::de::DeserializeOwned;
48use serde_json;
49use std::{str::Utf8Error, str::from_utf8, sync::Arc};
50use url::Url;
51
/// Classifies a discovered link by the kind of resource it points to.
///
/// Crawlers can branch on the [`LinkType`] of a link to treat pages,
/// scripts, stylesheets, images and media differently.
///
/// ## Variants
///
/// - `Page`: another web page (typically found in `<a>` tags)
/// - `Script`: a JavaScript file (`<script>` tags)
/// - `Stylesheet`: a CSS stylesheet (`<link rel="stylesheet">`)
/// - `Image`: an image (`<img>` tags)
/// - `Media`: an audio or video resource (`<audio>`, `<video>`, `<source>`)
/// - `Other`: any other resource, labelled with a custom identifier
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum LinkType {
    /// Points to another web page.
    Page,
    /// Points to a script file.
    Script,
    /// Points to a stylesheet.
    Stylesheet,
    /// Points to an image.
    Image,
    /// Points to an audio or video resource.
    Media,
    /// Points to some other kind of resource; the string identifies which.
    Other(String),
}
80
81/// Represents a link discovered on a web page.
82///
83/// [`Link`] encapsulates both the URL and the type of a discovered link,
84/// enabling type-aware link processing during crawling.
85///
86/// ## Example
87///
88/// ```rust
89/// use spider_util::response::{Link, LinkType};
90/// use url::Url;
91///
92/// let link = Link {
93///     url: Url::parse("https://example.com/page").unwrap(),
94///     link_type: LinkType::Page,
95/// };
96/// ```
97#[derive(Debug, Clone, PartialEq, Eq, Hash)]
98pub struct Link {
99    /// The URL of the discovered link.
100    pub url: Url,
101    /// The type of the discovered link.
102    pub link_type: LinkType,
103}
104
105/// Represents an HTTP response received from a server.
106///
107/// [`Response`] contains all information about an HTTP response, including
108/// the final URL (after redirects), status code, headers, body content,
109/// and metadata carried over from the original request.
110///
111/// ## Example
112///
113/// ```rust
114/// use spider_util::response::Response;
115/// use reqwest::StatusCode;
116/// use bytes::Bytes;
117/// use url::Url;
118///
119/// let response = Response {
120///     url: Url::parse("https://example.com").unwrap(),
121///     status: StatusCode::OK,
122///     headers: http::header::HeaderMap::new(),
123///     body: Bytes::from("<html><body>Hello</body></html>"),
124///     request_url: Url::parse("https://example.com").unwrap(),
125///     meta: None,
126///     cached: false,
127/// };
128///
129/// // Parse the response body as HTML
130/// if let Ok(html) = response.to_html() {
131///     // Process HTML...
132/// }
133/// ```
134#[derive(Debug)]
135pub struct Response {
136    /// The final URL of the response after any redirects.
137    pub url: Url,
138    /// The HTTP status code of the response.
139    pub status: StatusCode,
140    /// The headers of the response.
141    pub headers: http::header::HeaderMap,
142    /// The body of the response.
143    pub body: bytes::Bytes,
144    /// The original URL of the request that led to this response.
145    pub request_url: Url,
146    /// Metadata associated with the response, carried over from the request.
147    /// Uses Option to allow lazy initialization.
148    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
149    /// Indicates if the response was served from a cache.
150    pub cached: bool,
151}
152
153
154impl Response {
155    /// Reconstructs the original [`Request`] that led to this response.
156    ///
157    /// This method creates a new [`Request`] with the same URL and metadata
158    /// as the request that produced this response. Useful for retry scenarios
159    /// or when you need to re-request the same resource.
160    ///
161    /// ## Example
162    ///
163    /// ```rust
164    /// # use spider_util::response::Response;
165    /// # use reqwest::StatusCode;
166    /// # use bytes::Bytes;
167    /// # use url::Url;
168    /// # let response = Response {
169    /// #     url: Url::parse("https://example.com").unwrap(),
170    /// #     status: StatusCode::OK,
171    /// #     headers: http::header::HeaderMap::new(),
172    /// #     body: Bytes::from("hello"),
173    /// #     request_url: Url::parse("https://example.com").unwrap(),
174    /// #     meta: None,
175    /// #     cached: false,
176    /// # };
177    /// let original_request = response.request_from_response();
178    /// ```
179    pub fn request_from_response(&self) -> Request {
180        let mut request = Request::new(self.request_url.clone());
181        request.set_meta_from_option(self.meta.clone());
182        request
183    }
184
185    /// Deserializes the response body as JSON.
186    ///
187    /// # Type Parameters
188    ///
189    /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
190    ///
191    /// # Errors
192    ///
193    /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
194    /// or if it cannot be deserialized into type `T`.
195    ///
196    /// ## Example
197    ///
198    /// ```rust
199    /// # use spider_util::response::Response;
200    /// # use reqwest::StatusCode;
201    /// # use bytes::Bytes;
202    /// # use url::Url;
203    /// # use serde::Deserialize;
204    /// # #[derive(Deserialize)]
205    /// # struct Data { value: String }
206    /// # let response = Response {
207    /// #     url: Url::parse("https://api.example.com").unwrap(),
208    /// #     status: StatusCode::OK,
209    /// #     headers: http::header::HeaderMap::new(),
210    /// #     body: Bytes::from(r#"{"value": "test"}"#),
211    /// #     request_url: Url::parse("https://api.example.com").unwrap(),
212    /// #     meta: None,
213    /// #     cached: false,
214    /// # };
215    /// let data: Data = response.json()?;
216    /// # Ok::<(), serde_json::Error>(())
217    /// ```
218    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
219        serde_json::from_slice(&self.body)
220    }
221
222    /// Parses the response body as HTML.
223    ///
224    /// Returns a [`scraper::Html`] document that can be queried using CSS selectors.
225    ///
226    /// # Errors
227    ///
228    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
229    ///
230    /// ## Example
231    ///
232    /// ```rust
233    /// # use spider_util::response::Response;
234    /// # use reqwest::StatusCode;
235    /// # use bytes::Bytes;
236    /// # use url::Url;
237    /// # let response = Response {
238    /// #     url: Url::parse("https://example.com").unwrap(),
239    /// #     status: StatusCode::OK,
240    /// #     headers: http::header::HeaderMap::new(),
241    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
242    /// #     request_url: Url::parse("https://example.com").unwrap(),
243    /// #     meta: None,
244    /// #     cached: false,
245    /// # };
246    /// let html = response.to_html()?;
247    /// # Ok::<(), std::str::Utf8Error>(())
248    /// ```
249    pub fn to_html(&self) -> Result<Html, Utf8Error> {
250        let body_str = from_utf8(&self.body)?;
251        Ok(Html::parse_document(body_str))
252    }
253
254    /// Lazily parses the response body as HTML.
255    ///
256    /// Returns a closure that can be called when the HTML is actually needed.
257    /// This avoids parsing HTML for responses where it may not be used.
258    ///
259    /// # Errors
260    ///
261    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
262    ///
263    /// ## Example
264    ///
265    /// ```rust
266    /// # use spider_util::response::Response;
267    /// # use reqwest::StatusCode;
268    /// # use bytes::Bytes;
269    /// # use url::Url;
270    /// # let response = Response {
271    /// #     url: Url::parse("https://example.com").unwrap(),
272    /// #     status: StatusCode::OK,
273    /// #     headers: http::header::HeaderMap::new(),
274    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
275    /// #     request_url: Url::parse("https://example.com").unwrap(),
276    /// #     meta: None,
277    /// #     cached: false,
278    /// # };
279    /// let html_fn = response.lazy_html()?;
280    /// // Parse HTML only when needed
281    /// let html = html_fn()?;
282    /// # Ok::<(), std::str::Utf8Error>(())
283    /// ```
284    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
285        let body_bytes = &self.body;
286        Ok(move || {
287            let body_str = from_utf8(body_bytes)?;
288            Ok(Html::parse_document(body_str))
289        })
290    }
291
292    /// Extracts all unique, same-site links from the response body.
293    ///
294    /// This method discovers links from:
295    /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
296    /// - URLs found in text content (using link detection)
297    ///
298    /// Only links pointing to the same site (same registered domain) are included.
299    ///
300    /// ## Returns
301    ///
302    /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
303    ///
304    /// ## Example
305    ///
306    /// ```rust
307    /// # use spider_util::response::Response;
308    /// # use reqwest::StatusCode;
309    /// # use bytes::Bytes;
310    /// # use url::Url;
311    /// # let response = Response {
312    /// #     url: Url::parse("https://example.com").unwrap(),
313    /// #     status: StatusCode::OK,
314    /// #     headers: http::header::HeaderMap::new(),
315    /// #     body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
316    /// #     request_url: Url::parse("https://example.com").unwrap(),
317    /// #     meta: None,
318    /// #     cached: false,
319    /// # };
320    /// let links = response.links();
321    /// for link in links.iter() {
322    ///     println!("Found {:?} link: {}", link.link_type, link.url);
323    /// }
324    /// ```
325    pub fn links(&self) -> DashSet<Link> {
326        let links = DashSet::new();
327
328        if let Ok(html_fn) = self.lazy_html()
329            && let Ok(html) = html_fn()
330        {
331            let selectors = vec![
332                ("a[href]", "href"),
333                ("link[href]", "href"),
334                ("script[src]", "src"),
335                ("img[src]", "src"),
336                ("audio[src]", "src"),
337                ("video[src]", "src"),
338                ("source[src]", "src"),
339            ];
340
341            for (selector_str, attr_name) in selectors {
342                if let Some(selector) = get_cached_selector(selector_str) {
343                    for element in html.select(&selector) {
344                        if let Some(attr_value) = element.value().attr(attr_name)
345                            && let Ok(url) = self.url.join(attr_value)
346                            && util::is_same_site(&url, &self.url)
347                        {
348                            let link_type = match element.value().name() {
349                                "a" => LinkType::Page,
350                                "link" => {
351                                    if let Some(rel) = element.value().attr("rel") {
352                                        if rel.eq_ignore_ascii_case("stylesheet") {
353                                            LinkType::Stylesheet
354                                        } else {
355                                            LinkType::Other(rel.to_string())
356                                        }
357                                    } else {
358                                        LinkType::Other("link".to_string())
359                                    }
360                                }
361                                "script" => LinkType::Script,
362                                "img" => LinkType::Image,
363                                "audio" | "video" | "source" => LinkType::Media,
364                                _ => LinkType::Other(element.value().name().to_string()),
365                            };
366                            links.insert(Link { url, link_type });
367                        }
368                    }
369                }
370            }
371
372            let finder = LinkFinder::new();
373            for text_node in html.tree.values().filter_map(|node| node.as_text()) {
374                for link in finder.links(text_node) {
375                    if link.kind() == &LinkKind::Url
376                        && let Ok(url) = self.url.join(link.as_str())
377                        && util::is_same_site(&url, &self.url)
378                    {
379                        links.insert(Link {
380                            url,
381                            link_type: LinkType::Page,
382                        });
383                    }
384                }
385            }
386        }
387
388        links
389    }
390}
391
392impl Clone for Response {
393    fn clone(&self) -> Self {
394        Response {
395            url: self.url.clone(),
396            status: self.status,
397            headers: self.headers.clone(),
398            body: self.body.clone(),
399            request_url: self.request_url.clone(),
400            meta: self.meta.clone(),
401            cached: self.cached,
402        }
403    }
404}