spider_util/response.rs
1//! Data structures and utilities for handling HTTP responses in `spider-lib`.
2//!
3//! This module defines the [`Response`] struct, which represents an HTTP response
4//! received from a web server. It encapsulates crucial information such as
5//! the URL, status code, headers, and body of the response, along with any
6//! associated metadata.
7//!
8//! Additionally, this module provides:
9//! - Helper methods for [`Response`] to facilitate common tasks like parsing
10//! the body as HTML or JSON, and reconstructing the original [`Request`]
11//! - The [`Link`] struct and the [`LinkType`] enum for structured representation and extraction
12//! of hyperlinks found within the response content
13//!
14//! ## Example
15//!
16//! ```rust
17//! use spider_util::response::Response;
18//! use reqwest::StatusCode;
19//! use bytes::Bytes;
20//! use url::Url;
21//!
22//! // Create a response (typically done internally by the downloader)
23//! let response = Response {
24//! url: Url::parse("https://example.com").unwrap(),
25//! status: StatusCode::OK,
26//! headers: http::header::HeaderMap::new(),
27//! body: Bytes::from("<html><body>Hello</body></html>"),
28//! request_url: Url::parse("https://example.com").unwrap(),
29//! meta: None,
30//! cached: false,
31//! };
32//!
33//! // Parse as HTML
34//! let html = response.to_html().unwrap();
35//!
36//! // Extract links from the response
37//! let links = response.links();
38//! ```
39
40use crate::request::Request;
41use crate::selector::get_cached_selector;
42use crate::util;
43use dashmap::{DashMap, DashSet};
44use linkify::{LinkFinder, LinkKind};
45use reqwest::StatusCode;
46use scraper::Html;
47use serde::de::DeserializeOwned;
48use serde_json;
49use std::{str::Utf8Error, str::from_utf8, sync::Arc};
50use url::Url;
51
/// Classification of a link discovered in a response.
///
/// The variant records what kind of resource a [`Link`] points at, so callers
/// can treat pages, scripts, stylesheets, images and media differently.
///
/// ## Variants
///
/// - `Page`: another web page (typically an `<a>` tag)
/// - `Script`: a JavaScript file (`<script>` tag)
/// - `Stylesheet`: a CSS stylesheet (`<link rel="stylesheet">`)
/// - `Image`: an image (`<img>` tag)
/// - `Media`: an audio/video resource (`<audio>`, `<video>`, `<source>`)
/// - `Other`: anything else, tagged with a free-form identifier
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// Points at another web page.
    Page,
    /// Points at a script file.
    Script,
    /// Points at a stylesheet.
    Stylesheet,
    /// Points at an image.
    Image,
    /// Points at an audio or video resource.
    Media,
    /// Points at some other kind of resource; the string identifies which.
    Other(String),
}
80
81/// Represents a link discovered on a web page.
82///
83/// [`Link`] encapsulates both the URL and the type of a discovered link,
84/// enabling type-aware link processing during crawling.
85///
86/// ## Example
87///
88/// ```rust
89/// use spider_util::response::{Link, LinkType};
90/// use url::Url;
91///
92/// let link = Link {
93/// url: Url::parse("https://example.com/page").unwrap(),
94/// link_type: LinkType::Page,
95/// };
96/// ```
97#[derive(Debug, Clone, PartialEq, Eq, Hash)]
98pub struct Link {
99 /// The URL of the discovered link.
100 pub url: Url,
101 /// The type of the discovered link.
102 pub link_type: LinkType,
103}
104
105/// Represents an HTTP response received from a server.
106///
107/// [`Response`] contains all information about an HTTP response, including
108/// the final URL (after redirects), status code, headers, body content,
109/// and metadata carried over from the original request.
110///
111/// ## Example
112///
113/// ```rust
114/// use spider_util::response::Response;
115/// use reqwest::StatusCode;
116/// use bytes::Bytes;
117/// use url::Url;
118///
119/// let response = Response {
120/// url: Url::parse("https://example.com").unwrap(),
121/// status: StatusCode::OK,
122/// headers: http::header::HeaderMap::new(),
123/// body: Bytes::from("<html><body>Hello</body></html>"),
124/// request_url: Url::parse("https://example.com").unwrap(),
125/// meta: None,
126/// cached: false,
127/// };
128///
129/// // Parse the response body as HTML
130/// if let Ok(html) = response.to_html() {
131/// // Process HTML...
132/// }
133/// ```
134#[derive(Debug)]
135pub struct Response {
136 /// The final URL of the response after any redirects.
137 pub url: Url,
138 /// The HTTP status code of the response.
139 pub status: StatusCode,
140 /// The headers of the response.
141 pub headers: http::header::HeaderMap,
142 /// The body of the response.
143 pub body: bytes::Bytes,
144 /// The original URL of the request that led to this response.
145 pub request_url: Url,
146 /// Metadata associated with the response, carried over from the request.
147 /// Uses Option to allow lazy initialization.
148 pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
149 /// Indicates if the response was served from a cache.
150 pub cached: bool,
151}
152
153impl Response {
154 /// Reconstructs the original [`Request`] that led to this response.
155 ///
156 /// This method creates a new [`Request`] with the same URL and metadata
157 /// as the request that produced this response. Useful for retry scenarios
158 /// or when you need to re-request the same resource.
159 ///
160 /// ## Example
161 ///
162 /// ```rust
163 /// # use spider_util::response::Response;
164 /// # use reqwest::StatusCode;
165 /// # use bytes::Bytes;
166 /// # use url::Url;
167 /// # let response = Response {
168 /// # url: Url::parse("https://example.com").unwrap(),
169 /// # status: StatusCode::OK,
170 /// # headers: http::header::HeaderMap::new(),
171 /// # body: Bytes::from("hello"),
172 /// # request_url: Url::parse("https://example.com").unwrap(),
173 /// # meta: None,
174 /// # cached: false,
175 /// # };
176 /// let original_request = response.request_from_response();
177 /// ```
178 pub fn request_from_response(&self) -> Request {
179 let mut request = Request::new(self.request_url.clone());
180 request.set_meta_from_option(self.meta.clone());
181 request
182 }
183
184 /// Deserializes the response body as JSON.
185 ///
186 /// # Type Parameters
187 ///
188 /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
189 ///
190 /// # Errors
191 ///
192 /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
193 /// or if it cannot be deserialized into type `T`.
194 ///
195 /// ## Example
196 ///
197 /// ```rust
198 /// # use spider_util::response::Response;
199 /// # use reqwest::StatusCode;
200 /// # use bytes::Bytes;
201 /// # use url::Url;
202 /// # use serde::Deserialize;
203 /// # #[derive(Deserialize)]
204 /// # struct Data { value: String }
205 /// # let response = Response {
206 /// # url: Url::parse("https://api.example.com").unwrap(),
207 /// # status: StatusCode::OK,
208 /// # headers: http::header::HeaderMap::new(),
209 /// # body: Bytes::from(r#"{"value": "test"}"#),
210 /// # request_url: Url::parse("https://api.example.com").unwrap(),
211 /// # meta: None,
212 /// # cached: false,
213 /// # };
214 /// let data: Data = response.json()?;
215 /// # Ok::<(), serde_json::Error>(())
216 /// ```
217 pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
218 serde_json::from_slice(&self.body)
219 }
220
221 /// Parses the response body as HTML.
222 ///
223 /// Returns a [`scraper::Html`] document that can be queried using CSS selectors.
224 ///
225 /// # Errors
226 ///
227 /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
228 ///
229 /// ## Example
230 ///
231 /// ```rust
232 /// # use spider_util::response::Response;
233 /// # use reqwest::StatusCode;
234 /// # use bytes::Bytes;
235 /// # use url::Url;
236 /// # let response = Response {
237 /// # url: Url::parse("https://example.com").unwrap(),
238 /// # status: StatusCode::OK,
239 /// # headers: http::header::HeaderMap::new(),
240 /// # body: Bytes::from("<html><body>Hello</body></html>"),
241 /// # request_url: Url::parse("https://example.com").unwrap(),
242 /// # meta: None,
243 /// # cached: false,
244 /// # };
245 /// let html = response.to_html()?;
246 /// # Ok::<(), std::str::Utf8Error>(())
247 /// ```
248 pub fn to_html(&self) -> Result<Html, Utf8Error> {
249 let body_str = from_utf8(&self.body)?;
250 Ok(Html::parse_document(body_str))
251 }
252
253 /// Lazily parses the response body as HTML.
254 ///
255 /// Returns a closure that can be called when the HTML is actually needed.
256 /// This avoids parsing HTML for responses where it may not be used.
257 ///
258 /// # Errors
259 ///
260 /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
261 ///
262 /// ## Example
263 ///
264 /// ```rust
265 /// # use spider_util::response::Response;
266 /// # use reqwest::StatusCode;
267 /// # use bytes::Bytes;
268 /// # use url::Url;
269 /// # let response = Response {
270 /// # url: Url::parse("https://example.com").unwrap(),
271 /// # status: StatusCode::OK,
272 /// # headers: http::header::HeaderMap::new(),
273 /// # body: Bytes::from("<html><body>Hello</body></html>"),
274 /// # request_url: Url::parse("https://example.com").unwrap(),
275 /// # meta: None,
276 /// # cached: false,
277 /// # };
278 /// let html_fn = response.lazy_html()?;
279 /// // Parse HTML only when needed
280 /// let html = html_fn()?;
281 /// # Ok::<(), std::str::Utf8Error>(())
282 /// ```
283 pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
284 let body_bytes = &self.body;
285 Ok(move || {
286 let body_str = from_utf8(body_bytes)?;
287 Ok(Html::parse_document(body_str))
288 })
289 }
290
291 /// Extracts all unique, same-site links from the response body.
292 ///
293 /// This method discovers links from:
294 /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
295 /// - URLs found in text content (using link detection)
296 ///
297 /// Only links pointing to the same site (same registered domain) are included.
298 ///
299 /// ## Returns
300 ///
301 /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
302 ///
303 /// ## Example
304 ///
305 /// ```rust
306 /// # use spider_util::response::Response;
307 /// # use reqwest::StatusCode;
308 /// # use bytes::Bytes;
309 /// # use url::Url;
310 /// # let response = Response {
311 /// # url: Url::parse("https://example.com").unwrap(),
312 /// # status: StatusCode::OK,
313 /// # headers: http::header::HeaderMap::new(),
314 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
315 /// # request_url: Url::parse("https://example.com").unwrap(),
316 /// # meta: None,
317 /// # cached: false,
318 /// # };
319 /// let links = response.links();
320 /// for link in links.iter() {
321 /// println!("Found {:?} link: {}", link.link_type, link.url);
322 /// }
323 /// ```
324 pub fn links(&self) -> DashSet<Link> {
325 let links = DashSet::new();
326
327 if let Ok(html_fn) = self.lazy_html()
328 && let Ok(html) = html_fn()
329 {
330 let selectors = vec![
331 ("a[href]", "href"),
332 ("link[href]", "href"),
333 ("script[src]", "src"),
334 ("img[src]", "src"),
335 ("audio[src]", "src"),
336 ("video[src]", "src"),
337 ("source[src]", "src"),
338 ];
339
340 for (selector_str, attr_name) in selectors {
341 if let Some(selector) = get_cached_selector(selector_str) {
342 for element in html.select(&selector) {
343 if let Some(attr_value) = element.value().attr(attr_name)
344 && let Ok(url) = self.url.join(attr_value)
345 && util::is_same_site(&url, &self.url)
346 {
347 let link_type = match element.value().name() {
348 "a" => LinkType::Page,
349 "link" => {
350 if let Some(rel) = element.value().attr("rel") {
351 if rel.eq_ignore_ascii_case("stylesheet") {
352 LinkType::Stylesheet
353 } else {
354 LinkType::Other(rel.to_string())
355 }
356 } else {
357 LinkType::Other("link".to_string())
358 }
359 }
360 "script" => LinkType::Script,
361 "img" => LinkType::Image,
362 "audio" | "video" | "source" => LinkType::Media,
363 _ => LinkType::Other(element.value().name().to_string()),
364 };
365 links.insert(Link { url, link_type });
366 }
367 }
368 }
369 }
370
371 let finder = LinkFinder::new();
372 for text_node in html.tree.values().filter_map(|node| node.as_text()) {
373 for link in finder.links(text_node) {
374 if link.kind() == &LinkKind::Url
375 && let Ok(url) = self.url.join(link.as_str())
376 && util::is_same_site(&url, &self.url)
377 {
378 links.insert(Link {
379 url,
380 link_type: LinkType::Page,
381 });
382 }
383 }
384 }
385 }
386
387 links
388 }
389}
390
391impl Clone for Response {
392 fn clone(&self) -> Self {
393 Response {
394 url: self.url.clone(),
395 status: self.status,
396 headers: self.headers.clone(),
397 body: self.body.clone(),
398 request_url: self.request_url.clone(),
399 meta: self.meta.clone(),
400 cached: self.cached,
401 }
402 }
403}