spider_util/response.rs
1//! Data structures and utilities for handling HTTP responses in `spider-lib`.
2//!
3//! This module defines the [`Response`] struct, which represents an HTTP response
4//! received from a web server. It encapsulates crucial information such as
5//! the URL, status code, headers, and body of the response, along with any
6//! associated metadata.
7//!
8//! Additionally, this module provides:
9//! - Helper methods for [`Response`] to facilitate common tasks like parsing
10//! the body as HTML or JSON, and reconstructing the original [`Request`]
//! - The [`Link`] struct and the [`LinkType`] enum for structured representation
//!   and extraction of hyperlinks found within the response content
13//!
14//! ## Example
15//!
16//! ```rust
17//! use spider_util::response::Response;
18//! use reqwest::StatusCode;
19//! use bytes::Bytes;
20//! use url::Url;
21//!
22//! // Create a response (typically done internally by the downloader)
23//! let response = Response {
24//! url: Url::parse("https://example.com").unwrap(),
25//! status: StatusCode::OK,
26//! headers: http::header::HeaderMap::new(),
27//! body: Bytes::from("<html><body>Hello</body></html>"),
28//! request_url: Url::parse("https://example.com").unwrap(),
29//! meta: None,
30//! cached: false,
31//! };
32//!
33//! // Parse as HTML
34//! let html = response.to_html().unwrap();
35//!
36//! // Extract links from the response
37//! let links = response.links();
38//! ```
39
40use crate::request::Request;
41use crate::selector::get_cached_selector;
42use crate::util;
43use dashmap::{DashMap, DashSet};
44use linkify::{LinkFinder, LinkKind};
45use reqwest::StatusCode;
46use scraper::Html;
47use serde::de::DeserializeOwned;
48use serde_json;
49use std::{str::Utf8Error, str::from_utf8, sync::Arc};
50use url::Url;
51
/// Classifies a discovered link by the kind of resource it points at.
///
/// Crawlers can branch on the [`LinkType`] to decide how (or whether) a
/// link should be followed, downloaded, or ignored.
///
/// ## Variants
///
/// | Variant      | Typical origin                                   |
/// |--------------|--------------------------------------------------|
/// | `Page`       | `<a>` elements pointing at other pages           |
/// | `Script`     | `<script>` elements                              |
/// | `Stylesheet` | `<link rel="stylesheet">` elements               |
/// | `Image`      | `<img>` elements                                 |
/// | `Media`      | `<audio>`, `<video>` and `<source>` elements     |
/// | `Other`      | Anything else, tagged with a custom identifier   |
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// Another web page.
    Page,
    /// A script file.
    Script,
    /// A stylesheet.
    Stylesheet,
    /// An image.
    Image,
    /// An audio or video resource.
    Media,
    /// Any other resource; the string identifies what kind.
    Other(String),
}
80
81/// Represents a link discovered on a web page.
82///
83/// [`Link`] encapsulates both the URL and the type of a discovered link,
84/// enabling type-aware link processing during crawling.
85///
86/// ## Example
87///
88/// ```rust
89/// use spider_util::response::{Link, LinkType};
90/// use url::Url;
91///
92/// let link = Link {
93/// url: Url::parse("https://example.com/page").unwrap(),
94/// link_type: LinkType::Page,
95/// };
96/// ```
97#[derive(Debug, Clone, PartialEq, Eq, Hash)]
98pub struct Link {
99 /// The URL of the discovered link.
100 pub url: Url,
101 /// The type of the discovered link.
102 pub link_type: LinkType,
103}
104
105/// Represents an HTTP response received from a server.
106///
107/// [`Response`] contains all information about an HTTP response, including
108/// the final URL (after redirects), status code, headers, body content,
109/// and metadata carried over from the original request.
110///
111/// ## Example
112///
113/// ```rust
114/// use spider_util::response::Response;
115/// use reqwest::StatusCode;
116/// use bytes::Bytes;
117/// use url::Url;
118///
119/// let response = Response {
120/// url: Url::parse("https://example.com").unwrap(),
121/// status: StatusCode::OK,
122/// headers: http::header::HeaderMap::new(),
123/// body: Bytes::from("<html><body>Hello</body></html>"),
124/// request_url: Url::parse("https://example.com").unwrap(),
125/// meta: None,
126/// cached: false,
127/// };
128///
129/// // Parse the response body as HTML
130/// if let Ok(html) = response.to_html() {
131/// // Process HTML...
132/// }
133/// ```
134#[derive(Debug)]
135pub struct Response {
136 /// The final URL of the response after any redirects.
137 pub url: Url,
138 /// The HTTP status code of the response.
139 pub status: StatusCode,
140 /// The headers of the response.
141 pub headers: http::header::HeaderMap,
142 /// The body of the response.
143 pub body: bytes::Bytes,
144 /// The original URL of the request that led to this response.
145 pub request_url: Url,
146 /// Metadata associated with the response, carried over from the request.
147 /// Uses Option to allow lazy initialization.
148 pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
149 /// Indicates if the response was served from a cache.
150 pub cached: bool,
151}
152
153
154impl Response {
155 /// Reconstructs the original [`Request`] that led to this response.
156 ///
157 /// This method creates a new [`Request`] with the same URL and metadata
158 /// as the request that produced this response. Useful for retry scenarios
159 /// or when you need to re-request the same resource.
160 ///
161 /// ## Example
162 ///
163 /// ```rust
164 /// # use spider_util::response::Response;
165 /// # use reqwest::StatusCode;
166 /// # use bytes::Bytes;
167 /// # use url::Url;
168 /// # let response = Response {
169 /// # url: Url::parse("https://example.com").unwrap(),
170 /// # status: StatusCode::OK,
171 /// # headers: http::header::HeaderMap::new(),
172 /// # body: Bytes::from("hello"),
173 /// # request_url: Url::parse("https://example.com").unwrap(),
174 /// # meta: None,
175 /// # cached: false,
176 /// # };
177 /// let original_request = response.request_from_response();
178 /// ```
179 pub fn request_from_response(&self) -> Request {
180 let mut request = Request::new(self.request_url.clone());
181 request.set_meta_from_option(self.meta.clone());
182 request
183 }
184
185 /// Deserializes the response body as JSON.
186 ///
187 /// # Type Parameters
188 ///
189 /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
190 ///
191 /// # Errors
192 ///
193 /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
194 /// or if it cannot be deserialized into type `T`.
195 ///
196 /// ## Example
197 ///
198 /// ```rust
199 /// # use spider_util::response::Response;
200 /// # use reqwest::StatusCode;
201 /// # use bytes::Bytes;
202 /// # use url::Url;
203 /// # use serde::Deserialize;
204 /// # #[derive(Deserialize)]
205 /// # struct Data { value: String }
206 /// # let response = Response {
207 /// # url: Url::parse("https://api.example.com").unwrap(),
208 /// # status: StatusCode::OK,
209 /// # headers: http::header::HeaderMap::new(),
210 /// # body: Bytes::from(r#"{"value": "test"}"#),
211 /// # request_url: Url::parse("https://api.example.com").unwrap(),
212 /// # meta: None,
213 /// # cached: false,
214 /// # };
215 /// let data: Data = response.json()?;
216 /// # Ok::<(), serde_json::Error>(())
217 /// ```
218 pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
219 serde_json::from_slice(&self.body)
220 }
221
222 /// Parses the response body as HTML.
223 ///
224 /// Returns a [`scraper::Html`] document that can be queried using CSS selectors.
225 ///
226 /// # Errors
227 ///
228 /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
229 ///
230 /// ## Example
231 ///
232 /// ```rust
233 /// # use spider_util::response::Response;
234 /// # use reqwest::StatusCode;
235 /// # use bytes::Bytes;
236 /// # use url::Url;
237 /// # let response = Response {
238 /// # url: Url::parse("https://example.com").unwrap(),
239 /// # status: StatusCode::OK,
240 /// # headers: http::header::HeaderMap::new(),
241 /// # body: Bytes::from("<html><body>Hello</body></html>"),
242 /// # request_url: Url::parse("https://example.com").unwrap(),
243 /// # meta: None,
244 /// # cached: false,
245 /// # };
246 /// let html = response.to_html()?;
247 /// # Ok::<(), std::str::Utf8Error>(())
248 /// ```
249 pub fn to_html(&self) -> Result<Html, Utf8Error> {
250 let body_str = from_utf8(&self.body)?;
251 Ok(Html::parse_document(body_str))
252 }
253
254 /// Lazily parses the response body as HTML.
255 ///
256 /// Returns a closure that can be called when the HTML is actually needed.
257 /// This avoids parsing HTML for responses where it may not be used.
258 ///
259 /// # Errors
260 ///
261 /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
262 ///
263 /// ## Example
264 ///
265 /// ```rust
266 /// # use spider_util::response::Response;
267 /// # use reqwest::StatusCode;
268 /// # use bytes::Bytes;
269 /// # use url::Url;
270 /// # let response = Response {
271 /// # url: Url::parse("https://example.com").unwrap(),
272 /// # status: StatusCode::OK,
273 /// # headers: http::header::HeaderMap::new(),
274 /// # body: Bytes::from("<html><body>Hello</body></html>"),
275 /// # request_url: Url::parse("https://example.com").unwrap(),
276 /// # meta: None,
277 /// # cached: false,
278 /// # };
279 /// let html_fn = response.lazy_html()?;
280 /// // Parse HTML only when needed
281 /// let html = html_fn()?;
282 /// # Ok::<(), std::str::Utf8Error>(())
283 /// ```
284 pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
285 let body_bytes = &self.body;
286 Ok(move || {
287 let body_str = from_utf8(body_bytes)?;
288 Ok(Html::parse_document(body_str))
289 })
290 }
291
292 /// Extracts all unique, same-site links from the response body.
293 ///
294 /// This method discovers links from:
295 /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
296 /// - URLs found in text content (using link detection)
297 ///
298 /// Only links pointing to the same site (same registered domain) are included.
299 ///
300 /// ## Returns
301 ///
302 /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
303 ///
304 /// ## Example
305 ///
306 /// ```rust
307 /// # use spider_util::response::Response;
308 /// # use reqwest::StatusCode;
309 /// # use bytes::Bytes;
310 /// # use url::Url;
311 /// # let response = Response {
312 /// # url: Url::parse("https://example.com").unwrap(),
313 /// # status: StatusCode::OK,
314 /// # headers: http::header::HeaderMap::new(),
315 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
316 /// # request_url: Url::parse("https://example.com").unwrap(),
317 /// # meta: None,
318 /// # cached: false,
319 /// # };
320 /// let links = response.links();
321 /// for link in links.iter() {
322 /// println!("Found {:?} link: {}", link.link_type, link.url);
323 /// }
324 /// ```
325 pub fn links(&self) -> DashSet<Link> {
326 let links = DashSet::new();
327
328 if let Ok(html_fn) = self.lazy_html()
329 && let Ok(html) = html_fn()
330 {
331 let selectors = vec![
332 ("a[href]", "href"),
333 ("link[href]", "href"),
334 ("script[src]", "src"),
335 ("img[src]", "src"),
336 ("audio[src]", "src"),
337 ("video[src]", "src"),
338 ("source[src]", "src"),
339 ];
340
341 for (selector_str, attr_name) in selectors {
342 if let Some(selector) = get_cached_selector(selector_str) {
343 for element in html.select(&selector) {
344 if let Some(attr_value) = element.value().attr(attr_name)
345 && let Ok(url) = self.url.join(attr_value)
346 && util::is_same_site(&url, &self.url)
347 {
348 let link_type = match element.value().name() {
349 "a" => LinkType::Page,
350 "link" => {
351 if let Some(rel) = element.value().attr("rel") {
352 if rel.eq_ignore_ascii_case("stylesheet") {
353 LinkType::Stylesheet
354 } else {
355 LinkType::Other(rel.to_string())
356 }
357 } else {
358 LinkType::Other("link".to_string())
359 }
360 }
361 "script" => LinkType::Script,
362 "img" => LinkType::Image,
363 "audio" | "video" | "source" => LinkType::Media,
364 _ => LinkType::Other(element.value().name().to_string()),
365 };
366 links.insert(Link { url, link_type });
367 }
368 }
369 }
370 }
371
372 let finder = LinkFinder::new();
373 for text_node in html.tree.values().filter_map(|node| node.as_text()) {
374 for link in finder.links(text_node) {
375 if link.kind() == &LinkKind::Url
376 && let Ok(url) = self.url.join(link.as_str())
377 && util::is_same_site(&url, &self.url)
378 {
379 links.insert(Link {
380 url,
381 link_type: LinkType::Page,
382 });
383 }
384 }
385 }
386 }
387
388 links
389 }
390}
391
392impl Clone for Response {
393 fn clone(&self) -> Self {
394 Response {
395 url: self.url.clone(),
396 status: self.status,
397 headers: self.headers.clone(),
398 body: self.body.clone(),
399 request_url: self.request_url.clone(),
400 meta: self.meta.clone(),
401 cached: self.cached,
402 }
403 }
404}