spider_util/response.rs
1//! Response types and response-side helpers.
2//!
3//! [`Response`] wraps the downloaded body together with the final URL, status,
4//! headers, and request metadata. It also provides convenience methods for
5//! parsing HTML or JSON and for extracting links.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::response::Response;
11//! use reqwest::StatusCode;
12//! use bytes::Bytes;
13//! use url::Url;
14//!
15//! // Create a response (typically done internally by the downloader)
16//! let response = Response {
17//! url: Url::parse("https://example.com").unwrap(),
18//! status: StatusCode::OK,
19//! headers: http::header::HeaderMap::new(),
20//! body: Bytes::from("<html><body>Hello</body></html>"),
21//! request_url: Url::parse("https://example.com").unwrap(),
22//! meta: None,
23//! cached: false,
24//! };
25//!
26//! // Parse as HTML
27//! let html = response.to_html().unwrap();
28//!
29//! // Extract links from the response
30//! let links = response.links();
31//! ```
32//!
33//! In the crawler lifecycle, a [`Response`] is produced by the downloader,
34//! optionally rewritten by middleware, and then handed to
35//! [`Spider::parse`](spider_core::Spider::parse).
36
37use crate::request::Request;
38use crate::selector::get_cached_selector;
39use crate::util;
40use dashmap::{DashMap, DashSet};
41use linkify::{LinkFinder, LinkKind};
42use reqwest::StatusCode;
43use scraper::{ElementRef, Html};
44use seahash::SeaHasher;
45use serde::de::DeserializeOwned;
46use serde_json;
47use std::cell::RefCell;
48use std::collections::HashMap;
49use std::hash::{Hash, Hasher};
50use std::{str::Utf8Error, str::from_utf8, sync::Arc};
51use url::Url;
52
// Per-thread cache of parsed HTML documents, keyed by a hash of the final
// URL, request URL, and body bytes (see `Response::html_cache_key`).
// Thread-local storage avoids any cross-thread synchronization for the
// cached `Html` values.
thread_local! {
    static HTML_CACHE: RefCell<HashMap<u64, Html>> = RefCell::new(HashMap::new());
}
56
/// Category assigned to a discovered [`Link`].
///
/// The extractor infers the category from the HTML element a URL was found
/// on, falling back to [`LinkType::Other`] with the element (or `rel`) name
/// for anything it does not recognize.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// Navigable page link (typically an `<a href>`).
    Page,
    /// JavaScript resource (`<script src>`).
    Script,
    /// CSS stylesheet (`<link rel="stylesheet">`).
    Stylesheet,
    /// Image resource (`<img src>`).
    Image,
    /// Audio or video resource (`<audio>`, `<video>`, `<source>`).
    Media,
    /// Any other kind of resource, tagged with a custom identifier.
    Other(String),
}
82
/// A link discovered while extracting URLs from a response.
///
/// Implements `Eq` and `Hash` so discovered links can be deduplicated in
/// sets (see [`Response::links`]).
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::{Link, LinkType};
/// use url::Url;
///
/// let link = Link {
///     url: Url::parse("https://example.com/page").unwrap(),
///     link_type: LinkType::Page,
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Link {
    /// The absolute URL of the discovered link (relative URLs are resolved
    /// against the response URL during extraction).
    pub url: Url,
    /// The inferred or configured category of the link.
    pub link_type: LinkType,
}
103
/// One selector/attribute pair used during link extraction.
///
/// Each source pairs a CSS selector (to find candidate elements) with the
/// name of the attribute that carries the URL. This is useful when the
/// default HTML link sources are not enough for the target site and you need
/// to teach the extractor about custom attributes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkSource {
    /// CSS selector used to find candidate elements.
    pub selector: String,
    /// Attribute name that contains the URL.
    pub attribute: String,
    /// Optional fixed link type for matches from this source; when `None`,
    /// the type is inferred from the matched element.
    pub link_type: Option<LinkType>,
}
117
118impl LinkSource {
119 /// Creates a new source definition.
120 pub fn new(selector: impl Into<String>, attribute: impl Into<String>) -> Self {
121 Self {
122 selector: selector.into(),
123 attribute: attribute.into(),
124 link_type: None,
125 }
126 }
127
128 /// Overrides the inferred link type for this source.
129 pub fn with_link_type(mut self, link_type: LinkType) -> Self {
130 self.link_type = Some(link_type);
131 self
132 }
133}
134
/// Options that control link extraction from a [`Response`].
///
/// The defaults are intentionally conservative for crawler use: same-site
/// filtering is enabled, text links are included, and common HTML elements are
/// scanned for navigable URLs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkExtractOptions {
    /// Restrict discovered links to the same registered domain as the
    /// response URL (default: `true`).
    pub same_site_only: bool,
    /// Also scan plain-text content for URLs (default: `true`).
    pub include_text_links: bool,
    /// HTML sources used to discover attribute-based links (default: the
    /// common `href`/`src`-bearing elements).
    pub sources: Vec<LinkSource>,
    /// Optional allow-list of link types to include; `None` keeps all types.
    pub allowed_link_types: Option<Vec<LinkType>>,
}
151
152impl Default for LinkExtractOptions {
153 fn default() -> Self {
154 Self {
155 same_site_only: true,
156 include_text_links: true,
157 sources: default_link_sources(),
158 allowed_link_types: None,
159 }
160 }
161}
162
163impl LinkExtractOptions {
164 /// Sets whether only same-site URLs should be returned.
165 pub fn same_site_only(mut self, same_site_only: bool) -> Self {
166 self.same_site_only = same_site_only;
167 self
168 }
169
170 /// Sets whether URLs found in text content should be returned.
171 pub fn include_text_links(mut self, include_text_links: bool) -> Self {
172 self.include_text_links = include_text_links;
173 self
174 }
175
176 /// Replaces the configured HTML extraction sources.
177 pub fn with_sources(mut self, sources: impl IntoIterator<Item = LinkSource>) -> Self {
178 self.sources = sources.into_iter().collect();
179 self
180 }
181
182 /// Adds an HTML extraction source.
183 pub fn add_source(mut self, source: LinkSource) -> Self {
184 self.sources.push(source);
185 self
186 }
187
188 /// Restricts extraction to the provided link types.
189 pub fn with_allowed_link_types(
190 mut self,
191 allowed_link_types: impl IntoIterator<Item = LinkType>,
192 ) -> Self {
193 self.allowed_link_types = Some(allowed_link_types.into_iter().collect());
194 self
195 }
196}
197
/// Represents an HTTP response received from a server.
///
/// [`Response`] contains all information about an HTTP response, including
/// the final URL (after redirects), status code, headers, body content,
/// and metadata carried over from the original request.
///
/// The type is designed for parse-time ergonomics:
/// - [`Response::to_html`] parses the body as HTML
/// - [`Response::json`] deserializes JSON payloads
/// - [`Response::links`] and related helpers extract follow-up links
/// - [`Response::request_from_response`] reconstructs the originating request
///   context
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::Response;
/// use reqwest::StatusCode;
/// use bytes::Bytes;
/// use url::Url;
///
/// let response = Response {
///     url: Url::parse("https://example.com").unwrap(),
///     status: StatusCode::OK,
///     headers: http::header::HeaderMap::new(),
///     body: Bytes::from("<html><body>Hello</body></html>"),
///     request_url: Url::parse("https://example.com").unwrap(),
///     meta: None,
///     cached: false,
/// };
///
/// // Parse the response body as HTML
/// if let Ok(html) = response.to_html() {
///     // Process HTML...
/// }
/// ```
#[derive(Debug)]
pub struct Response {
    /// The final URL of the response after any redirects.
    pub url: Url,
    /// The HTTP status code of the response.
    pub status: StatusCode,
    /// The headers of the response.
    pub headers: http::header::HeaderMap,
    /// The body of the response.
    pub body: bytes::Bytes,
    /// The original URL of the request that led to this response.
    pub request_url: Url,
    /// Metadata associated with the response, carried over from the request.
    /// Uses `Option` to allow lazy initialization.
    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
    /// Indicates if the response was served from a cache.
    pub cached: bool,
}
251
252impl Response {
253 /// Creates a new response with an empty HTML cache.
254 ///
255 /// Most application code receives responses from the runtime rather than
256 /// constructing them directly. This constructor is mainly useful for custom
257 /// downloaders and lower-level integrations.
258 pub fn new(
259 url: Url,
260 status: StatusCode,
261 headers: http::header::HeaderMap,
262 body: bytes::Bytes,
263 request_url: Url,
264 ) -> Self {
265 Self {
266 url,
267 status,
268 headers,
269 body,
270 request_url,
271 meta: None,
272 cached: false,
273 }
274 }
275
276 /// Reconstructs the original [`Request`] that led to this response.
277 ///
278 /// This method creates a new [`Request`] with the same URL and metadata
279 /// as the request that produced this response. Useful for retry scenarios
280 /// or when you need to re-request the same resource.
281 ///
282 /// ## Example
283 ///
284 /// ```rust,ignore
285 /// # use spider_util::response::Response;
286 /// # use reqwest::StatusCode;
287 /// # use bytes::Bytes;
288 /// # use url::Url;
289 /// # let response = Response {
290 /// # url: Url::parse("https://example.com").unwrap(),
291 /// # status: StatusCode::OK,
292 /// # headers: http::header::HeaderMap::new(),
293 /// # body: Bytes::from("hello"),
294 /// # request_url: Url::parse("https://example.com").unwrap(),
295 /// # meta: None,
296 /// # cached: false,
297 /// # };
298 /// let original_request = response.request_from_response();
299 /// ```
300 pub fn request_from_response(&self) -> Request {
301 let mut request = Request::new(self.request_url.clone());
302 request.set_meta_from_option(self.meta.clone());
303 request
304 }
305
306 /// Deserializes the response body as JSON.
307 ///
308 /// # Type Parameters
309 ///
310 /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
311 ///
312 /// # Errors
313 ///
314 /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
315 /// or if it cannot be deserialized into type `T`.
316 ///
317 /// ## Example
318 ///
319 /// ```rust,ignore
320 /// # use spider_util::response::Response;
321 /// # use reqwest::StatusCode;
322 /// # use bytes::Bytes;
323 /// # use url::Url;
324 /// # use serde::Deserialize;
325 /// # #[derive(Deserialize)]
326 /// # struct Data { value: String }
327 /// # let response = Response {
328 /// # url: Url::parse("https://api.example.com").unwrap(),
329 /// # status: StatusCode::OK,
330 /// # headers: http::header::HeaderMap::new(),
331 /// # body: Bytes::from(r#"{"value": "test"}"#),
332 /// # request_url: Url::parse("https://api.example.com").unwrap(),
333 /// # meta: None,
334 /// # cached: false,
335 /// # };
336 /// let data: Data = response.json()?;
337 /// # Ok::<(), serde_json::Error>(())
338 /// ```
339 pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
340 serde_json::from_slice(&self.body)
341 }
342
343 /// Parses the response body as HTML.
344 ///
345 /// Returns a [`scraper::Html`] document that can be queried using CSS selectors.
346 ///
347 /// # Errors
348 ///
349 /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
350 ///
351 /// ## Example
352 ///
353 /// ```rust,ignore
354 /// # use spider_util::response::Response;
355 /// # use reqwest::StatusCode;
356 /// # use bytes::Bytes;
357 /// # use url::Url;
358 /// # let response = Response {
359 /// # url: Url::parse("https://example.com").unwrap(),
360 /// # status: StatusCode::OK,
361 /// # headers: http::header::HeaderMap::new(),
362 /// # body: Bytes::from("<html><body>Hello</body></html>"),
363 /// # request_url: Url::parse("https://example.com").unwrap(),
364 /// # meta: None,
365 /// # cached: false,
366 /// # };
367 /// let html = response.to_html()?;
368 /// # Ok::<(), std::str::Utf8Error>(())
369 /// ```
370 pub fn to_html(&self) -> Result<Html, Utf8Error> {
371 let cache_key = self.html_cache_key();
372
373 HTML_CACHE.with(|cache| {
374 if let Some(html) = cache.borrow().get(&cache_key).cloned() {
375 return Ok(html);
376 }
377
378 let body_str = from_utf8(&self.body)?;
379 let html = Html::parse_document(body_str);
380 cache.borrow_mut().insert(cache_key, html.clone());
381 Ok(html)
382 })
383 }
384
    /// Lazily parses the response body as HTML.
    ///
    /// Returns a closure that can be called when the HTML is actually needed.
    /// This avoids parsing HTML for responses where it may not be used.
    ///
    /// # Errors
    ///
    /// The outer `Result` currently never fails; a [`Utf8Error`] for a
    /// non-UTF-8 body is reported by the *returned closure* when it is
    /// invoked (it delegates to [`Response::to_html`]).
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let html_fn = response.lazy_html()?;
    /// // Parse HTML only when needed
    /// let html = html_fn()?;
    /// # Ok::<(), std::str::Utf8Error>(())
    /// ```
    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
        Ok(move || self.to_html())
    }
418
419 /// Returns a customizable iterator of links discovered in the response body.
420 ///
421 /// Unlike [`Response::links`], this method does not deduplicate results.
422 /// Callers that need uniqueness can collect into a set or use [`Response::links`].
423 ///
424 /// ## Example
425 ///
426 /// ```rust,ignore
427 /// # use spider_util::response::{LinkExtractOptions, Response};
428 /// # use reqwest::StatusCode;
429 /// # use bytes::Bytes;
430 /// # use url::Url;
431 /// # let response = Response {
432 /// # url: Url::parse("https://example.com").unwrap(),
433 /// # status: StatusCode::OK,
434 /// # headers: http::header::HeaderMap::new(),
435 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
436 /// # request_url: Url::parse("https://example.com").unwrap(),
437 /// # meta: None,
438 /// # cached: false,
439 /// # };
440 /// let links: Vec<_> = response
441 /// .links_iter(LinkExtractOptions::default())
442 /// .collect();
443 /// assert!(!links.is_empty());
444 /// ```
445 pub fn links_iter(&self, options: LinkExtractOptions) -> impl Iterator<Item = Link> {
446 self.parse_links(options).unwrap_or_default().into_iter()
447 }
448
449 /// Extracts all unique, same-site links from the response body.
450 ///
451 /// This method discovers links from:
452 /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
453 /// - URLs found in text content (using link detection)
454 ///
455 /// Only links pointing to the same site (same registered domain) are included.
456 ///
457 /// ## Returns
458 ///
459 /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
460 ///
461 /// ## Example
462 ///
463 /// ```rust,ignore
464 /// # use spider_util::response::Response;
465 /// # use reqwest::StatusCode;
466 /// # use bytes::Bytes;
467 /// # use url::Url;
468 /// # let response = Response {
469 /// # url: Url::parse("https://example.com").unwrap(),
470 /// # status: StatusCode::OK,
471 /// # headers: http::header::HeaderMap::new(),
472 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
473 /// # request_url: Url::parse("https://example.com").unwrap(),
474 /// # meta: None,
475 /// # cached: false,
476 /// # };
477 /// let links = response.links();
478 /// for link in links.iter() {
479 /// println!("Found {:?} link: {}", link.link_type, link.url);
480 /// }
481 /// ```
482 pub fn links(&self) -> DashSet<Link> {
483 let links = DashSet::new();
484
485 for link in self.links_iter(LinkExtractOptions::default()) {
486 links.insert(link);
487 }
488
489 links
490 }
491
492 fn parse_links(&self, options: LinkExtractOptions) -> Result<Vec<Link>, Utf8Error> {
493 let html_fn = self.lazy_html()?;
494 let html = html_fn()?;
495 let mut links = Vec::new();
496
497 self.collect_attribute_links(&html, &options, &mut links);
498
499 if options.include_text_links {
500 self.collect_text_links(&html, &options, &mut links);
501 }
502
503 Ok(links)
504 }
505
506 fn collect_attribute_links(
507 &self,
508 html: &Html,
509 options: &LinkExtractOptions,
510 links: &mut Vec<Link>,
511 ) {
512 for source in &options.sources {
513 let Some(selector) = get_cached_selector(&source.selector) else {
514 continue;
515 };
516
517 for element in html.select(&selector) {
518 let Some(attr_value) = element.value().attr(&source.attribute) else {
519 continue;
520 };
521
522 let link_type = source
523 .link_type
524 .clone()
525 .unwrap_or_else(|| infer_link_type(&element));
526
527 if let Some(link) = self.build_link(attr_value, link_type, options) {
528 links.push(link);
529 }
530 }
531 }
532 }
533
534 fn collect_text_links(&self, html: &Html, options: &LinkExtractOptions, links: &mut Vec<Link>) {
535 let finder = LinkFinder::new();
536
537 for text_node in html.tree.values().filter_map(|node| node.as_text()) {
538 for link in finder.links(text_node) {
539 if link.kind() != &LinkKind::Url {
540 continue;
541 }
542
543 if let Some(link) = self.build_link(link.as_str(), LinkType::Page, options) {
544 links.push(link);
545 }
546 }
547 }
548 }
549
550 fn build_link(
551 &self,
552 raw_url: &str,
553 link_type: LinkType,
554 options: &LinkExtractOptions,
555 ) -> Option<Link> {
556 let url = self.url.join(raw_url).ok()?;
557
558 if options.same_site_only && !util::is_same_site(&url, &self.url) {
559 return None;
560 }
561
562 if !options
563 .allowed_link_types
564 .as_ref()
565 .is_none_or(|allowed| allowed.contains(&link_type))
566 {
567 return None;
568 }
569
570 Some(Link { url, link_type })
571 }
572
573 fn html_cache_key(&self) -> u64 {
574 let mut hasher = SeaHasher::new();
575 self.url.as_str().hash(&mut hasher);
576 self.request_url.as_str().hash(&mut hasher);
577 self.body.hash(&mut hasher);
578 hasher.finish()
579 }
580}
581
582impl Clone for Response {
583 fn clone(&self) -> Self {
584 Response {
585 url: self.url.clone(),
586 status: self.status,
587 headers: self.headers.clone(),
588 body: self.body.clone(),
589 request_url: self.request_url.clone(),
590 meta: self.meta.clone(),
591 cached: self.cached,
592 }
593 }
594}
595
596fn default_link_sources() -> Vec<LinkSource> {
597 vec![
598 LinkSource::new("a[href]", "href"),
599 LinkSource::new("link[href]", "href"),
600 LinkSource::new("script[src]", "src"),
601 LinkSource::new("img[src]", "src"),
602 LinkSource::new("audio[src]", "src"),
603 LinkSource::new("video[src]", "src"),
604 LinkSource::new("source[src]", "src"),
605 ]
606}
607
608fn infer_link_type(element: &ElementRef<'_>) -> LinkType {
609 match element.value().name() {
610 "a" => LinkType::Page,
611 "link" => {
612 if let Some(rel) = element.value().attr("rel") {
613 if rel.eq_ignore_ascii_case("stylesheet") {
614 LinkType::Stylesheet
615 } else {
616 LinkType::Other(rel.to_string())
617 }
618 } else {
619 LinkType::Other("link".to_string())
620 }
621 }
622 "script" => LinkType::Script,
623 "img" => LinkType::Image,
624 "audio" | "video" | "source" => LinkType::Media,
625 _ => LinkType::Other(element.value().name().to_string()),
626 }
627}