// spider_util/response.rs
//! Response types and response-side helpers.
//!
//! [`Response`] wraps the downloaded body together with the final URL, status,
//! headers, and request metadata. It also provides convenience methods for
//! parsing HTML or JSON and for extracting links.
//!
//! ## Example
//!
//! ```rust,ignore
//! use spider_util::response::Response;
//! use reqwest::StatusCode;
//! use bytes::Bytes;
//! use url::Url;
//!
//! // Create a response (typically done internally by the downloader)
//! let response = Response {
//!     url: Url::parse("https://example.com").unwrap(),
//!     status: StatusCode::OK,
//!     headers: http::header::HeaderMap::new(),
//!     body: Bytes::from("<html><body>Hello</body></html>"),
//!     request_url: Url::parse("https://example.com").unwrap(),
//!     meta: None,
//!     cached: false,
//! };
//!
//! // Parse as HTML
//! let html = response.to_html().unwrap();
//!
//! // Extract links from the response
//! let links = response.links();
//! ```
//!
//! In the crawler lifecycle, a [`Response`] is produced by the downloader,
//! optionally rewritten by middleware, and then handed to
//! [`Spider::parse`](spider_core::Spider::parse).
use crate::request::Request;
use crate::selector::get_cached_selector;
use crate::util;
use dashmap::{DashMap, DashSet};
use linkify::{LinkFinder, LinkKind};
use reqwest::StatusCode;
use scraper::{ElementRef, Html};
use seahash::SeaHasher;
use serde::de::DeserializeOwned;
use serde_json;
use std::cell::RefCell;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::str::{Utf8Error, from_utf8};
use std::sync::Arc;
use url::Url;

// Per-thread memoization of parsed HTML documents, keyed by a hash of the
// response URLs and body (see `Response::html_cache_key`). Entries are only
// ever inserted (in `Response::to_html`), never evicted, so the cache grows
// for the lifetime of each thread.
thread_local! {
    static HTML_CACHE: RefCell<HashMap<u64, Html>> = RefCell::new(HashMap::new());
}
/// Classification for links discovered in a response.
///
/// The extractor tags each discovered URL with the kind of resource it
/// refers to, inferred from the HTML element it was found in:
///
/// - `Page`: other web pages (typically `<a>` tags)
/// - `Script`: JavaScript files (`<script>` tags)
/// - `Stylesheet`: CSS stylesheets (`<link rel="stylesheet">`)
/// - `Image`: images (`<img>` tags)
/// - `Media`: audio/video files (`<audio>`, `<video>`, `<source>`)
/// - `Other`: any other resource, carrying a custom identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to any other kind of resource, tagged with a custom identifier.
    Other(String),
}
83/// A link discovered while extracting URLs from a response.
84///
85/// ## Example
86///
87/// ```rust,ignore
88/// use spider_util::response::{Link, LinkType};
89/// use url::Url;
90///
91/// let link = Link {
92///     url: Url::parse("https://example.com/page").unwrap(),
93///     link_type: LinkType::Page,
94/// };
95/// ```
96#[derive(Debug, Clone, PartialEq, Eq, Hash)]
97pub struct Link {
98    /// The URL of the discovered link.
99    pub url: Url,
100    /// The type of the discovered link.
101    pub link_type: LinkType,
102}
103
104/// One selector/attribute pair used during link extraction.
105///
106/// This is useful when the default HTML link sources are not enough for the
107/// target site and you need to teach the extractor about custom attributes.
108#[derive(Debug, Clone, PartialEq, Eq)]
109pub struct LinkSource {
110    /// CSS selector used to find candidate elements.
111    pub selector: String,
112    /// Attribute name that contains the URL.
113    pub attribute: String,
114    /// Optional fixed link type for matches from this source.
115    pub link_type: Option<LinkType>,
116}
117
118impl LinkSource {
119    /// Creates a new source definition.
120    pub fn new(selector: impl Into<String>, attribute: impl Into<String>) -> Self {
121        Self {
122            selector: selector.into(),
123            attribute: attribute.into(),
124            link_type: None,
125        }
126    }
127
128    /// Overrides the inferred link type for this source.
129    pub fn with_link_type(mut self, link_type: LinkType) -> Self {
130        self.link_type = Some(link_type);
131        self
132    }
133}
134
135/// Options that control link extraction from a [`Response`].
136///
137/// The defaults are intentionally conservative for crawler use: same-site
138/// filtering is enabled, text links are included, and common HTML elements are
139/// scanned for navigable URLs.
140#[derive(Debug, Clone, PartialEq, Eq)]
141pub struct LinkExtractOptions {
142    /// Restrict discovered links to the same registered domain.
143    pub same_site_only: bool,
144    /// Include URLs found in text content.
145    pub include_text_links: bool,
146    /// HTML sources used to discover attribute-based links.
147    pub sources: Vec<LinkSource>,
148    /// Optional allow-list of link types to include.
149    pub allowed_link_types: Option<Vec<LinkType>>,
150}
151
152impl Default for LinkExtractOptions {
153    fn default() -> Self {
154        Self {
155            same_site_only: true,
156            include_text_links: true,
157            sources: default_link_sources(),
158            allowed_link_types: None,
159        }
160    }
161}
162
163impl LinkExtractOptions {
164    /// Sets whether only same-site URLs should be returned.
165    pub fn same_site_only(mut self, same_site_only: bool) -> Self {
166        self.same_site_only = same_site_only;
167        self
168    }
169
170    /// Sets whether URLs found in text content should be returned.
171    pub fn include_text_links(mut self, include_text_links: bool) -> Self {
172        self.include_text_links = include_text_links;
173        self
174    }
175
176    /// Replaces the configured HTML extraction sources.
177    pub fn with_sources(mut self, sources: impl IntoIterator<Item = LinkSource>) -> Self {
178        self.sources = sources.into_iter().collect();
179        self
180    }
181
182    /// Adds an HTML extraction source.
183    pub fn add_source(mut self, source: LinkSource) -> Self {
184        self.sources.push(source);
185        self
186    }
187
188    /// Restricts extraction to the provided link types.
189    pub fn with_allowed_link_types(
190        mut self,
191        allowed_link_types: impl IntoIterator<Item = LinkType>,
192    ) -> Self {
193        self.allowed_link_types = Some(allowed_link_types.into_iter().collect());
194        self
195    }
196}
197
198/// Represents an HTTP response received from a server.
199///
200/// [`Response`] contains all information about an HTTP response, including
201/// the final URL (after redirects), status code, headers, body content,
202/// and metadata carried over from the original request.
203///
204/// The type is designed for parse-time ergonomics:
205/// - [`Response::to_html`] parses the body as HTML
206/// - [`Response::json`] deserializes JSON payloads
207/// - [`Response::links`] and related helpers extract follow-up links
208/// - [`Response::to_request`] reconstructs the originating request context
209///
210/// ## Example
211///
212/// ```rust,ignore
213/// use spider_util::response::Response;
214/// use reqwest::StatusCode;
215/// use bytes::Bytes;
216/// use url::Url;
217///
218/// let response = Response {
219///     url: Url::parse("https://example.com").unwrap(),
220///     status: StatusCode::OK,
221///     headers: http::header::HeaderMap::new(),
222///     body: Bytes::from("<html><body>Hello</body></html>"),
223///     request_url: Url::parse("https://example.com").unwrap(),
224///     meta: None,
225///     cached: false,
226/// };
227///
228/// // Parse the response body as HTML
229/// if let Ok(html) = response.to_html() {
230///     // Process HTML...
231/// }
232/// ```
233#[derive(Debug)]
234pub struct Response {
235    /// The final URL of the response after any redirects.
236    pub url: Url,
237    /// The HTTP status code of the response.
238    pub status: StatusCode,
239    /// The headers of the response.
240    pub headers: http::header::HeaderMap,
241    /// The body of the response.
242    pub body: bytes::Bytes,
243    /// The original URL of the request that led to this response.
244    pub request_url: Url,
245    /// Metadata associated with the response, carried over from the request.
246    /// Uses Option to allow lazy initialization.
247    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
248    /// Indicates if the response was served from a cache.
249    pub cached: bool,
250}
251
252impl Response {
253    /// Creates a new response with an empty HTML cache.
254    ///
255    /// Most application code receives responses from the runtime rather than
256    /// constructing them directly. This constructor is mainly useful for custom
257    /// downloaders and lower-level integrations.
258    pub fn new(
259        url: Url,
260        status: StatusCode,
261        headers: http::header::HeaderMap,
262        body: bytes::Bytes,
263        request_url: Url,
264    ) -> Self {
265        Self {
266            url,
267            status,
268            headers,
269            body,
270            request_url,
271            meta: None,
272            cached: false,
273        }
274    }
275
276    /// Reconstructs the original [`Request`] that led to this response.
277    ///
278    /// This method creates a new [`Request`] with the same URL and metadata
279    /// as the request that produced this response. Useful for retry scenarios
280    /// or when you need to re-request the same resource.
281    ///
282    /// ## Example
283    ///
284    /// ```rust,ignore
285    /// # use spider_util::response::Response;
286    /// # use reqwest::StatusCode;
287    /// # use bytes::Bytes;
288    /// # use url::Url;
289    /// # let response = Response {
290    /// #     url: Url::parse("https://example.com").unwrap(),
291    /// #     status: StatusCode::OK,
292    /// #     headers: http::header::HeaderMap::new(),
293    /// #     body: Bytes::from("hello"),
294    /// #     request_url: Url::parse("https://example.com").unwrap(),
295    /// #     meta: None,
296    /// #     cached: false,
297    /// # };
298    /// let original_request = response.request_from_response();
299    /// ```
300    pub fn request_from_response(&self) -> Request {
301        let mut request = Request::new(self.request_url.clone());
302        request.set_meta_from_option(self.meta.clone());
303        request
304    }
305
306    /// Deserializes the response body as JSON.
307    ///
308    /// # Type Parameters
309    ///
310    /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
311    ///
312    /// # Errors
313    ///
314    /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
315    /// or if it cannot be deserialized into type `T`.
316    ///
317    /// ## Example
318    ///
319    /// ```rust,ignore
320    /// # use spider_util::response::Response;
321    /// # use reqwest::StatusCode;
322    /// # use bytes::Bytes;
323    /// # use url::Url;
324    /// # use serde::Deserialize;
325    /// # #[derive(Deserialize)]
326    /// # struct Data { value: String }
327    /// # let response = Response {
328    /// #     url: Url::parse("https://api.example.com").unwrap(),
329    /// #     status: StatusCode::OK,
330    /// #     headers: http::header::HeaderMap::new(),
331    /// #     body: Bytes::from(r#"{"value": "test"}"#),
332    /// #     request_url: Url::parse("https://api.example.com").unwrap(),
333    /// #     meta: None,
334    /// #     cached: false,
335    /// # };
336    /// let data: Data = response.json()?;
337    /// # Ok::<(), serde_json::Error>(())
338    /// ```
339    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
340        serde_json::from_slice(&self.body)
341    }
342
343    /// Parses the response body as HTML.
344    ///
345    /// Returns a [`scraper::Html`] document that can be queried using CSS selectors.
346    ///
347    /// # Errors
348    ///
349    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
350    ///
351    /// ## Example
352    ///
353    /// ```rust,ignore
354    /// # use spider_util::response::Response;
355    /// # use reqwest::StatusCode;
356    /// # use bytes::Bytes;
357    /// # use url::Url;
358    /// # let response = Response {
359    /// #     url: Url::parse("https://example.com").unwrap(),
360    /// #     status: StatusCode::OK,
361    /// #     headers: http::header::HeaderMap::new(),
362    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
363    /// #     request_url: Url::parse("https://example.com").unwrap(),
364    /// #     meta: None,
365    /// #     cached: false,
366    /// # };
367    /// let html = response.to_html()?;
368    /// # Ok::<(), std::str::Utf8Error>(())
369    /// ```
370    pub fn to_html(&self) -> Result<Html, Utf8Error> {
371        let cache_key = self.html_cache_key();
372
373        HTML_CACHE.with(|cache| {
374            if let Some(html) = cache.borrow().get(&cache_key).cloned() {
375                return Ok(html);
376            }
377
378            let body_str = from_utf8(&self.body)?;
379            let html = Html::parse_document(body_str);
380            cache.borrow_mut().insert(cache_key, html.clone());
381            Ok(html)
382        })
383    }
384
385    /// Lazily parses the response body as HTML.
386    ///
387    /// Returns a closure that can be called when the HTML is actually needed.
388    /// This avoids parsing HTML for responses where it may not be used.
389    ///
390    /// # Errors
391    ///
392    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
393    ///
394    /// ## Example
395    ///
396    /// ```rust,ignore
397    /// # use spider_util::response::Response;
398    /// # use reqwest::StatusCode;
399    /// # use bytes::Bytes;
400    /// # use url::Url;
401    /// # let response = Response {
402    /// #     url: Url::parse("https://example.com").unwrap(),
403    /// #     status: StatusCode::OK,
404    /// #     headers: http::header::HeaderMap::new(),
405    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
406    /// #     request_url: Url::parse("https://example.com").unwrap(),
407    /// #     meta: None,
408    /// #     cached: false,
409    /// # };
410    /// let html_fn = response.lazy_html()?;
411    /// // Parse HTML only when needed
412    /// let html = html_fn()?;
413    /// # Ok::<(), std::str::Utf8Error>(())
414    /// ```
415    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
416        Ok(move || self.to_html())
417    }
418
419    /// Returns a customizable iterator of links discovered in the response body.
420    ///
421    /// Unlike [`Response::links`], this method does not deduplicate results.
422    /// Callers that need uniqueness can collect into a set or use [`Response::links`].
423    ///
424    /// ## Example
425    ///
426    /// ```rust,ignore
427    /// # use spider_util::response::{LinkExtractOptions, Response};
428    /// # use reqwest::StatusCode;
429    /// # use bytes::Bytes;
430    /// # use url::Url;
431    /// # let response = Response {
432    /// #     url: Url::parse("https://example.com").unwrap(),
433    /// #     status: StatusCode::OK,
434    /// #     headers: http::header::HeaderMap::new(),
435    /// #     body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
436    /// #     request_url: Url::parse("https://example.com").unwrap(),
437    /// #     meta: None,
438    /// #     cached: false,
439    /// # };
440    /// let links: Vec<_> = response
441    ///     .links_iter(LinkExtractOptions::default())
442    ///     .collect();
443    /// assert!(!links.is_empty());
444    /// ```
445    pub fn links_iter(&self, options: LinkExtractOptions) -> impl Iterator<Item = Link> {
446        self.parse_links(options).unwrap_or_default().into_iter()
447    }
448
449    /// Extracts all unique, same-site links from the response body.
450    ///
451    /// This method discovers links from:
452    /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
453    /// - URLs found in text content (using link detection)
454    ///
455    /// Only links pointing to the same site (same registered domain) are included.
456    ///
457    /// ## Returns
458    ///
459    /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
460    ///
461    /// ## Example
462    ///
463    /// ```rust,ignore
464    /// # use spider_util::response::Response;
465    /// # use reqwest::StatusCode;
466    /// # use bytes::Bytes;
467    /// # use url::Url;
468    /// # let response = Response {
469    /// #     url: Url::parse("https://example.com").unwrap(),
470    /// #     status: StatusCode::OK,
471    /// #     headers: http::header::HeaderMap::new(),
472    /// #     body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
473    /// #     request_url: Url::parse("https://example.com").unwrap(),
474    /// #     meta: None,
475    /// #     cached: false,
476    /// # };
477    /// let links = response.links();
478    /// for link in links.iter() {
479    ///     println!("Found {:?} link: {}", link.link_type, link.url);
480    /// }
481    /// ```
482    pub fn links(&self) -> DashSet<Link> {
483        let links = DashSet::new();
484
485        for link in self.links_iter(LinkExtractOptions::default()) {
486            links.insert(link);
487        }
488
489        links
490    }
491
492    fn parse_links(&self, options: LinkExtractOptions) -> Result<Vec<Link>, Utf8Error> {
493        let html_fn = self.lazy_html()?;
494        let html = html_fn()?;
495        let mut links = Vec::new();
496
497        self.collect_attribute_links(&html, &options, &mut links);
498
499        if options.include_text_links {
500            self.collect_text_links(&html, &options, &mut links);
501        }
502
503        Ok(links)
504    }
505
506    fn collect_attribute_links(
507        &self,
508        html: &Html,
509        options: &LinkExtractOptions,
510        links: &mut Vec<Link>,
511    ) {
512        for source in &options.sources {
513            let Some(selector) = get_cached_selector(&source.selector) else {
514                continue;
515            };
516
517            for element in html.select(&selector) {
518                let Some(attr_value) = element.value().attr(&source.attribute) else {
519                    continue;
520                };
521
522                let link_type = source
523                    .link_type
524                    .clone()
525                    .unwrap_or_else(|| infer_link_type(&element));
526
527                if let Some(link) = self.build_link(attr_value, link_type, options) {
528                    links.push(link);
529                }
530            }
531        }
532    }
533
534    fn collect_text_links(&self, html: &Html, options: &LinkExtractOptions, links: &mut Vec<Link>) {
535        let finder = LinkFinder::new();
536
537        for text_node in html.tree.values().filter_map(|node| node.as_text()) {
538            for link in finder.links(text_node) {
539                if link.kind() != &LinkKind::Url {
540                    continue;
541                }
542
543                if let Some(link) = self.build_link(link.as_str(), LinkType::Page, options) {
544                    links.push(link);
545                }
546            }
547        }
548    }
549
550    fn build_link(
551        &self,
552        raw_url: &str,
553        link_type: LinkType,
554        options: &LinkExtractOptions,
555    ) -> Option<Link> {
556        let url = self.url.join(raw_url).ok()?;
557
558        if options.same_site_only && !util::is_same_site(&url, &self.url) {
559            return None;
560        }
561
562        if !options
563            .allowed_link_types
564            .as_ref()
565            .is_none_or(|allowed| allowed.contains(&link_type))
566        {
567            return None;
568        }
569
570        Some(Link { url, link_type })
571    }
572
573    fn html_cache_key(&self) -> u64 {
574        let mut hasher = SeaHasher::new();
575        self.url.as_str().hash(&mut hasher);
576        self.request_url.as_str().hash(&mut hasher);
577        self.body.hash(&mut hasher);
578        hasher.finish()
579    }
580}
581
582impl Clone for Response {
583    fn clone(&self) -> Self {
584        Response {
585            url: self.url.clone(),
586            status: self.status,
587            headers: self.headers.clone(),
588            body: self.body.clone(),
589            request_url: self.request_url.clone(),
590            meta: self.meta.clone(),
591            cached: self.cached,
592        }
593    }
594}
595
596fn default_link_sources() -> Vec<LinkSource> {
597    vec![
598        LinkSource::new("a[href]", "href"),
599        LinkSource::new("link[href]", "href"),
600        LinkSource::new("script[src]", "src"),
601        LinkSource::new("img[src]", "src"),
602        LinkSource::new("audio[src]", "src"),
603        LinkSource::new("video[src]", "src"),
604        LinkSource::new("source[src]", "src"),
605    ]
606}
607
608fn infer_link_type(element: &ElementRef<'_>) -> LinkType {
609    match element.value().name() {
610        "a" => LinkType::Page,
611        "link" => {
612            if let Some(rel) = element.value().attr("rel") {
613                if rel.eq_ignore_ascii_case("stylesheet") {
614                    LinkType::Stylesheet
615                } else {
616                    LinkType::Other(rel.to_string())
617                }
618            } else {
619                LinkType::Other("link".to_string())
620            }
621        }
622        "script" => LinkType::Script,
623        "img" => LinkType::Image,
624        "audio" | "video" | "source" => LinkType::Media,
625        _ => LinkType::Other(element.value().name().to_string()),
626    }
627}