spider_util/response.rs
1//! Response types and response-side helpers.
2//!
3//! [`Response`] wraps the downloaded body together with the final URL, status,
4//! headers, and request metadata. It also provides convenience methods for
5//! Scrapy-like CSS extraction, parsing HTML or JSON, and extracting links.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::response::Response;
11//! use reqwest::StatusCode;
12//! use bytes::Bytes;
13//! use url::Url;
14//!
15//! // Create a response (typically done internally by the downloader)
16//! let response = Response {
17//! url: Url::parse("https://example.com").unwrap(),
18//! status: StatusCode::OK,
19//! headers: http::header::HeaderMap::new(),
20//! body: Bytes::from("<html><body>Hello</body></html>"),
21//! request_url: Url::parse("https://example.com").unwrap(),
22//! request_priority: 0,
23//! meta: None,
24//! cached: false,
25//! };
26//!
27//! // Extract text with the builtin selector API
28//! let heading = response.css("h1::text").unwrap().get();
29//!
30//! // Extract links from the response
31//! let links = response.links();
32//! ```
33//!
34//! In the crawler lifecycle, a [`Response`] is produced by the downloader,
35//! optionally rewritten by middleware, and then handed to
36//! [`Spider::parse`](spider_core::Spider::parse).
37
38use crate::error::SpiderError;
39use crate::request::Request;
40use crate::selector::{SelectorList, get_cached_selector};
41use crate::util;
42use dashmap::{DashMap, DashSet};
43use linkify::{LinkFinder, LinkKind};
44use reqwest::StatusCode;
45use scraper::{ElementRef, Html};
46use seahash::SeaHasher;
47use serde::de::DeserializeOwned;
48use serde::{Deserialize, Serialize};
49use serde_json;
50use std::cell::RefCell;
51use std::collections::HashMap;
52use std::hash::{Hash, Hasher};
53use std::{str::Utf8Error, str::from_utf8, sync::Arc};
54use url::Url;
55
thread_local! {
    // Per-thread memo of parsed documents: maps a response fingerprint (see
    // `Response::html_cache_key`) to its parsed DOM so repeated calls to
    // `css()`, `links()`, or `page_metadata()` on the same response reuse a
    // single `Html::parse_document` pass.
    static HTML_CACHE: RefCell<HashMap<u64, Arc<Html>>> = RefCell::new(HashMap::new());
}

/// Meta key under which the runtime stores the name of the discovery rule
/// that produced a request; read back via `Response::discovery_rule_name`.
const DISCOVERY_RULE_META_KEY: &str = "__discovery_rule";
61
/// Classification for links discovered in a response.
///
/// The variant is normally inferred from the HTML element a URL was found on;
/// [`LinkSource::with_link_type`] can pin a fixed variant for custom sources.
///
/// ## Variants
///
/// - `Page`: Links to other web pages (typically `<a>` tags)
/// - `Script`: Links to JavaScript files (`<script>` tags)
/// - `Stylesheet`: Links to CSS stylesheets (`<link rel="stylesheet">`)
/// - `Image`: Links to images (`<img>` tags)
/// - `Media`: Links to audio/video files (`<audio>`, `<video>`, `<source>`)
/// - `Other`: Any other type of resource with a custom identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to another type of resource; the payload carries an identifier
    /// such as the tag name or `rel` value it was inferred from.
    Other(String),
}

/// A link discovered while extracting URLs from a response.
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::{Link, LinkType};
/// use url::Url;
///
/// let link = Link {
///     url: Url::parse("https://example.com/page").unwrap(),
///     link_type: LinkType::Page,
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Link {
    /// The URL of the discovered link (already resolved to an absolute URL
    /// against the response URL).
    pub url: Url,
    /// The type of the discovered link.
    pub link_type: LinkType,
}

/// One selector/attribute pair used during link extraction.
///
/// This is useful when the default HTML link sources are not enough for the
/// target site and you need to teach the extractor about custom attributes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkSource {
    /// CSS selector used to find candidate elements.
    pub selector: String,
    /// Attribute name that contains the URL.
    pub attribute: String,
    /// Optional fixed link type for matches from this source; when `None`,
    /// the type is inferred from the matched element.
    pub link_type: Option<LinkType>,
}
122
123impl LinkSource {
124 /// Creates a new source definition.
125 pub fn new(selector: impl Into<String>, attribute: impl Into<String>) -> Self {
126 Self {
127 selector: selector.into(),
128 attribute: attribute.into(),
129 link_type: None,
130 }
131 }
132
133 /// Overrides the inferred link type for this source.
134 pub fn with_link_type(mut self, link_type: LinkType) -> Self {
135 self.link_type = Some(link_type);
136 self
137 }
138}
139
/// Options that control link extraction from a [`Response`].
///
/// The defaults are intentionally conservative for crawler use: same-site
/// filtering is enabled, text links are included, and common HTML elements are
/// scanned for navigable URLs.
///
/// All allow/deny lists are applied during extraction; an empty allow-list
/// means "no restriction" rather than "allow nothing".
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkExtractOptions {
    /// Restrict discovered links to the same registered domain.
    pub same_site_only: bool,
    /// Include URLs found in text content.
    pub include_text_links: bool,
    /// HTML sources used to discover attribute-based links.
    pub sources: Vec<LinkSource>,
    /// Optional allow-list of link types to include.
    pub allowed_link_types: Option<Vec<LinkType>>,
    /// Optional deny-list of link types to exclude.
    pub denied_link_types: Vec<LinkType>,
    /// Optional allow-list of glob-style URL patterns (`*` and `?` supported).
    pub allow_patterns: Vec<String>,
    /// Optional deny-list of glob-style URL patterns (`*` and `?` supported).
    pub deny_patterns: Vec<String>,
    /// Optional allow-list of domains or registered-domain suffixes.
    pub allow_domains: Vec<String>,
    /// Optional deny-list of domains or registered-domain suffixes.
    pub deny_domains: Vec<String>,
    /// Optional allow-list of URL path prefixes.
    pub allow_path_prefixes: Vec<String>,
    /// Optional deny-list of URL path prefixes.
    pub deny_path_prefixes: Vec<String>,
    /// Optional allow-list of HTML tag names used for attribute extraction.
    pub allowed_tags: Option<Vec<String>>,
    /// Optional allow-list of attribute names used for attribute extraction.
    pub allowed_attributes: Option<Vec<String>>,
}
174
175impl Default for LinkExtractOptions {
176 fn default() -> Self {
177 Self {
178 same_site_only: true,
179 include_text_links: true,
180 sources: default_link_sources(),
181 allowed_link_types: None,
182 denied_link_types: Vec::new(),
183 allow_patterns: Vec::new(),
184 deny_patterns: Vec::new(),
185 allow_domains: Vec::new(),
186 deny_domains: Vec::new(),
187 allow_path_prefixes: Vec::new(),
188 deny_path_prefixes: Vec::new(),
189 allowed_tags: None,
190 allowed_attributes: None,
191 }
192 }
193}
194
195impl LinkExtractOptions {
196 /// Sets whether only same-site URLs should be returned.
197 pub fn same_site_only(mut self, same_site_only: bool) -> Self {
198 self.same_site_only = same_site_only;
199 self
200 }
201
202 /// Sets whether URLs found in text content should be returned.
203 pub fn include_text_links(mut self, include_text_links: bool) -> Self {
204 self.include_text_links = include_text_links;
205 self
206 }
207
208 /// Replaces the configured HTML extraction sources.
209 pub fn with_sources(mut self, sources: impl IntoIterator<Item = LinkSource>) -> Self {
210 self.sources = sources.into_iter().collect();
211 self
212 }
213
214 /// Adds an HTML extraction source.
215 pub fn add_source(mut self, source: LinkSource) -> Self {
216 self.sources.push(source);
217 self
218 }
219
220 /// Restricts extraction to the provided link types.
221 pub fn with_allowed_link_types(
222 mut self,
223 allowed_link_types: impl IntoIterator<Item = LinkType>,
224 ) -> Self {
225 self.allowed_link_types = Some(allowed_link_types.into_iter().collect());
226 self
227 }
228
229 /// Adds link types that should be excluded even if discovered.
230 pub fn with_denied_link_types(
231 mut self,
232 denied_link_types: impl IntoIterator<Item = LinkType>,
233 ) -> Self {
234 self.denied_link_types = denied_link_types.into_iter().collect();
235 self
236 }
237
238 /// Adds a glob-style allow pattern that URLs must match.
239 pub fn allow_pattern(mut self, pattern: impl Into<String>) -> Self {
240 self.allow_patterns.push(pattern.into());
241 self
242 }
243
244 /// Replaces the glob-style allow patterns.
245 pub fn with_allow_patterns(
246 mut self,
247 patterns: impl IntoIterator<Item = impl Into<String>>,
248 ) -> Self {
249 self.allow_patterns = patterns.into_iter().map(Into::into).collect();
250 self
251 }
252
253 /// Adds a glob-style deny pattern that excludes matching URLs.
254 pub fn deny_pattern(mut self, pattern: impl Into<String>) -> Self {
255 self.deny_patterns.push(pattern.into());
256 self
257 }
258
259 /// Replaces the glob-style deny patterns.
260 pub fn with_deny_patterns(
261 mut self,
262 patterns: impl IntoIterator<Item = impl Into<String>>,
263 ) -> Self {
264 self.deny_patterns = patterns.into_iter().map(Into::into).collect();
265 self
266 }
267
268 /// Adds a domain or registered-domain suffix to allow.
269 pub fn allow_domain(mut self, domain: impl Into<String>) -> Self {
270 self.allow_domains.push(normalize_domain_filter(domain));
271 self
272 }
273
274 /// Replaces the allowed domains.
275 pub fn with_allow_domains(
276 mut self,
277 domains: impl IntoIterator<Item = impl Into<String>>,
278 ) -> Self {
279 self.allow_domains = domains.into_iter().map(normalize_domain_filter).collect();
280 self
281 }
282
283 /// Adds a domain or registered-domain suffix to deny.
284 pub fn deny_domain(mut self, domain: impl Into<String>) -> Self {
285 self.deny_domains.push(normalize_domain_filter(domain));
286 self
287 }
288
289 /// Replaces the denied domains.
290 pub fn with_deny_domains(
291 mut self,
292 domains: impl IntoIterator<Item = impl Into<String>>,
293 ) -> Self {
294 self.deny_domains = domains.into_iter().map(normalize_domain_filter).collect();
295 self
296 }
297
298 /// Adds a URL path prefix that links must match.
299 pub fn allow_path_prefix(mut self, prefix: impl Into<String>) -> Self {
300 self.allow_path_prefixes.push(normalize_path_prefix(prefix));
301 self
302 }
303
304 /// Replaces the allowed URL path prefixes.
305 pub fn with_allow_path_prefixes(
306 mut self,
307 prefixes: impl IntoIterator<Item = impl Into<String>>,
308 ) -> Self {
309 self.allow_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
310 self
311 }
312
313 /// Adds a URL path prefix that should be excluded.
314 pub fn deny_path_prefix(mut self, prefix: impl Into<String>) -> Self {
315 self.deny_path_prefixes.push(normalize_path_prefix(prefix));
316 self
317 }
318
319 /// Replaces the denied URL path prefixes.
320 pub fn with_deny_path_prefixes(
321 mut self,
322 prefixes: impl IntoIterator<Item = impl Into<String>>,
323 ) -> Self {
324 self.deny_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
325 self
326 }
327
328 /// Restricts attribute-based extraction to specific HTML tag names.
329 pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
330 self.allowed_tags = Some(
331 tags.into_iter()
332 .map(Into::into)
333 .map(|tag: String| tag.to_ascii_lowercase())
334 .collect(),
335 );
336 self
337 }
338
339 /// Restricts attribute-based extraction to specific attribute names.
340 pub fn with_allowed_attributes(
341 mut self,
342 attributes: impl IntoIterator<Item = impl Into<String>>,
343 ) -> Self {
344 self.allowed_attributes = Some(
345 attributes
346 .into_iter()
347 .map(Into::into)
348 .map(|attr: String| attr.to_ascii_lowercase())
349 .collect(),
350 );
351 self
352 }
353}
354
/// Structured page metadata extracted from an HTML response.
///
/// Produced by `Response::page_metadata`; fields not present in the document
/// are left as `None`/empty.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageMetadata {
    /// Contents of the `<title>` element (whitespace-trimmed).
    pub title: Option<String>,
    /// Contents of `<meta name="description">` (first occurrence wins).
    pub description: Option<String>,
    /// Canonical URL from `<link rel="canonical">`, resolved against the
    /// response URL.
    pub canonical_url: Option<Url>,
    /// Open Graph metadata such as `og:title` or `og:image`, keyed by the
    /// full `property` attribute value.
    pub open_graph: HashMap<String, String>,
    /// Feed URLs discovered from alternate RSS/Atom link tags (deduplicated).
    pub feed_urls: Vec<Url>,
}
369
370impl PageMetadata {
371 /// Returns `true` when no metadata fields were extracted.
372 pub fn is_empty(&self) -> bool {
373 self.title.is_none()
374 && self.description.is_none()
375 && self.canonical_url.is_none()
376 && self.open_graph.is_empty()
377 && self.feed_urls.is_empty()
378 }
379}
380
/// Represents an HTTP response received from a server.
///
/// [`Response`] contains all information about an HTTP response, including
/// the final URL (after redirects), status code, headers, body content,
/// and metadata carried over from the original request.
///
/// The type is designed for parse-time ergonomics:
/// - [`Response::css`] exposes the recommended Scrapy-like selector API
/// - [`Response::to_html`] remains available for lower-level DOM access
/// - [`Response::json`] deserializes JSON payloads
/// - [`Response::links`] and related helpers extract follow-up links
/// - [`Response::request_from_response`] reconstructs the originating request context
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::Response;
/// use reqwest::StatusCode;
/// use bytes::Bytes;
/// use url::Url;
///
/// let response = Response {
///     url: Url::parse("https://example.com").unwrap(),
///     status: StatusCode::OK,
///     headers: http::header::HeaderMap::new(),
///     body: Bytes::from("<html><body>Hello</body></html>"),
///     request_url: Url::parse("https://example.com").unwrap(),
///     request_priority: 0,
///     meta: None,
///     cached: false,
/// };
///
/// // Extract text using the builtin selector API
/// let title = response.css("title::text").ok().and_then(|list| list.get());
/// ```
#[derive(Debug)]
pub struct Response {
    /// The final URL of the response after any redirects.
    pub url: Url,
    /// The HTTP status code of the response.
    pub status: StatusCode,
    /// The headers of the response.
    pub headers: http::header::HeaderMap,
    /// The body of the response.
    pub body: bytes::Bytes,
    /// The original URL of the request that led to this response.
    pub request_url: Url,
    /// The scheduling priority of the original request.
    pub request_priority: i32,
    /// Metadata associated with the response, carried over from the request.
    /// `None` until first written to (lazy initialization; see
    /// [`Response::insert_meta`]).
    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
    /// Indicates if the response was served from a cache.
    pub cached: bool,
}
435
impl Response {
    /// Creates a new response with default request metadata.
    ///
    /// Most application code receives responses from the runtime rather than
    /// constructing them directly. This constructor is mainly useful for custom
    /// downloaders and lower-level integrations.
    ///
    /// Defaults: priority `0`, no metadata map (allocated lazily on first
    /// [`Response::insert_meta`]), and `cached` set to `false`.
    pub fn new(
        url: Url,
        status: StatusCode,
        headers: http::header::HeaderMap,
        body: bytes::Bytes,
        request_url: Url,
    ) -> Self {
        Self {
            url,
            status,
            headers,
            body,
            request_url,
            request_priority: 0,
            meta: None,
            cached: false,
        }
    }
460
461 /// Reconstructs the original [`Request`] that led to this response.
462 ///
463 /// This method creates a new [`Request`] with the same URL and metadata
464 /// as the request that produced this response. Useful for retry scenarios
465 /// or when you need to re-request the same resource.
466 ///
467 /// ## Example
468 ///
469 /// ```rust,ignore
470 /// # use spider_util::response::Response;
471 /// # use reqwest::StatusCode;
472 /// # use bytes::Bytes;
473 /// # use url::Url;
474 /// # let response = Response {
475 /// # url: Url::parse("https://example.com").unwrap(),
476 /// # status: StatusCode::OK,
477 /// # headers: http::header::HeaderMap::new(),
478 /// # body: Bytes::from("hello"),
479 /// # request_url: Url::parse("https://example.com").unwrap(),
480 /// # request_priority: 0,
481 /// # meta: None,
482 /// # cached: false,
483 /// # };
484 /// let original_request = response.request_from_response();
485 /// ```
486 pub fn request_from_response(&self) -> Request {
487 let mut request =
488 Request::new(self.request_url.clone()).with_priority(self.request_priority);
489 request.set_meta_from_option(self.meta.clone());
490 request
491 }
492
    /// Returns a cloned metadata value by key.
    ///
    /// Returns `None` when the metadata map has not been allocated yet or
    /// the key is absent.
    pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
        self.meta
            .as_ref()
            .and_then(|m| m.get(key).map(|entry| entry.value().clone()))
    }

    /// Deserializes a metadata value into the requested type.
    ///
    /// Returns `Ok(None)` when the key is missing and `Err` when the stored
    /// JSON value cannot be deserialized into `T`.
    pub fn meta_value<T>(&self, key: &str) -> Result<Option<T>, serde_json::Error>
    where
        T: DeserializeOwned,
    {
        self.get_meta(key).map(serde_json::from_value).transpose()
    }

    /// Returns the runtime discovery rule name attached to this response, if any.
    ///
    /// Reads the string stored under the internal discovery-rule meta key;
    /// non-string values yield `None`.
    pub fn discovery_rule_name(&self) -> Option<String> {
        self.get_meta(DISCOVERY_RULE_META_KEY)
            .and_then(|value| value.as_str().map(ToOwned::to_owned))
    }

    /// Returns `true` when the response was reached through the named discovery rule.
    pub fn matches_discovery_rule(&self, rule_name: &str) -> bool {
        self.discovery_rule_name().as_deref() == Some(rule_name)
    }

    /// Inserts a metadata value, lazily allocating the map if needed.
    pub fn insert_meta(&mut self, key: impl Into<String>, value: serde_json::Value) {
        self.meta
            .get_or_insert_with(|| Arc::new(DashMap::new()))
            .insert(key.into(), value);
    }

    /// Returns a clone of the internal metadata map, if present.
    ///
    /// Cloning the `Arc` shares the underlying map; it does not deep-copy
    /// the entries.
    pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
        self.meta.clone()
    }
530
    /// Deserializes the response body as JSON.
    ///
    /// # Type Parameters
    ///
    /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
    ///
    /// # Errors
    ///
    /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
    /// or if it cannot be deserialized into type `T`.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # use serde::Deserialize;
    /// # #[derive(Deserialize)]
    /// # struct Data { value: String }
    /// # let response = Response {
    /// #     url: Url::parse("https://api.example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"{"value": "test"}"#),
    /// #     request_url: Url::parse("https://api.example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let data: Data = response.json()?;
    /// # Ok::<(), serde_json::Error>(())
    /// ```
    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
        serde_json::from_slice(&self.body)
    }
567
    /// Parses the response body as HTML.
    ///
    /// This method is kept for lower-level DOM access and interop. For most
    /// spider code, prefer [`Response::css`] and the builtin selector API.
    ///
    /// Returns a [`scraper::Html`] document that can be queried directly.
    /// Parsing is memoized per thread; the cached document is cloned so the
    /// caller receives an owned copy.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let html = response.to_html()?;
    /// # Ok::<(), std::str::Utf8Error>(())
    /// ```
    pub fn to_html(&self) -> Result<Html, Utf8Error> {
        // Clone out of the per-thread cache so the caller owns the document.
        Ok((*self.cached_html()?).clone())
    }
601
    /// Lazily parses the response body as HTML.
    ///
    /// Returns a closure that can be called when lower-level HTML access is
    /// actually needed. Most spiders should prefer [`Response::css`].
    ///
    /// # Errors
    ///
    /// The outer `Result` currently never fails: the body is not validated
    /// up front, so a [`Utf8Error`] for a non-UTF-8 body surfaces from the
    /// returned closure when it is invoked.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let html_fn = response.lazy_html()?;
    /// // Parse HTML only when needed
    /// let html = html_fn()?;
    /// # Ok::<(), std::str::Utf8Error>(())
    /// ```
    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
        // No eager work: parsing (and UTF-8 validation) happens inside the
        // closure via `to_html`, which memoizes per thread.
        Ok(move || self.to_html())
    }
635
    /// Applies a builtin CSS selector to the response body using a Scrapy-like API.
    ///
    /// Supports standard CSS selectors plus terminal extraction suffixes:
    /// - `::text`
    /// - `::attr(name)`
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"<html><body><h1>Hello</h1><a href="/next">Next</a></body></html>"#),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let heading = response.css("h1::text")?.get().unwrap_or_default();
    /// let next_href = response.css("a::attr(href)")?.get();
    /// # Ok::<(), spider_util::error::SpiderError>(())
    /// ```
    ///
    /// # Errors
    ///
    /// Returns [`SpiderError::Utf8Error`] when the body is not valid UTF-8 and
    /// [`SpiderError::HtmlParseError`] when the selector is invalid.
    pub fn css(&self, query: &str) -> Result<SelectorList, SpiderError> {
        // The parsed document is shared via the per-thread HTML cache.
        let document = self.cached_html()?;
        SelectorList::from_document_query(document, query)
    }
672
    /// Returns the response body as UTF-8 text.
    ///
    /// The body is borrowed, not copied.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] if the body is not valid UTF-8.
    pub fn text(&self) -> Result<&str, Utf8Error> {
        from_utf8(&self.body)
    }
677
    /// Extracts structured page metadata from HTML responses.
    ///
    /// Collects the `<title>` text, `<meta name="description">`, Open Graph
    /// `og:*` properties, the canonical URL, and alternate RSS/Atom feed
    /// links. First occurrences win for the description and for each Open
    /// Graph property; feed URLs are deduplicated.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
    pub fn page_metadata(&self) -> Result<PageMetadata, Utf8Error> {
        let html = self.cached_html()?;
        let mut metadata = PageMetadata::default();

        // <title>: first element only; text concatenated and trimmed, with
        // empty titles treated as absent.
        if let Some(selector) = get_cached_selector("title") {
            metadata.title = html
                .select(&selector)
                .next()
                .map(|node| node.text().collect::<String>().trim().to_string())
                .filter(|value| !value.is_empty());
        }

        // Scan <meta> tags carrying name/property/content; tags without a
        // non-empty `content` attribute are skipped entirely.
        if let Some(selector) = get_cached_selector("meta[name], meta[property], meta[content]") {
            for element in html.select(&selector) {
                let Some(content) = element.value().attr("content") else {
                    continue;
                };
                let content = content.trim();
                if content.is_empty() {
                    continue;
                }

                // <meta name="description">: keep only the first one found.
                if let Some(name) = element.value().attr("name")
                    && name.eq_ignore_ascii_case("description")
                    && metadata.description.is_none()
                {
                    metadata.description = Some(content.to_string());
                }

                // <meta property="og:...">: first value per property wins.
                if let Some(property) = element.value().attr("property")
                    && property.len() >= 3
                    && property[..3].eq_ignore_ascii_case("og:")
                {
                    metadata
                        .open_graph
                        .entry(property.to_string())
                        .or_insert_with(|| content.to_string());
                }
            }
        }

        // <link href=...>: canonical URL and alternate feed discovery.
        if let Some(selector) = get_cached_selector("link[href]") {
            for element in html.select(&selector) {
                let Some(href) = element.value().attr("href") else {
                    continue;
                };
                let rel = element.value().attr("rel").unwrap_or_default();

                // `rel` is a space-separated token list; the first canonical
                // link that resolves against the response URL wins.
                if rel
                    .split_ascii_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("canonical"))
                    && metadata.canonical_url.is_none()
                    && let Ok(url) = self.url.join(href)
                {
                    metadata.canonical_url = Some(url);
                }

                let is_alternate = rel
                    .split_ascii_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("alternate"));
                let ty = element.value().attr("type").unwrap_or_default();
                let is_feed = ty.eq_ignore_ascii_case("application/rss+xml")
                    || ty.eq_ignore_ascii_case("application/atom+xml")
                    || ty.eq_ignore_ascii_case("application/xml")
                    || ty.eq_ignore_ascii_case("text/xml");

                // rel="alternate" with a feed MIME type marks a feed URL;
                // duplicates are skipped.
                if is_alternate
                    && is_feed
                    && let Ok(url) = self.url.join(href)
                    && !metadata.feed_urls.contains(&url)
                {
                    metadata.feed_urls.push(url);
                }
            }
        }

        Ok(metadata)
    }
757
    /// Returns a customizable iterator of links discovered in the response body.
    ///
    /// Unlike [`Response::links`], this method does not deduplicate results.
    /// Callers that need uniqueness can collect into a set or use [`Response::links`].
    ///
    /// Extraction runs eagerly when this method is called; the returned
    /// iterator walks an already-collected list. If the body is not valid
    /// UTF-8, the iterator is empty rather than an error.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::{LinkExtractOptions, Response};
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let links: Vec<_> = response
    ///     .links_iter(LinkExtractOptions::default())
    ///     .collect();
    /// assert!(!links.is_empty());
    /// ```
    pub fn links_iter(&self, options: LinkExtractOptions) -> impl Iterator<Item = Link> {
        // Parse failures (non-UTF-8 bodies) deliberately degrade to an
        // empty iterator instead of propagating an error.
        self.parse_links(options).unwrap_or_default().into_iter()
    }
787
788 /// Extracts all unique, same-site links from the response body.
789 ///
790 /// This method discovers links from:
791 /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
792 /// - URLs found in text content (using link detection)
793 ///
794 /// Only links pointing to the same site (same registered domain) are included.
795 ///
796 /// ## Returns
797 ///
798 /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
799 ///
800 /// ## Example
801 ///
802 /// ```rust,ignore
803 /// # use spider_util::response::Response;
804 /// # use reqwest::StatusCode;
805 /// # use bytes::Bytes;
806 /// # use url::Url;
807 /// # let response = Response {
808 /// # url: Url::parse("https://example.com").unwrap(),
809 /// # status: StatusCode::OK,
810 /// # headers: http::header::HeaderMap::new(),
811 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
812 /// # request_url: Url::parse("https://example.com").unwrap(),
813 /// # meta: None,
814 /// # cached: false,
815 /// # };
816 /// let links = response.links();
817 /// for link in links.iter() {
818 /// println!("Found {:?} link: {}", link.link_type, link.url);
819 /// }
820 /// ```
821 pub fn links(&self) -> DashSet<Link> {
822 let links = DashSet::new();
823
824 for link in self.links_iter(LinkExtractOptions::default()) {
825 links.insert(link);
826 }
827
828 links
829 }
830
831 fn parse_links(&self, options: LinkExtractOptions) -> Result<Vec<Link>, Utf8Error> {
832 let html = self.cached_html()?;
833 let mut links = Vec::new();
834
835 self.collect_attribute_links(&html, &options, &mut links);
836
837 if options.include_text_links {
838 self.collect_text_links(&html, &options, &mut links);
839 }
840
841 Ok(links)
842 }
843
    /// Collects links found via the configured selector/attribute sources.
    ///
    /// Applies the `allowed_attributes` and `allowed_tags` filters, infers a
    /// link type per element unless the source pins one, and delegates URL
    /// resolution plus the remaining filters to `build_link`.
    fn collect_attribute_links(
        &self,
        html: &Html,
        options: &LinkExtractOptions,
        links: &mut Vec<Link>,
    ) {
        for source in &options.sources {
            // Skip the whole source when an attribute allow-list exists and
            // does not mention this source's attribute.
            if !options
                .allowed_attributes
                .as_ref()
                .is_none_or(|allowed| allowed.iter().any(|attr| attr == &source.attribute))
            {
                continue;
            }

            // Sources whose selector is unavailable from the cache (e.g. it
            // failed to compile) are skipped silently.
            let Some(selector) = get_cached_selector(&source.selector) else {
                continue;
            };

            for element in html.select(&selector) {
                let tag_name = element.value().name();
                // Skip elements excluded by the tag allow-list, if one is set.
                if !options
                    .allowed_tags
                    .as_ref()
                    .is_none_or(|allowed| allowed.iter().any(|tag| tag == tag_name))
                {
                    continue;
                }

                let Some(attr_value) = element.value().attr(&source.attribute) else {
                    continue;
                };

                // A source-level link type overrides element-based inference.
                let link_type = source
                    .link_type
                    .clone()
                    .unwrap_or_else(|| infer_link_type(&element));

                if let Some(link) = self.build_link(attr_value, link_type, options) {
                    links.push(link);
                }
            }
        }
    }
888
    /// Collects URLs that appear in the document's plain text nodes.
    ///
    /// Runs `linkify`'s finder over every text node; only URL matches are
    /// kept (other kinds, e.g. e-mail addresses, are skipped), and each hit
    /// is classified as a page link before the standard filters run.
    fn collect_text_links(&self, html: &Html, options: &LinkExtractOptions, links: &mut Vec<Link>) {
        let finder = LinkFinder::new();

        for text_node in html.tree.values().filter_map(|node| node.as_text()) {
            for link in finder.links(text_node) {
                if link.kind() != &LinkKind::Url {
                    continue;
                }

                // Text links are always treated as page links.
                if let Some(link) = self.build_link(link.as_str(), LinkType::Page, options) {
                    links.push(link);
                }
            }
        }
    }
904
    /// Resolves a raw URL against the response URL and applies every
    /// configured filter, returning the finished [`Link`] when it survives.
    ///
    /// Filter order: same-site check, link-type allow/deny, glob pattern
    /// allow/deny, domain allow/deny, path-prefix allow/deny. Empty
    /// allow-lists impose no restriction. Returns `None` when the URL cannot
    /// be resolved or any filter rejects it.
    fn build_link(
        &self,
        raw_url: &str,
        link_type: LinkType,
        options: &LinkExtractOptions,
    ) -> Option<Link> {
        // Relative URLs are resolved against the final response URL;
        // unparseable candidates are dropped.
        let url = self.url.join(raw_url).ok()?;

        if options.same_site_only && !util::is_same_site(&url, &self.url) {
            return None;
        }

        // Reject when a type allow-list exists and omits this type.
        if !options
            .allowed_link_types
            .as_ref()
            .is_none_or(|allowed| allowed.contains(&link_type))
        {
            return None;
        }

        if options.denied_link_types.contains(&link_type) {
            return None;
        }

        // Glob patterns match against the full absolute URL string.
        let absolute_url = url.as_str();
        if !options.allow_patterns.is_empty()
            && !options
                .allow_patterns
                .iter()
                .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        if options
            .deny_patterns
            .iter()
            .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        // Domain filters accept an exact host or any subdomain of it.
        let host = url.host_str().unwrap_or_default();
        if !options.allow_domains.is_empty()
            && !options
                .allow_domains
                .iter()
                .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        if options
            .deny_domains
            .iter()
            .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        // Path-prefix filters operate on the URL path component only.
        let path = url.path();
        if !options.allow_path_prefixes.is_empty()
            && !options
                .allow_path_prefixes
                .iter()
                .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        if options
            .deny_path_prefixes
            .iter()
            .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        Some(Link { url, link_type })
    }
985
    /// Computes the per-thread HTML cache key for this response.
    ///
    /// The fingerprint hashes the final URL, the original request URL, and
    /// the full body with `SeaHasher`, so two responses share a cached DOM
    /// only when all three agree. Note: the body is rehashed on every call.
    fn html_cache_key(&self) -> u64 {
        let mut hasher = SeaHasher::new();
        self.url.as_str().hash(&mut hasher);
        self.request_url.as_str().hash(&mut hasher);
        self.body.hash(&mut hasher);
        hasher.finish()
    }
993
994 fn cached_html(&self) -> Result<Arc<Html>, Utf8Error> {
995 let cache_key = self.html_cache_key();
996
997 HTML_CACHE.with(|cache| {
998 if let Some(html) = cache.borrow().get(&cache_key).cloned() {
999 return Ok(html);
1000 }
1001
1002 let body_str = from_utf8(&self.body)?;
1003 let html = Arc::new(Html::parse_document(body_str));
1004 cache.borrow_mut().insert(cache_key, html.clone());
1005 Ok(html)
1006 })
1007 }
1008}
1009
impl Clone for Response {
    // Field-by-field clone; `meta` is an `Arc`, so the clone shares the same
    // underlying metadata map rather than deep-copying entries.
    // NOTE(review): every field type here implements `Clone`, so this manual
    // impl could be replaced by `#[derive(Clone)]` on the struct.
    fn clone(&self) -> Self {
        Response {
            url: self.url.clone(),
            status: self.status,
            headers: self.headers.clone(),
            body: self.body.clone(),
            request_url: self.request_url.clone(),
            request_priority: self.request_priority,
            meta: self.meta.clone(),
            cached: self.cached,
        }
    }
}
1024
1025fn default_link_sources() -> Vec<LinkSource> {
1026 vec![
1027 LinkSource::new("a[href]", "href"),
1028 LinkSource::new("link[href]", "href"),
1029 LinkSource::new("script[src]", "src"),
1030 LinkSource::new("img[src]", "src"),
1031 LinkSource::new("audio[src]", "src"),
1032 LinkSource::new("video[src]", "src"),
1033 LinkSource::new("source[src]", "src"),
1034 ]
1035}
1036
1037fn infer_link_type(element: &ElementRef<'_>) -> LinkType {
1038 match element.value().name() {
1039 "a" => LinkType::Page,
1040 "link" => {
1041 if let Some(rel) = element.value().attr("rel") {
1042 if rel.eq_ignore_ascii_case("stylesheet") {
1043 LinkType::Stylesheet
1044 } else {
1045 LinkType::Other(rel.to_string())
1046 }
1047 } else {
1048 LinkType::Other("link".to_string())
1049 }
1050 }
1051 "script" => LinkType::Script,
1052 "img" => LinkType::Image,
1053 "audio" | "video" | "source" => LinkType::Media,
1054 _ => LinkType::Other(element.value().name().to_string()),
1055 }
1056}
1057
/// Normalizes a user-supplied domain filter: trims surrounding whitespace,
/// strips any leading dots (`.example.com` → `example.com`), and lowercases
/// the result so comparisons in `domain_matches` are case-insensitive.
fn normalize_domain_filter(domain: impl Into<String>) -> String {
    let raw: String = domain.into();
    let stripped = raw.trim().trim_start_matches('.');
    stripped.to_ascii_lowercase()
}
1065
/// Normalizes a user-supplied path prefix so it always begins with `/`.
///
/// Empty or whitespace-only input (and a bare `/`) collapses to `"/"`;
/// prefixes already rooted at `/` pass through; anything else gets a
/// leading slash prepended.
fn normalize_path_prefix(prefix: impl Into<String>) -> String {
    let owned: String = prefix.into();
    let trimmed = owned.trim();
    match trimmed {
        "" | "/" => String::from("/"),
        rooted if rooted.starts_with('/') => rooted.to_string(),
        relative => format!("/{relative}"),
    }
}
1077
/// Returns `true` when `host` equals `filter` or is a subdomain of it.
///
/// Comparison is ASCII-case-insensitive. The subdomain check requires a `.`
/// immediately before the suffix, so `badexample.com` does NOT match the
/// filter `example.com`. Uses `str::strip_suffix` instead of building a
/// `".{filter}"` string, avoiding a per-call allocation in the link-filter
/// hot path.
fn domain_matches(host: &str, filter: &str) -> bool {
    let host = host.to_ascii_lowercase();
    let filter = filter.to_ascii_lowercase();
    match host.strip_suffix(filter.as_str()) {
        // Exact host match.
        Some("") => true,
        // Suffix match only counts on a label boundary (trailing dot).
        Some(prefix) => prefix.ends_with('.'),
        None => false,
    }
}
1083
/// Matches `input` against a glob-style `pattern` where `*` matches any run
/// of bytes (including none) and `?` matches exactly one byte.
///
/// Implemented as a single-row dynamic program over the input: `reachable[j]`
/// records whether the pattern consumed so far can match the first `j` bytes
/// of the input. Matching operates on raw bytes, so `?` matches one byte
/// rather than one Unicode scalar.
fn glob_matches(pattern: &str, input: &str) -> bool {
    let pattern = pattern.as_bytes();
    let input = input.as_bytes();

    let mut reachable = vec![false; input.len() + 1];
    reachable[0] = true; // Empty pattern matches empty input.

    for &pat_byte in pattern {
        if pat_byte == b'*' {
            // '*' extends any existing match to every longer input prefix;
            // reachable[0] is untouched because '*' also matches nothing.
            for j in 1..=input.len() {
                reachable[j] = reachable[j] || reachable[j - 1];
            }
        } else {
            // A literal byte (or '?') consumes exactly one input byte.
            // Iterate right-to-left so each step reads the previous row.
            for j in (1..=input.len()).rev() {
                reachable[j] = reachable[j - 1] && (pat_byte == b'?' || pat_byte == input[j - 1]);
            }
            reachable[0] = false;
        }
    }

    reachable[input.len()]
}