spider_util/response.rs
1//! Response types and response-side helpers.
2//!
3//! [`Response`] wraps the downloaded body together with the final URL, status,
4//! headers, and request metadata. It also provides convenience methods for
5//! parsing HTML or JSON and for extracting links.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::response::Response;
11//! use reqwest::StatusCode;
12//! use bytes::Bytes;
13//! use url::Url;
14//!
15//! // Create a response (typically done internally by the downloader)
16//! let response = Response {
17//! url: Url::parse("https://example.com").unwrap(),
18//! status: StatusCode::OK,
19//! headers: http::header::HeaderMap::new(),
20//! body: Bytes::from("<html><body>Hello</body></html>"),
21//! request_url: Url::parse("https://example.com").unwrap(),
22//! request_priority: 0,
23//! meta: None,
24//! cached: false,
25//! };
26//!
27//! // Parse as HTML
28//! let html = response.to_html().unwrap();
29//!
30//! // Extract links from the response
31//! let links = response.links();
32//! ```
33//!
34//! In the crawler lifecycle, a [`Response`] is produced by the downloader,
35//! optionally rewritten by middleware, and then handed to
36//! [`Spider::parse`](spider_core::Spider::parse).
37
38use crate::request::Request;
39use crate::selector::get_cached_selector;
40use crate::util;
41use dashmap::{DashMap, DashSet};
42use linkify::{LinkFinder, LinkKind};
43use reqwest::StatusCode;
44use scraper::{ElementRef, Html};
45use seahash::SeaHasher;
46use serde::de::DeserializeOwned;
47use serde::{Deserialize, Serialize};
48use serde_json;
49use std::cell::RefCell;
50use std::collections::HashMap;
51use std::hash::{Hash, Hasher};
52use std::{str::Utf8Error, str::from_utf8, sync::Arc};
53use url::Url;
54
thread_local! {
    // Per-thread cache of parsed HTML documents, keyed by
    // `Response::html_cache_key` (URL + request URL + body hash), so repeated
    // `to_html` calls on the same response avoid re-parsing the body.
    static HTML_CACHE: RefCell<HashMap<u64, Html>> = RefCell::new(HashMap::new());
}

/// Meta key under which the runtime stores the name of the discovery rule
/// that produced a response; read back via [`Response::discovery_rule_name`].
const DISCOVERY_RULE_META_KEY: &str = "__discovery_rule";
60
/// Classification for links discovered in a response.
///
/// The type is inferred from the HTML element a URL was found on (see
/// `infer_link_type`), or fixed per extraction source via
/// [`LinkSource::with_link_type`]. Extraction options can filter on it with
/// allow/deny lists.
///
/// ## Variants
///
/// - `Page`: Links to other web pages (typically `<a>` tags)
/// - `Script`: Links to JavaScript files (`<script>` tags)
/// - `Stylesheet`: Links to CSS stylesheets (`<link rel="stylesheet">`)
/// - `Image`: Links to images (`<img>` tags)
/// - `Media`: Links to audio/video files (`<audio>`, `<video>`, `<source>`)
/// - `Other`: Any other type of resource with a custom identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to another type of resource; the payload holds the element name
    /// or `rel` value that produced it.
    Other(String),
}
86
/// A link discovered while extracting URLs from a response.
///
/// URLs are always absolute: relative hrefs are resolved against the
/// response's final URL during extraction.
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::{Link, LinkType};
/// use url::Url;
///
/// let link = Link {
///     url: Url::parse("https://example.com/page").unwrap(),
///     link_type: LinkType::Page,
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Link {
    /// The URL of the discovered link.
    pub url: Url,
    /// The type of the discovered link.
    pub link_type: LinkType,
}
107
/// One selector/attribute pair used during link extraction.
///
/// This is useful when the default HTML link sources are not enough for the
/// target site and you need to teach the extractor about custom attributes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkSource {
    /// CSS selector used to find candidate elements.
    pub selector: String,
    /// Attribute name that contains the URL. Compared against
    /// [`LinkExtractOptions::allowed_attributes`] when that filter is set.
    pub attribute: String,
    /// Optional fixed link type for matches from this source; when `None`,
    /// the type is inferred from the matched element.
    pub link_type: Option<LinkType>,
}
121
122impl LinkSource {
123 /// Creates a new source definition.
124 pub fn new(selector: impl Into<String>, attribute: impl Into<String>) -> Self {
125 Self {
126 selector: selector.into(),
127 attribute: attribute.into(),
128 link_type: None,
129 }
130 }
131
132 /// Overrides the inferred link type for this source.
133 pub fn with_link_type(mut self, link_type: LinkType) -> Self {
134 self.link_type = Some(link_type);
135 self
136 }
137}
138
/// Options that control link extraction from a [`Response`].
///
/// The defaults are intentionally conservative for crawler use: same-site
/// filtering is enabled, text links are included, and common HTML elements are
/// scanned for navigable URLs.
///
/// Filter semantics: empty allow-lists allow everything; deny-lists always
/// exclude. Patterns are matched against the absolute URL string; domains
/// match exactly or as a dot-delimited suffix of the host.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkExtractOptions {
    /// Restrict discovered links to the same registered domain.
    pub same_site_only: bool,
    /// Include URLs found in text content.
    pub include_text_links: bool,
    /// HTML sources used to discover attribute-based links.
    pub sources: Vec<LinkSource>,
    /// Optional allow-list of link types to include.
    pub allowed_link_types: Option<Vec<LinkType>>,
    /// Optional deny-list of link types to exclude.
    pub denied_link_types: Vec<LinkType>,
    /// Optional allow-list of glob-style URL patterns (`*` and `?` supported).
    pub allow_patterns: Vec<String>,
    /// Optional deny-list of glob-style URL patterns (`*` and `?` supported).
    pub deny_patterns: Vec<String>,
    /// Optional allow-list of domains or registered-domain suffixes.
    pub allow_domains: Vec<String>,
    /// Optional deny-list of domains or registered-domain suffixes.
    pub deny_domains: Vec<String>,
    /// Optional allow-list of URL path prefixes.
    pub allow_path_prefixes: Vec<String>,
    /// Optional deny-list of URL path prefixes.
    pub deny_path_prefixes: Vec<String>,
    /// Optional allow-list of HTML tag names used for attribute extraction
    /// (stored lowercase).
    pub allowed_tags: Option<Vec<String>>,
    /// Optional allow-list of attribute names used for attribute extraction
    /// (stored lowercase).
    pub allowed_attributes: Option<Vec<String>>,
}
173
174impl Default for LinkExtractOptions {
175 fn default() -> Self {
176 Self {
177 same_site_only: true,
178 include_text_links: true,
179 sources: default_link_sources(),
180 allowed_link_types: None,
181 denied_link_types: Vec::new(),
182 allow_patterns: Vec::new(),
183 deny_patterns: Vec::new(),
184 allow_domains: Vec::new(),
185 deny_domains: Vec::new(),
186 allow_path_prefixes: Vec::new(),
187 deny_path_prefixes: Vec::new(),
188 allowed_tags: None,
189 allowed_attributes: None,
190 }
191 }
192}
193
194impl LinkExtractOptions {
195 /// Sets whether only same-site URLs should be returned.
196 pub fn same_site_only(mut self, same_site_only: bool) -> Self {
197 self.same_site_only = same_site_only;
198 self
199 }
200
201 /// Sets whether URLs found in text content should be returned.
202 pub fn include_text_links(mut self, include_text_links: bool) -> Self {
203 self.include_text_links = include_text_links;
204 self
205 }
206
207 /// Replaces the configured HTML extraction sources.
208 pub fn with_sources(mut self, sources: impl IntoIterator<Item = LinkSource>) -> Self {
209 self.sources = sources.into_iter().collect();
210 self
211 }
212
213 /// Adds an HTML extraction source.
214 pub fn add_source(mut self, source: LinkSource) -> Self {
215 self.sources.push(source);
216 self
217 }
218
219 /// Restricts extraction to the provided link types.
220 pub fn with_allowed_link_types(
221 mut self,
222 allowed_link_types: impl IntoIterator<Item = LinkType>,
223 ) -> Self {
224 self.allowed_link_types = Some(allowed_link_types.into_iter().collect());
225 self
226 }
227
228 /// Adds link types that should be excluded even if discovered.
229 pub fn with_denied_link_types(
230 mut self,
231 denied_link_types: impl IntoIterator<Item = LinkType>,
232 ) -> Self {
233 self.denied_link_types = denied_link_types.into_iter().collect();
234 self
235 }
236
237 /// Adds a glob-style allow pattern that URLs must match.
238 pub fn allow_pattern(mut self, pattern: impl Into<String>) -> Self {
239 self.allow_patterns.push(pattern.into());
240 self
241 }
242
243 /// Replaces the glob-style allow patterns.
244 pub fn with_allow_patterns(
245 mut self,
246 patterns: impl IntoIterator<Item = impl Into<String>>,
247 ) -> Self {
248 self.allow_patterns = patterns.into_iter().map(Into::into).collect();
249 self
250 }
251
252 /// Adds a glob-style deny pattern that excludes matching URLs.
253 pub fn deny_pattern(mut self, pattern: impl Into<String>) -> Self {
254 self.deny_patterns.push(pattern.into());
255 self
256 }
257
258 /// Replaces the glob-style deny patterns.
259 pub fn with_deny_patterns(
260 mut self,
261 patterns: impl IntoIterator<Item = impl Into<String>>,
262 ) -> Self {
263 self.deny_patterns = patterns.into_iter().map(Into::into).collect();
264 self
265 }
266
267 /// Adds a domain or registered-domain suffix to allow.
268 pub fn allow_domain(mut self, domain: impl Into<String>) -> Self {
269 self.allow_domains.push(normalize_domain_filter(domain));
270 self
271 }
272
273 /// Replaces the allowed domains.
274 pub fn with_allow_domains(
275 mut self,
276 domains: impl IntoIterator<Item = impl Into<String>>,
277 ) -> Self {
278 self.allow_domains = domains.into_iter().map(normalize_domain_filter).collect();
279 self
280 }
281
282 /// Adds a domain or registered-domain suffix to deny.
283 pub fn deny_domain(mut self, domain: impl Into<String>) -> Self {
284 self.deny_domains.push(normalize_domain_filter(domain));
285 self
286 }
287
288 /// Replaces the denied domains.
289 pub fn with_deny_domains(
290 mut self,
291 domains: impl IntoIterator<Item = impl Into<String>>,
292 ) -> Self {
293 self.deny_domains = domains.into_iter().map(normalize_domain_filter).collect();
294 self
295 }
296
297 /// Adds a URL path prefix that links must match.
298 pub fn allow_path_prefix(mut self, prefix: impl Into<String>) -> Self {
299 self.allow_path_prefixes.push(normalize_path_prefix(prefix));
300 self
301 }
302
303 /// Replaces the allowed URL path prefixes.
304 pub fn with_allow_path_prefixes(
305 mut self,
306 prefixes: impl IntoIterator<Item = impl Into<String>>,
307 ) -> Self {
308 self.allow_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
309 self
310 }
311
312 /// Adds a URL path prefix that should be excluded.
313 pub fn deny_path_prefix(mut self, prefix: impl Into<String>) -> Self {
314 self.deny_path_prefixes.push(normalize_path_prefix(prefix));
315 self
316 }
317
318 /// Replaces the denied URL path prefixes.
319 pub fn with_deny_path_prefixes(
320 mut self,
321 prefixes: impl IntoIterator<Item = impl Into<String>>,
322 ) -> Self {
323 self.deny_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
324 self
325 }
326
327 /// Restricts attribute-based extraction to specific HTML tag names.
328 pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
329 self.allowed_tags = Some(
330 tags.into_iter()
331 .map(Into::into)
332 .map(|tag: String| tag.to_ascii_lowercase())
333 .collect(),
334 );
335 self
336 }
337
338 /// Restricts attribute-based extraction to specific attribute names.
339 pub fn with_allowed_attributes(
340 mut self,
341 attributes: impl IntoIterator<Item = impl Into<String>>,
342 ) -> Self {
343 self.allowed_attributes = Some(
344 attributes
345 .into_iter()
346 .map(Into::into)
347 .map(|attr: String| attr.to_ascii_lowercase())
348 .collect(),
349 );
350 self
351 }
352}
353
/// Structured page metadata extracted from an HTML response.
///
/// Produced by [`Response::page_metadata`]; all fields are best-effort and
/// default to empty when the corresponding tags are missing.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageMetadata {
    /// Contents of the `<title>` element (trimmed; `None` when empty).
    pub title: Option<String>,
    /// Contents of `<meta name="description">` (first non-empty occurrence).
    pub description: Option<String>,
    /// Canonical URL from `<link rel="canonical">`, resolved against the
    /// response URL.
    pub canonical_url: Option<Url>,
    /// Open Graph metadata such as `og:title` or `og:image`, keyed by the
    /// full property name.
    pub open_graph: HashMap<String, String>,
    /// Feed URLs discovered from alternate RSS/Atom link tags (deduplicated).
    pub feed_urls: Vec<Url>,
}
368
369impl PageMetadata {
370 /// Returns `true` when no metadata fields were extracted.
371 pub fn is_empty(&self) -> bool {
372 self.title.is_none()
373 && self.description.is_none()
374 && self.canonical_url.is_none()
375 && self.open_graph.is_empty()
376 && self.feed_urls.is_empty()
377 }
378}
379
/// Represents an HTTP response received from a server.
///
/// [`Response`] contains all information about an HTTP response, including
/// the final URL (after redirects), status code, headers, body content,
/// and metadata carried over from the original request.
///
/// The type is designed for parse-time ergonomics:
/// - [`Response::to_html`] parses the body as HTML
/// - [`Response::json`] deserializes JSON payloads
/// - [`Response::links`] and related helpers extract follow-up links
/// - [`Response::request_from_response`] reconstructs the originating request context
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::Response;
/// use reqwest::StatusCode;
/// use bytes::Bytes;
/// use url::Url;
///
/// let response = Response {
///     url: Url::parse("https://example.com").unwrap(),
///     status: StatusCode::OK,
///     headers: http::header::HeaderMap::new(),
///     body: Bytes::from("<html><body>Hello</body></html>"),
///     request_url: Url::parse("https://example.com").unwrap(),
///     request_priority: 0,
///     meta: None,
///     cached: false,
/// };
///
/// // Parse the response body as HTML
/// if let Ok(html) = response.to_html() {
///     // Process HTML...
/// }
/// ```
#[derive(Debug)]
pub struct Response {
    /// The final URL of the response after any redirects.
    pub url: Url,
    /// The HTTP status code of the response.
    pub status: StatusCode,
    /// The headers of the response.
    pub headers: http::header::HeaderMap,
    /// The body of the response.
    pub body: bytes::Bytes,
    /// The original URL of the request that led to this response.
    pub request_url: Url,
    /// The scheduling priority of the original request.
    pub request_priority: i32,
    /// Metadata associated with the response, carried over from the request.
    /// Uses Option to allow lazy initialization; see [`Response::insert_meta`].
    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
    /// Indicates if the response was served from a cache.
    pub cached: bool,
}
435
436impl Response {
437 /// Creates a new response with an empty HTML cache.
438 ///
439 /// Most application code receives responses from the runtime rather than
440 /// constructing them directly. This constructor is mainly useful for custom
441 /// downloaders and lower-level integrations.
442 pub fn new(
443 url: Url,
444 status: StatusCode,
445 headers: http::header::HeaderMap,
446 body: bytes::Bytes,
447 request_url: Url,
448 ) -> Self {
449 Self {
450 url,
451 status,
452 headers,
453 body,
454 request_url,
455 request_priority: 0,
456 meta: None,
457 cached: false,
458 }
459 }
460
461 /// Reconstructs the original [`Request`] that led to this response.
462 ///
463 /// This method creates a new [`Request`] with the same URL and metadata
464 /// as the request that produced this response. Useful for retry scenarios
465 /// or when you need to re-request the same resource.
466 ///
467 /// ## Example
468 ///
469 /// ```rust,ignore
470 /// # use spider_util::response::Response;
471 /// # use reqwest::StatusCode;
472 /// # use bytes::Bytes;
473 /// # use url::Url;
474 /// # let response = Response {
475 /// # url: Url::parse("https://example.com").unwrap(),
476 /// # status: StatusCode::OK,
477 /// # headers: http::header::HeaderMap::new(),
478 /// # body: Bytes::from("hello"),
479 /// # request_url: Url::parse("https://example.com").unwrap(),
480 /// # request_priority: 0,
481 /// # meta: None,
482 /// # cached: false,
483 /// # };
484 /// let original_request = response.request_from_response();
485 /// ```
486 pub fn request_from_response(&self) -> Request {
487 let mut request =
488 Request::new(self.request_url.clone()).with_priority(self.request_priority);
489 request.set_meta_from_option(self.meta.clone());
490 request
491 }
492
493 /// Returns a cloned metadata value by key.
494 pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
495 self.meta
496 .as_ref()
497 .and_then(|m| m.get(key).map(|entry| entry.value().clone()))
498 }
499
500 /// Deserializes a metadata value into the requested type.
501 pub fn meta_value<T>(&self, key: &str) -> Result<Option<T>, serde_json::Error>
502 where
503 T: DeserializeOwned,
504 {
505 self.get_meta(key).map(serde_json::from_value).transpose()
506 }
507
508 /// Returns the runtime discovery rule name attached to this response, if any.
509 pub fn discovery_rule_name(&self) -> Option<String> {
510 self.get_meta(DISCOVERY_RULE_META_KEY)
511 .and_then(|value| value.as_str().map(ToOwned::to_owned))
512 }
513
514 /// Returns `true` when the response was reached through the named discovery rule.
515 pub fn matches_discovery_rule(&self, rule_name: &str) -> bool {
516 self.discovery_rule_name().as_deref() == Some(rule_name)
517 }
518
519 /// Inserts a metadata value, lazily allocating the map if needed.
520 pub fn insert_meta(&mut self, key: impl Into<String>, value: serde_json::Value) {
521 self.meta
522 .get_or_insert_with(|| Arc::new(DashMap::new()))
523 .insert(key.into(), value);
524 }
525
526 /// Returns a clone of the internal metadata map, if present.
527 pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
528 self.meta.clone()
529 }
530
    /// Deserializes the response body as JSON.
    ///
    /// The body bytes are parsed as-is; the `Content-Type` header is not
    /// consulted.
    ///
    /// # Type Parameters
    ///
    /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
    ///
    /// # Errors
    ///
    /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
    /// or if it cannot be deserialized into type `T`.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # use serde::Deserialize;
    /// # #[derive(Deserialize)]
    /// # struct Data { value: String }
    /// # let response = Response {
    /// #     url: Url::parse("https://api.example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"{"value": "test"}"#),
    /// #     request_url: Url::parse("https://api.example.com").unwrap(),
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let data: Data = response.json()?;
    /// # Ok::<(), serde_json::Error>(())
    /// ```
    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
        serde_json::from_slice(&self.body)
    }
567
568 /// Parses the response body as HTML.
569 ///
570 /// Returns a [`scraper::Html`] document that can be queried using CSS selectors.
571 ///
572 /// # Errors
573 ///
574 /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
575 ///
576 /// ## Example
577 ///
578 /// ```rust,ignore
579 /// # use spider_util::response::Response;
580 /// # use reqwest::StatusCode;
581 /// # use bytes::Bytes;
582 /// # use url::Url;
583 /// # let response = Response {
584 /// # url: Url::parse("https://example.com").unwrap(),
585 /// # status: StatusCode::OK,
586 /// # headers: http::header::HeaderMap::new(),
587 /// # body: Bytes::from("<html><body>Hello</body></html>"),
588 /// # request_url: Url::parse("https://example.com").unwrap(),
589 /// # meta: None,
590 /// # cached: false,
591 /// # };
592 /// let html = response.to_html()?;
593 /// # Ok::<(), std::str::Utf8Error>(())
594 /// ```
595 pub fn to_html(&self) -> Result<Html, Utf8Error> {
596 let cache_key = self.html_cache_key();
597
598 HTML_CACHE.with(|cache| {
599 if let Some(html) = cache.borrow().get(&cache_key).cloned() {
600 return Ok(html);
601 }
602
603 let body_str = from_utf8(&self.body)?;
604 let html = Html::parse_document(body_str);
605 cache.borrow_mut().insert(cache_key, html.clone());
606 Ok(html)
607 })
608 }
609
    /// Lazily parses the response body as HTML.
    ///
    /// Returns a closure that can be called when the HTML is actually needed.
    /// This avoids parsing HTML for responses where it may not be used.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
    ///
    /// NOTE(review): the outer `Result` never fails here — the UTF-8 check
    /// only runs inside the returned closure. It is kept for signature
    /// compatibility with existing callers.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let html_fn = response.lazy_html()?;
    /// // Parse HTML only when needed
    /// let html = html_fn()?;
    /// # Ok::<(), std::str::Utf8Error>(())
    /// ```
    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
        Ok(move || self.to_html())
    }
643
644 /// Returns the response body as UTF-8 text.
645 pub fn text(&self) -> Result<&str, Utf8Error> {
646 from_utf8(&self.body)
647 }
648
649 /// Extracts structured page metadata from HTML responses.
650 pub fn page_metadata(&self) -> Result<PageMetadata, Utf8Error> {
651 let html = self.to_html()?;
652 let mut metadata = PageMetadata::default();
653
654 if let Some(selector) = get_cached_selector("title") {
655 metadata.title = html
656 .select(&selector)
657 .next()
658 .map(|node| node.text().collect::<String>().trim().to_string())
659 .filter(|value| !value.is_empty());
660 }
661
662 if let Some(selector) = get_cached_selector("meta[name], meta[property], meta[content]") {
663 for element in html.select(&selector) {
664 let Some(content) = element.value().attr("content") else {
665 continue;
666 };
667 let content = content.trim();
668 if content.is_empty() {
669 continue;
670 }
671
672 if let Some(name) = element.value().attr("name")
673 && name.eq_ignore_ascii_case("description")
674 && metadata.description.is_none()
675 {
676 metadata.description = Some(content.to_string());
677 }
678
679 if let Some(property) = element.value().attr("property")
680 && property.len() >= 3
681 && property[..3].eq_ignore_ascii_case("og:")
682 {
683 metadata
684 .open_graph
685 .entry(property.to_string())
686 .or_insert_with(|| content.to_string());
687 }
688 }
689 }
690
691 if let Some(selector) = get_cached_selector("link[href]") {
692 for element in html.select(&selector) {
693 let Some(href) = element.value().attr("href") else {
694 continue;
695 };
696 let rel = element.value().attr("rel").unwrap_or_default();
697
698 if rel
699 .split_ascii_whitespace()
700 .any(|token| token.eq_ignore_ascii_case("canonical"))
701 && metadata.canonical_url.is_none()
702 && let Ok(url) = self.url.join(href)
703 {
704 metadata.canonical_url = Some(url);
705 }
706
707 let is_alternate = rel
708 .split_ascii_whitespace()
709 .any(|token| token.eq_ignore_ascii_case("alternate"));
710 let ty = element.value().attr("type").unwrap_or_default();
711 let is_feed = ty.eq_ignore_ascii_case("application/rss+xml")
712 || ty.eq_ignore_ascii_case("application/atom+xml")
713 || ty.eq_ignore_ascii_case("application/xml")
714 || ty.eq_ignore_ascii_case("text/xml");
715
716 if is_alternate
717 && is_feed
718 && let Ok(url) = self.url.join(href)
719 && !metadata.feed_urls.contains(&url)
720 {
721 metadata.feed_urls.push(url);
722 }
723 }
724 }
725
726 Ok(metadata)
727 }
728
    /// Returns a customizable iterator of links discovered in the response body.
    ///
    /// Unlike [`Response::links`], this method does not deduplicate results.
    /// Callers that need uniqueness can collect into a set or use [`Response::links`].
    ///
    /// If the body is not valid UTF-8 the error is swallowed and the iterator
    /// is empty.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::{LinkExtractOptions, Response};
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let links: Vec<_> = response
    ///     .links_iter(LinkExtractOptions::default())
    ///     .collect();
    /// assert!(!links.is_empty());
    /// ```
    pub fn links_iter(&self, options: LinkExtractOptions) -> impl Iterator<Item = Link> {
        self.parse_links(options).unwrap_or_default().into_iter()
    }
758
759 /// Extracts all unique, same-site links from the response body.
760 ///
761 /// This method discovers links from:
762 /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
763 /// - URLs found in text content (using link detection)
764 ///
765 /// Only links pointing to the same site (same registered domain) are included.
766 ///
767 /// ## Returns
768 ///
769 /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
770 ///
771 /// ## Example
772 ///
773 /// ```rust,ignore
774 /// # use spider_util::response::Response;
775 /// # use reqwest::StatusCode;
776 /// # use bytes::Bytes;
777 /// # use url::Url;
778 /// # let response = Response {
779 /// # url: Url::parse("https://example.com").unwrap(),
780 /// # status: StatusCode::OK,
781 /// # headers: http::header::HeaderMap::new(),
782 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
783 /// # request_url: Url::parse("https://example.com").unwrap(),
784 /// # meta: None,
785 /// # cached: false,
786 /// # };
787 /// let links = response.links();
788 /// for link in links.iter() {
789 /// println!("Found {:?} link: {}", link.link_type, link.url);
790 /// }
791 /// ```
792 pub fn links(&self) -> DashSet<Link> {
793 let links = DashSet::new();
794
795 for link in self.links_iter(LinkExtractOptions::default()) {
796 links.insert(link);
797 }
798
799 links
800 }
801
802 fn parse_links(&self, options: LinkExtractOptions) -> Result<Vec<Link>, Utf8Error> {
803 let html_fn = self.lazy_html()?;
804 let html = html_fn()?;
805 let mut links = Vec::new();
806
807 self.collect_attribute_links(&html, &options, &mut links);
808
809 if options.include_text_links {
810 self.collect_text_links(&html, &options, &mut links);
811 }
812
813 Ok(links)
814 }
815
816 fn collect_attribute_links(
817 &self,
818 html: &Html,
819 options: &LinkExtractOptions,
820 links: &mut Vec<Link>,
821 ) {
822 for source in &options.sources {
823 if !options
824 .allowed_attributes
825 .as_ref()
826 .is_none_or(|allowed| allowed.iter().any(|attr| attr == &source.attribute))
827 {
828 continue;
829 }
830
831 let Some(selector) = get_cached_selector(&source.selector) else {
832 continue;
833 };
834
835 for element in html.select(&selector) {
836 let tag_name = element.value().name();
837 if !options
838 .allowed_tags
839 .as_ref()
840 .is_none_or(|allowed| allowed.iter().any(|tag| tag == tag_name))
841 {
842 continue;
843 }
844
845 let Some(attr_value) = element.value().attr(&source.attribute) else {
846 continue;
847 };
848
849 let link_type = source
850 .link_type
851 .clone()
852 .unwrap_or_else(|| infer_link_type(&element));
853
854 if let Some(link) = self.build_link(attr_value, link_type, options) {
855 links.push(link);
856 }
857 }
858 }
859 }
860
861 fn collect_text_links(&self, html: &Html, options: &LinkExtractOptions, links: &mut Vec<Link>) {
862 let finder = LinkFinder::new();
863
864 for text_node in html.tree.values().filter_map(|node| node.as_text()) {
865 for link in finder.links(text_node) {
866 if link.kind() != &LinkKind::Url {
867 continue;
868 }
869
870 if let Some(link) = self.build_link(link.as_str(), LinkType::Page, options) {
871 links.push(link);
872 }
873 }
874 }
875 }
876
    /// Resolves `raw_url` against the response URL and runs the configured
    /// filter chain, returning the accepted [`Link`] or `None`.
    ///
    /// Filter order: same-site check, link-type allow/deny lists, glob
    /// patterns on the absolute URL string, domain allow/deny lists, then
    /// path-prefix allow/deny lists. Empty allow-lists accept everything.
    /// Unparseable URLs are silently dropped.
    fn build_link(
        &self,
        raw_url: &str,
        link_type: LinkType,
        options: &LinkExtractOptions,
    ) -> Option<Link> {
        // Relative URLs become absolute here; join failures drop the link.
        let url = self.url.join(raw_url).ok()?;

        if options.same_site_only && !util::is_same_site(&url, &self.url) {
            return None;
        }

        // `None` allow-list means "allow every link type".
        if !options
            .allowed_link_types
            .as_ref()
            .is_none_or(|allowed| allowed.contains(&link_type))
        {
            return None;
        }

        if options.denied_link_types.contains(&link_type) {
            return None;
        }

        // Glob patterns apply to the full absolute URL string.
        let absolute_url = url.as_str();
        if !options.allow_patterns.is_empty()
            && !options
                .allow_patterns
                .iter()
                .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        if options
            .deny_patterns
            .iter()
            .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        // URLs without a host are filtered with an empty host string.
        let host = url.host_str().unwrap_or_default();
        if !options.allow_domains.is_empty()
            && !options
                .allow_domains
                .iter()
                .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        if options
            .deny_domains
            .iter()
            .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        let path = url.path();
        if !options.allow_path_prefixes.is_empty()
            && !options
                .allow_path_prefixes
                .iter()
                .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        if options
            .deny_path_prefixes
            .iter()
            .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        Some(Link { url, link_type })
    }
957
958 fn html_cache_key(&self) -> u64 {
959 let mut hasher = SeaHasher::new();
960 self.url.as_str().hash(&mut hasher);
961 self.request_url.as_str().hash(&mut hasher);
962 self.body.hash(&mut hasher);
963 hasher.finish()
964 }
965}
966
/// Field-by-field clone of the response.
///
/// NOTE(review): every field implements `Clone`, so `#[derive(Clone)]` on
/// `Response` would be equivalent to this manual impl — consider replacing it.
impl Clone for Response {
    fn clone(&self) -> Self {
        Response {
            url: self.url.clone(),
            status: self.status,
            headers: self.headers.clone(),
            body: self.body.clone(),
            request_url: self.request_url.clone(),
            request_priority: self.request_priority,
            meta: self.meta.clone(),
            cached: self.cached,
        }
    }
}
981
982fn default_link_sources() -> Vec<LinkSource> {
983 vec![
984 LinkSource::new("a[href]", "href"),
985 LinkSource::new("link[href]", "href"),
986 LinkSource::new("script[src]", "src"),
987 LinkSource::new("img[src]", "src"),
988 LinkSource::new("audio[src]", "src"),
989 LinkSource::new("video[src]", "src"),
990 LinkSource::new("source[src]", "src"),
991 ]
992}
993
994fn infer_link_type(element: &ElementRef<'_>) -> LinkType {
995 match element.value().name() {
996 "a" => LinkType::Page,
997 "link" => {
998 if let Some(rel) = element.value().attr("rel") {
999 if rel.eq_ignore_ascii_case("stylesheet") {
1000 LinkType::Stylesheet
1001 } else {
1002 LinkType::Other(rel.to_string())
1003 }
1004 } else {
1005 LinkType::Other("link".to_string())
1006 }
1007 }
1008 "script" => LinkType::Script,
1009 "img" => LinkType::Image,
1010 "audio" | "video" | "source" => LinkType::Media,
1011 _ => LinkType::Other(element.value().name().to_string()),
1012 }
1013}
1014
/// Normalizes a user-supplied domain filter: trims whitespace, strips any
/// leading dots, and lowercases for case-insensitive matching.
fn normalize_domain_filter(domain: impl Into<String>) -> String {
    let raw: String = domain.into();
    let stripped = raw.trim().trim_start_matches('.');
    stripped.to_ascii_lowercase()
}
1022
/// Normalizes a user-supplied path prefix: trims whitespace, maps empty
/// input to `"/"`, and guarantees a leading slash.
fn normalize_path_prefix(prefix: impl Into<String>) -> String {
    let owned = prefix.into();
    let trimmed = owned.trim();

    match trimmed {
        "" | "/" => String::from("/"),
        rooted if rooted.starts_with('/') => rooted.to_string(),
        bare => format!("/{bare}"),
    }
}
1034
/// Returns `true` when `host` equals `filter` or is a subdomain of it
/// (i.e. ends with `".{filter}"`), compared case-insensitively.
///
/// Allocation-free: the previous implementation lowercased both strings and
/// built a `format!(".{filter}")` suffix on every call, which is a hot path
/// during link filtering.
fn domain_matches(host: &str, filter: &str) -> bool {
    if host.eq_ignore_ascii_case(filter) {
        return true;
    }

    // For a suffix match the host must be at least one byte (the dot) longer
    // than the filter.
    let Some(dot_index) = host.len().checked_sub(filter.len() + 1) else {
        return false;
    };

    // The byte before the suffix must be a literal '.', which is ASCII, so
    // `dot_index + 1` is guaranteed to be a char boundary when it matches.
    host.as_bytes()[dot_index] == b'.' && host[dot_index + 1..].eq_ignore_ascii_case(filter)
}
1040
/// Matches `input` against a glob-style `pattern` supporting `*` (any run of
/// bytes, possibly empty) and `?` (exactly one byte).
///
/// Matching is byte-wise, so one multi-byte UTF-8 character counts as several
/// `?` positions. Uses the classic two-pointer algorithm that backtracks to
/// the most recent `*`; no allocation, worst case O(|pattern| · |input|).
fn glob_matches(pattern: &str, input: &str) -> bool {
    let pattern = pattern.as_bytes();
    let input = input.as_bytes();
    let (mut p, mut s) = (0usize, 0usize);
    let mut last_star = None;
    let mut match_after_star = 0usize;

    while s < input.len() {
        // `*` must be handled BEFORE the literal comparison: a `*` in the
        // pattern is always a wildcard, even when the current input byte is
        // also `*`. The previous ordering compared literals first, so a
        // literal `*` in the input consumed the pattern star and left nothing
        // to backtrack to (e.g. pattern "*" failed to match input "*y").
        if p < pattern.len() && pattern[p] == b'*' {
            last_star = Some(p);
            p += 1;
            match_after_star = s;
        } else if p < pattern.len() && (pattern[p] == b'?' || pattern[p] == input[s]) {
            p += 1;
            s += 1;
        } else if let Some(star_idx) = last_star {
            // Backtrack: let the most recent `*` swallow one more input byte.
            p = star_idx + 1;
            match_after_star += 1;
            s = match_after_star;
        } else {
            return false;
        }
    }

    // Trailing `*`s may match the empty remainder.
    while p < pattern.len() && pattern[p] == b'*' {
        p += 1;
    }

    p == pattern.len()
}