spider_util/response.rs
1//! Response types and response-side helpers.
2//!
3//! [`Response`] wraps the downloaded body together with the final URL, status,
4//! headers, and request metadata. It also provides convenience methods for
5//! Scrapy-like CSS extraction, parsing HTML or JSON, and extracting links.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::response::Response;
11//! use reqwest::StatusCode;
12//! use bytes::Bytes;
13//! use url::Url;
14//!
15//! // Create a response (typically done internally by the downloader)
16//! let response = Response {
17//! url: Url::parse("https://example.com").unwrap(),
18//! status: StatusCode::OK,
19//! headers: http::header::HeaderMap::new(),
20//! body: Bytes::from("<html><body>Hello</body></html>"),
21//! request_url: Url::parse("https://example.com").unwrap(),
22//! request_priority: 0,
23//! meta: None,
24//! cached: false,
25//! };
26//!
27//! // Extract text with the builtin selector API
28//! let heading = response.css("h1::text").unwrap().get();
29//!
30//! // Extract links from the response
31//! let links = response.links();
32//! ```
33//!
34//! In the crawler lifecycle, a [`Response`] is produced by the downloader,
35//! optionally rewritten by middleware, and then handed to
36//! [`Spider::parse`](spider_core::Spider::parse).
37
38use crate::error::SpiderError;
39use crate::request::Request;
40use crate::selector::{SelectorList, get_cached_selector};
41use crate::util;
42use dashmap::{DashMap, DashSet};
43use linkify::{LinkFinder, LinkKind};
44use reqwest::StatusCode;
45use scraper::{ElementRef, Html};
46use seahash::SeaHasher;
47use serde::de::DeserializeOwned;
48use serde::{Deserialize, Serialize};
49use serde_json;
50use std::cell::RefCell;
51use std::collections::HashMap;
52use std::hash::{Hash, Hasher};
53use std::{str::Utf8Error, str::from_utf8, sync::Arc};
54use url::Url;
55
thread_local! {
    // Per-thread memo of parsed documents: maps a response fingerprint (see
    // `Response::html_cache_key`) to its parsed DOM so repeated calls to
    // `css()`, `links()`, or `page_metadata()` on the same response reuse a
    // single `Html::parse_document` pass.
    static HTML_CACHE: RefCell<HashMap<u64, Arc<Html>>> = RefCell::new(HashMap::new());
}

/// Meta key under which the runtime stores the name of the discovery rule
/// that produced a request; read back via `Response::discovery_rule_name`.
const DISCOVERY_RULE_META_KEY: &str = "__discovery_rule";
61
/// Classification for links discovered in a response.
///
/// The variant is normally inferred from the HTML element a URL was found on;
/// [`LinkSource::with_link_type`] can pin a fixed variant for custom sources.
///
/// ## Variants
///
/// - `Page`: Links to other web pages (typically `<a>` tags)
/// - `Script`: Links to JavaScript files (`<script>` tags)
/// - `Stylesheet`: Links to CSS stylesheets (`<link rel="stylesheet">`)
/// - `Image`: Links to images (`<img>` tags)
/// - `Media`: Links to audio/video files (`<audio>`, `<video>`, `<source>`)
/// - `Other`: Any other type of resource with a custom identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to another type of resource; the payload carries an identifier
    /// such as the tag name or `rel` value it was inferred from.
    Other(String),
}

/// A link discovered while extracting URLs from a response.
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::{Link, LinkType};
/// use url::Url;
///
/// let link = Link {
///     url: Url::parse("https://example.com/page").unwrap(),
///     link_type: LinkType::Page,
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Link {
    /// The URL of the discovered link (already resolved to an absolute URL
    /// against the response URL).
    pub url: Url,
    /// The type of the discovered link.
    pub link_type: LinkType,
}

/// One selector/attribute pair used during link extraction.
///
/// This is useful when the default HTML link sources are not enough for the
/// target site and you need to teach the extractor about custom attributes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkSource {
    /// CSS selector used to find candidate elements.
    pub selector: String,
    /// Attribute name that contains the URL.
    pub attribute: String,
    /// Optional fixed link type for matches from this source; when `None`,
    /// the type is inferred from the matched element.
    pub link_type: Option<LinkType>,
}
122
123impl LinkSource {
124 /// Creates a new source definition.
125 pub fn new(selector: impl Into<String>, attribute: impl Into<String>) -> Self {
126 Self {
127 selector: selector.into(),
128 attribute: attribute.into(),
129 link_type: None,
130 }
131 }
132
133 /// Overrides the inferred link type for this source.
134 pub fn with_link_type(mut self, link_type: LinkType) -> Self {
135 self.link_type = Some(link_type);
136 self
137 }
138}
139
/// Options that control link extraction from a [`Response`].
///
/// The defaults are intentionally conservative for crawler use: same-site
/// filtering is enabled, text links are included, and common HTML elements are
/// scanned for navigable URLs.
///
/// All allow/deny lists are applied during extraction; an empty allow-list
/// means "no restriction" rather than "allow nothing".
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkExtractOptions {
    /// Restrict discovered links to the same registered domain.
    pub same_site_only: bool,
    /// Include URLs found in text content.
    pub include_text_links: bool,
    /// HTML sources used to discover attribute-based links.
    pub sources: Vec<LinkSource>,
    /// Optional allow-list of link types to include.
    pub allowed_link_types: Option<Vec<LinkType>>,
    /// Optional deny-list of link types to exclude.
    pub denied_link_types: Vec<LinkType>,
    /// Optional allow-list of glob-style URL patterns (`*` and `?` supported).
    pub allow_patterns: Vec<String>,
    /// Optional deny-list of glob-style URL patterns (`*` and `?` supported).
    pub deny_patterns: Vec<String>,
    /// Optional allow-list of domains or registered-domain suffixes.
    pub allow_domains: Vec<String>,
    /// Optional deny-list of domains or registered-domain suffixes.
    pub deny_domains: Vec<String>,
    /// Optional allow-list of URL path prefixes.
    pub allow_path_prefixes: Vec<String>,
    /// Optional deny-list of URL path prefixes.
    pub deny_path_prefixes: Vec<String>,
    /// Optional allow-list of HTML tag names used for attribute extraction.
    pub allowed_tags: Option<Vec<String>>,
    /// Optional allow-list of attribute names used for attribute extraction.
    pub allowed_attributes: Option<Vec<String>>,
}
174
175impl Default for LinkExtractOptions {
176 fn default() -> Self {
177 Self {
178 same_site_only: true,
179 include_text_links: true,
180 sources: default_link_sources(),
181 allowed_link_types: None,
182 denied_link_types: Vec::new(),
183 allow_patterns: Vec::new(),
184 deny_patterns: Vec::new(),
185 allow_domains: Vec::new(),
186 deny_domains: Vec::new(),
187 allow_path_prefixes: Vec::new(),
188 deny_path_prefixes: Vec::new(),
189 allowed_tags: None,
190 allowed_attributes: None,
191 }
192 }
193}
194
195impl LinkExtractOptions {
196 /// Sets whether only same-site URLs should be returned.
197 pub fn same_site_only(mut self, same_site_only: bool) -> Self {
198 self.same_site_only = same_site_only;
199 self
200 }
201
202 /// Sets whether URLs found in text content should be returned.
203 pub fn include_text_links(mut self, include_text_links: bool) -> Self {
204 self.include_text_links = include_text_links;
205 self
206 }
207
208 /// Replaces the configured HTML extraction sources.
209 pub fn with_sources(mut self, sources: impl IntoIterator<Item = LinkSource>) -> Self {
210 self.sources = sources.into_iter().collect();
211 self
212 }
213
214 /// Adds an HTML extraction source.
215 pub fn add_source(mut self, source: LinkSource) -> Self {
216 self.sources.push(source);
217 self
218 }
219
220 /// Restricts extraction to the provided link types.
221 pub fn with_allowed_link_types(
222 mut self,
223 allowed_link_types: impl IntoIterator<Item = LinkType>,
224 ) -> Self {
225 self.allowed_link_types = Some(allowed_link_types.into_iter().collect());
226 self
227 }
228
229 /// Adds link types that should be excluded even if discovered.
230 pub fn with_denied_link_types(
231 mut self,
232 denied_link_types: impl IntoIterator<Item = LinkType>,
233 ) -> Self {
234 self.denied_link_types = denied_link_types.into_iter().collect();
235 self
236 }
237
238 /// Adds a glob-style allow pattern that URLs must match.
239 pub fn allow_pattern(mut self, pattern: impl Into<String>) -> Self {
240 self.allow_patterns.push(pattern.into());
241 self
242 }
243
244 /// Replaces the glob-style allow patterns.
245 pub fn with_allow_patterns(
246 mut self,
247 patterns: impl IntoIterator<Item = impl Into<String>>,
248 ) -> Self {
249 self.allow_patterns = patterns.into_iter().map(Into::into).collect();
250 self
251 }
252
253 /// Adds a glob-style deny pattern that excludes matching URLs.
254 pub fn deny_pattern(mut self, pattern: impl Into<String>) -> Self {
255 self.deny_patterns.push(pattern.into());
256 self
257 }
258
259 /// Replaces the glob-style deny patterns.
260 pub fn with_deny_patterns(
261 mut self,
262 patterns: impl IntoIterator<Item = impl Into<String>>,
263 ) -> Self {
264 self.deny_patterns = patterns.into_iter().map(Into::into).collect();
265 self
266 }
267
268 /// Adds a domain or registered-domain suffix to allow.
269 pub fn allow_domain(mut self, domain: impl Into<String>) -> Self {
270 self.allow_domains.push(normalize_domain_filter(domain));
271 self
272 }
273
274 /// Replaces the allowed domains.
275 pub fn with_allow_domains(
276 mut self,
277 domains: impl IntoIterator<Item = impl Into<String>>,
278 ) -> Self {
279 self.allow_domains = domains.into_iter().map(normalize_domain_filter).collect();
280 self
281 }
282
283 /// Adds a domain or registered-domain suffix to deny.
284 pub fn deny_domain(mut self, domain: impl Into<String>) -> Self {
285 self.deny_domains.push(normalize_domain_filter(domain));
286 self
287 }
288
289 /// Replaces the denied domains.
290 pub fn with_deny_domains(
291 mut self,
292 domains: impl IntoIterator<Item = impl Into<String>>,
293 ) -> Self {
294 self.deny_domains = domains.into_iter().map(normalize_domain_filter).collect();
295 self
296 }
297
298 /// Adds a URL path prefix that links must match.
299 pub fn allow_path_prefix(mut self, prefix: impl Into<String>) -> Self {
300 self.allow_path_prefixes.push(normalize_path_prefix(prefix));
301 self
302 }
303
304 /// Replaces the allowed URL path prefixes.
305 pub fn with_allow_path_prefixes(
306 mut self,
307 prefixes: impl IntoIterator<Item = impl Into<String>>,
308 ) -> Self {
309 self.allow_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
310 self
311 }
312
313 /// Adds a URL path prefix that should be excluded.
314 pub fn deny_path_prefix(mut self, prefix: impl Into<String>) -> Self {
315 self.deny_path_prefixes.push(normalize_path_prefix(prefix));
316 self
317 }
318
319 /// Replaces the denied URL path prefixes.
320 pub fn with_deny_path_prefixes(
321 mut self,
322 prefixes: impl IntoIterator<Item = impl Into<String>>,
323 ) -> Self {
324 self.deny_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
325 self
326 }
327
328 /// Restricts attribute-based extraction to specific HTML tag names.
329 pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
330 self.allowed_tags = Some(
331 tags.into_iter()
332 .map(Into::into)
333 .map(|tag: String| tag.to_ascii_lowercase())
334 .collect(),
335 );
336 self
337 }
338
339 /// Restricts attribute-based extraction to specific attribute names.
340 pub fn with_allowed_attributes(
341 mut self,
342 attributes: impl IntoIterator<Item = impl Into<String>>,
343 ) -> Self {
344 self.allowed_attributes = Some(
345 attributes
346 .into_iter()
347 .map(Into::into)
348 .map(|attr: String| attr.to_ascii_lowercase())
349 .collect(),
350 );
351 self
352 }
353}
354
/// Structured page metadata extracted from an HTML response.
///
/// Produced by `Response::page_metadata`; fields not present in the document
/// are left as `None`/empty.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageMetadata {
    /// Contents of the `<title>` element (whitespace-trimmed).
    pub title: Option<String>,
    /// Contents of `<meta name="description">` (first occurrence wins).
    pub description: Option<String>,
    /// Canonical URL from `<link rel="canonical">`, resolved against the
    /// response URL.
    pub canonical_url: Option<Url>,
    /// Open Graph metadata such as `og:title` or `og:image`, keyed by the
    /// full `property` attribute value.
    pub open_graph: HashMap<String, String>,
    /// Feed URLs discovered from alternate RSS/Atom link tags (deduplicated).
    pub feed_urls: Vec<Url>,
}
369
370impl PageMetadata {
371 /// Returns `true` when no metadata fields were extracted.
372 pub fn is_empty(&self) -> bool {
373 self.title.is_none()
374 && self.description.is_none()
375 && self.canonical_url.is_none()
376 && self.open_graph.is_empty()
377 && self.feed_urls.is_empty()
378 }
379}
380
/// Represents an HTTP response received from a server.
///
/// [`Response`] contains all information about an HTTP response, including
/// the final URL (after redirects), status code, headers, body content,
/// and metadata carried over from the original request.
///
/// The type is designed for parse-time ergonomics:
/// - [`Response::css`] exposes the recommended Scrapy-like selector API
/// - [`Response::to_html`] remains available for lower-level DOM access
/// - [`Response::json`] deserializes JSON payloads
/// - [`Response::links`] and related helpers extract follow-up links
/// - [`Response::request_from_response`] reconstructs the originating request context
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::Response;
/// use reqwest::StatusCode;
/// use bytes::Bytes;
/// use url::Url;
///
/// let response = Response {
///     url: Url::parse("https://example.com").unwrap(),
///     status: StatusCode::OK,
///     headers: http::header::HeaderMap::new(),
///     body: Bytes::from("<html><body>Hello</body></html>"),
///     request_url: Url::parse("https://example.com").unwrap(),
///     request_priority: 0,
///     meta: None,
///     cached: false,
/// };
///
/// // Extract text using the builtin selector API
/// let title = response.css("title::text").ok().and_then(|list| list.get());
/// ```
#[derive(Debug)]
pub struct Response {
    /// The final URL of the response after any redirects.
    pub url: Url,
    /// The HTTP status code of the response.
    pub status: StatusCode,
    /// The headers of the response.
    pub headers: http::header::HeaderMap,
    /// The body of the response.
    pub body: bytes::Bytes,
    /// The original URL of the request that led to this response.
    pub request_url: Url,
    /// The scheduling priority of the original request.
    pub request_priority: i32,
    /// Metadata associated with the response, carried over from the request.
    /// `None` until first written to (lazy initialization; see
    /// [`Response::insert_meta`]).
    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
    /// Indicates if the response was served from a cache.
    pub cached: bool,
}
435
impl Response {
    /// Creates a new response with default request metadata.
    ///
    /// Most application code receives responses from the runtime rather than
    /// constructing them directly. This constructor is mainly useful for custom
    /// downloaders and lower-level integrations.
    ///
    /// Defaults: priority `0`, no metadata map (allocated lazily on first
    /// [`Response::insert_meta`]), and `cached` set to `false`.
    pub fn new(
        url: Url,
        status: StatusCode,
        headers: http::header::HeaderMap,
        body: bytes::Bytes,
        request_url: Url,
    ) -> Self {
        Self {
            url,
            status,
            headers,
            body,
            request_url,
            request_priority: 0,
            meta: None,
            cached: false,
        }
    }
460
461 /// Reconstructs the original [`Request`] that led to this response.
462 ///
463 /// This method creates a new [`Request`] with the same URL and metadata
464 /// as the request that produced this response. Useful for retry scenarios
465 /// or when you need to re-request the same resource.
466 ///
467 /// ## Example
468 ///
469 /// ```rust,ignore
470 /// # use spider_util::response::Response;
471 /// # use reqwest::StatusCode;
472 /// # use bytes::Bytes;
473 /// # use url::Url;
474 /// # let response = Response {
475 /// # url: Url::parse("https://example.com").unwrap(),
476 /// # status: StatusCode::OK,
477 /// # headers: http::header::HeaderMap::new(),
478 /// # body: Bytes::from("hello"),
479 /// # request_url: Url::parse("https://example.com").unwrap(),
480 /// # request_priority: 0,
481 /// # meta: None,
482 /// # cached: false,
483 /// # };
484 /// let original_request = response.request_from_response();
485 /// ```
486 pub fn request_from_response(&self) -> Request {
487 let mut request =
488 Request::new(self.request_url.clone()).with_priority(self.request_priority);
489 request.set_meta_from_option(self.meta.clone());
490 request
491 }
492
    /// Returns a cloned metadata value by key.
    ///
    /// Returns `None` when the metadata map has not been allocated yet or
    /// the key is absent.
    pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
        self.meta
            .as_ref()
            .and_then(|m| m.get(key).map(|entry| entry.value().clone()))
    }

    /// Deserializes a metadata value into the requested type.
    ///
    /// Returns `Ok(None)` when the key is missing and `Err` when the stored
    /// JSON value cannot be deserialized into `T`.
    pub fn meta_value<T>(&self, key: &str) -> Result<Option<T>, serde_json::Error>
    where
        T: DeserializeOwned,
    {
        self.get_meta(key).map(serde_json::from_value).transpose()
    }

    /// Returns the runtime discovery rule name attached to this response, if any.
    ///
    /// Reads the string stored under the internal discovery-rule meta key;
    /// non-string values yield `None`.
    pub fn discovery_rule_name(&self) -> Option<String> {
        self.get_meta(DISCOVERY_RULE_META_KEY)
            .and_then(|value| value.as_str().map(ToOwned::to_owned))
    }

    /// Returns `true` when the response was reached through the named discovery rule.
    pub fn matches_discovery_rule(&self, rule_name: &str) -> bool {
        self.discovery_rule_name().as_deref() == Some(rule_name)
    }

    /// Inserts a metadata value, lazily allocating the map if needed.
    pub fn insert_meta(&mut self, key: impl Into<String>, value: serde_json::Value) {
        self.meta
            .get_or_insert_with(|| Arc::new(DashMap::new()))
            .insert(key.into(), value);
    }

    /// Returns a clone of the internal metadata map, if present.
    ///
    /// Cloning the `Arc` shares the underlying map; it does not deep-copy
    /// the entries.
    pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
        self.meta.clone()
    }
530
    /// Deserializes the response body as JSON.
    ///
    /// # Type Parameters
    ///
    /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
    ///
    /// # Errors
    ///
    /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
    /// or if it cannot be deserialized into type `T`.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # use serde::Deserialize;
    /// # #[derive(Deserialize)]
    /// # struct Data { value: String }
    /// # let response = Response {
    /// #     url: Url::parse("https://api.example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"{"value": "test"}"#),
    /// #     request_url: Url::parse("https://api.example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let data: Data = response.json()?;
    /// # Ok::<(), serde_json::Error>(())
    /// ```
    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
        serde_json::from_slice(&self.body)
    }
567
    /// Parses the response body as HTML.
    ///
    /// This method is kept for lower-level DOM access and interop. For most
    /// spider code, prefer [`Response::css`] and the builtin selector API.
    ///
    /// Returns a [`scraper::Html`] document that can be queried directly.
    /// Parsing is memoized per thread; the cached document is cloned so the
    /// caller receives an owned copy.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let html = response.to_html()?;
    /// # Ok::<(), std::str::Utf8Error>(())
    /// ```
    pub fn to_html(&self) -> Result<Html, Utf8Error> {
        // Clone out of the per-thread cache so the caller owns the document.
        Ok((*self.cached_html()?).clone())
    }
601
    /// Lazily parses the response body as HTML.
    ///
    /// Returns a closure that can be called when lower-level HTML access is
    /// actually needed. Most spiders should prefer [`Response::css`].
    ///
    /// # Errors
    ///
    /// The outer `Result` currently never fails: the body is not validated
    /// up front, so a [`Utf8Error`] for a non-UTF-8 body surfaces from the
    /// returned closure when it is invoked.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let html_fn = response.lazy_html()?;
    /// // Parse HTML only when needed
    /// let html = html_fn()?;
    /// # Ok::<(), std::str::Utf8Error>(())
    /// ```
    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
        // No eager work: parsing (and UTF-8 validation) happens inside the
        // closure via `to_html`, which memoizes per thread.
        Ok(move || self.to_html())
    }
635
    /// Applies a builtin CSS selector to the response body using a Scrapy-like API.
    ///
    /// Supports standard CSS selectors plus terminal extraction suffixes:
    /// - `::text`
    /// - `::attr(name)`
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"<html><body><h1>Hello</h1><a href="/next">Next</a></body></html>"#),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let heading = response.css("h1::text")?.get().unwrap_or_default();
    /// let next_href = response.css("a::attr(href)")?.get();
    /// # Ok::<(), spider_util::error::SpiderError>(())
    /// ```
    ///
    /// # Errors
    ///
    /// Returns [`SpiderError::Utf8Error`] when the body is not valid UTF-8 and
    /// [`SpiderError::HtmlParseError`] when the selector is invalid.
    pub fn css(&self, query: &str) -> Result<SelectorList, SpiderError> {
        // The parsed document is shared via the per-thread HTML cache.
        let document = self.cached_html()?;
        SelectorList::from_document_query(document, query)
    }
672
    /// Returns the response body as UTF-8 text.
    ///
    /// The body is borrowed, not copied.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] if the body is not valid UTF-8.
    pub fn text(&self) -> Result<&str, Utf8Error> {
        from_utf8(&self.body)
    }
677
    /// Extracts structured page metadata from HTML responses.
    ///
    /// Collects the `<title>` text, `<meta name="description">`, Open Graph
    /// `og:*` properties, the canonical URL, and alternate RSS/Atom feed
    /// links. First occurrences win for the description and for each Open
    /// Graph property; feed URLs are deduplicated.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
    pub fn page_metadata(&self) -> Result<PageMetadata, Utf8Error> {
        let html = self.cached_html()?;
        let mut metadata = PageMetadata::default();

        // <title>: first element only; text concatenated and trimmed, with
        // empty titles treated as absent.
        if let Some(selector) = get_cached_selector("title") {
            metadata.title = html
                .select(&selector)
                .next()
                .map(|node| node.text().collect::<String>().trim().to_string())
                .filter(|value| !value.is_empty());
        }

        // Scan <meta> tags carrying name/property/content; tags without a
        // non-empty `content` attribute are skipped entirely.
        if let Some(selector) = get_cached_selector("meta[name], meta[property], meta[content]") {
            for element in html.select(&selector) {
                let Some(content) = element.value().attr("content") else {
                    continue;
                };
                let content = content.trim();
                if content.is_empty() {
                    continue;
                }

                // <meta name="description">: keep only the first one found.
                if let Some(name) = element.value().attr("name")
                    && name.eq_ignore_ascii_case("description")
                    && metadata.description.is_none()
                {
                    metadata.description = Some(content.to_string());
                }

                // <meta property="og:...">: first value per property wins.
                if let Some(property) = element.value().attr("property")
                    && property.len() >= 3
                    && property[..3].eq_ignore_ascii_case("og:")
                {
                    metadata
                        .open_graph
                        .entry(property.to_string())
                        .or_insert_with(|| content.to_string());
                }
            }
        }

        // <link href=...>: canonical URL and alternate feed discovery.
        if let Some(selector) = get_cached_selector("link[href]") {
            for element in html.select(&selector) {
                let Some(href) = element.value().attr("href") else {
                    continue;
                };
                let rel = element.value().attr("rel").unwrap_or_default();

                // `rel` is a space-separated token list; the first canonical
                // link that resolves against the response URL wins.
                if rel
                    .split_ascii_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("canonical"))
                    && metadata.canonical_url.is_none()
                    && let Ok(url) = self.url.join(href)
                {
                    metadata.canonical_url = Some(url);
                }

                let is_alternate = rel
                    .split_ascii_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("alternate"));
                let ty = element.value().attr("type").unwrap_or_default();
                let is_feed = ty.eq_ignore_ascii_case("application/rss+xml")
                    || ty.eq_ignore_ascii_case("application/atom+xml")
                    || ty.eq_ignore_ascii_case("application/xml")
                    || ty.eq_ignore_ascii_case("text/xml");

                // rel="alternate" with a feed MIME type marks a feed URL;
                // duplicates are skipped.
                if is_alternate
                    && is_feed
                    && let Ok(url) = self.url.join(href)
                    && !metadata.feed_urls.contains(&url)
                {
                    metadata.feed_urls.push(url);
                }
            }
        }

        Ok(metadata)
    }
757
    /// Returns a customizable iterator of links discovered in the response body.
    ///
    /// Unlike [`Response::links`], this method does not deduplicate results.
    /// Callers that need uniqueness can collect into a set or use [`Response::links`].
    ///
    /// Extraction runs eagerly when this method is called; the returned
    /// iterator walks an already-collected list. If the body is not valid
    /// UTF-8, the iterator is empty rather than an error.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::{LinkExtractOptions, Response};
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let links: Vec<_> = response
    ///     .links_iter(LinkExtractOptions::default())
    ///     .collect();
    /// assert!(!links.is_empty());
    /// ```
    pub fn links_iter(&self, options: LinkExtractOptions) -> impl Iterator<Item = Link> {
        // Parse failures (non-UTF-8 bodies) deliberately degrade to an
        // empty iterator instead of propagating an error.
        self.parse_links(options).unwrap_or_default().into_iter()
    }
787
788 /// Extracts all unique, same-site links from the response body.
789 ///
790 /// This method discovers links from:
791 /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
792 /// - URLs found in text content (using link detection)
793 ///
794 /// Only links pointing to the same site (same registered domain) are included.
795 ///
796 /// ## Returns
797 ///
798 /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
799 ///
800 /// ## Example
801 ///
802 /// ```rust,ignore
803 /// # use spider_util::response::Response;
804 /// # use reqwest::StatusCode;
805 /// # use bytes::Bytes;
806 /// # use url::Url;
807 /// # let response = Response {
808 /// # url: Url::parse("https://example.com").unwrap(),
809 /// # status: StatusCode::OK,
810 /// # headers: http::header::HeaderMap::new(),
811 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
812 /// # request_url: Url::parse("https://example.com").unwrap(),
813 /// # meta: None,
814 /// # cached: false,
815 /// # };
816 /// let links = response.links();
817 /// for link in links.iter() {
818 /// println!("Found {:?} link: {}", link.link_type, link.url);
819 /// }
820 /// ```
821 pub fn links(&self) -> DashSet<Link> {
822 let links = DashSet::new();
823
824 for link in self.links_iter(LinkExtractOptions::default()) {
825 links.insert(link);
826 }
827
828 links
829 }
830
831 fn parse_links(&self, options: LinkExtractOptions) -> Result<Vec<Link>, Utf8Error> {
832 let html = self.cached_html()?;
833 let mut links = Vec::new();
834
835 self.collect_attribute_links(&html, &options, &mut links);
836
837 if options.include_text_links {
838 self.collect_text_links(&html, &options, &mut links);
839 }
840
841 Ok(links)
842 }
843
    /// Collects links found via the configured selector/attribute sources.
    ///
    /// Applies the `allowed_attributes` and `allowed_tags` filters, infers a
    /// link type per element unless the source pins one, and delegates URL
    /// resolution plus the remaining filters to `build_link`.
    fn collect_attribute_links(
        &self,
        html: &Html,
        options: &LinkExtractOptions,
        links: &mut Vec<Link>,
    ) {
        for source in &options.sources {
            // Skip the whole source when an attribute allow-list exists and
            // does not mention this source's attribute.
            if !options
                .allowed_attributes
                .as_ref()
                .is_none_or(|allowed| allowed.iter().any(|attr| attr == &source.attribute))
            {
                continue;
            }

            // Sources whose selector is unavailable from the cache (e.g. it
            // failed to compile) are skipped silently.
            let Some(selector) = get_cached_selector(&source.selector) else {
                continue;
            };

            for element in html.select(&selector) {
                let tag_name = element.value().name();
                // Skip elements excluded by the tag allow-list, if one is set.
                if !options
                    .allowed_tags
                    .as_ref()
                    .is_none_or(|allowed| allowed.iter().any(|tag| tag == tag_name))
                {
                    continue;
                }

                let Some(attr_value) = element.value().attr(&source.attribute) else {
                    continue;
                };

                // A source-level link type overrides element-based inference.
                let link_type = source
                    .link_type
                    .clone()
                    .unwrap_or_else(|| infer_link_type(&element));

                if let Some(link) = self.build_link(attr_value, link_type, options) {
                    links.push(link);
                }
            }
        }
    }
888
    /// Collects URLs that appear in the document's plain text nodes.
    ///
    /// Runs `linkify`'s finder over every text node; only URL matches are
    /// kept (other kinds, e.g. e-mail addresses, are skipped), and each hit
    /// is classified as a page link before the standard filters run.
    fn collect_text_links(&self, html: &Html, options: &LinkExtractOptions, links: &mut Vec<Link>) {
        let finder = LinkFinder::new();

        for text_node in html.tree.values().filter_map(|node| node.as_text()) {
            for link in finder.links(text_node) {
                if link.kind() != &LinkKind::Url {
                    continue;
                }

                // Text links are always treated as page links.
                if let Some(link) = self.build_link(link.as_str(), LinkType::Page, options) {
                    links.push(link);
                }
            }
        }
    }
904
    /// Resolves a raw URL against the response URL and applies every
    /// configured filter, returning the finished [`Link`] when it survives.
    ///
    /// Filter order: same-site check, link-type allow/deny, glob pattern
    /// allow/deny, domain allow/deny, path-prefix allow/deny. Empty
    /// allow-lists impose no restriction. Returns `None` when the URL cannot
    /// be resolved or any filter rejects it.
    fn build_link(
        &self,
        raw_url: &str,
        link_type: LinkType,
        options: &LinkExtractOptions,
    ) -> Option<Link> {
        // Relative URLs are resolved against the final response URL;
        // unparseable candidates are dropped.
        let url = self.url.join(raw_url).ok()?;

        if options.same_site_only && !util::is_same_site(&url, &self.url) {
            return None;
        }

        // Reject when a type allow-list exists and omits this type.
        if !options
            .allowed_link_types
            .as_ref()
            .is_none_or(|allowed| allowed.contains(&link_type))
        {
            return None;
        }

        if options.denied_link_types.contains(&link_type) {
            return None;
        }

        // Glob patterns match against the full absolute URL string.
        let absolute_url = url.as_str();
        if !options.allow_patterns.is_empty()
            && !options
                .allow_patterns
                .iter()
                .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        if options
            .deny_patterns
            .iter()
            .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        // Domain filters accept an exact host or any subdomain of it.
        let host = url.host_str().unwrap_or_default();
        if !options.allow_domains.is_empty()
            && !options
                .allow_domains
                .iter()
                .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        if options
            .deny_domains
            .iter()
            .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        // Path-prefix filters operate on the URL path component only.
        let path = url.path();
        if !options.allow_path_prefixes.is_empty()
            && !options
                .allow_path_prefixes
                .iter()
                .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        if options
            .deny_path_prefixes
            .iter()
            .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        Some(Link { url, link_type })
    }
985
    /// Computes the per-thread HTML cache key for this response.
    ///
    /// The fingerprint hashes the final URL, the original request URL, and
    /// the full body with `SeaHasher`, so two responses share a cached DOM
    /// only when all three agree. Note: the body is rehashed on every call.
    fn html_cache_key(&self) -> u64 {
        let mut hasher = SeaHasher::new();
        self.url.as_str().hash(&mut hasher);
        self.request_url.as_str().hash(&mut hasher);
        self.body.hash(&mut hasher);
        hasher.finish()
    }
993
994 fn cached_html(&self) -> Result<Arc<Html>, Utf8Error> {
995 let cache_key = self.html_cache_key();
996
997 HTML_CACHE.with(|cache| {
998 if let Some(html) = cache.borrow().get(&cache_key).cloned() {
999 return Ok(html);
1000 }
1001
1002 let body_str = from_utf8(&self.body)?;
1003 let html = Arc::new(Html::parse_document(body_str));
1004 cache.borrow_mut().insert(cache_key, html.clone());
1005 Ok(html)
1006 })
1007 }
1008}
1009
impl Clone for Response {
    // Field-by-field clone; `meta` is an `Arc`, so the clone shares the same
    // underlying metadata map rather than deep-copying entries.
    // NOTE(review): every field type here implements `Clone`, so this manual
    // impl could be replaced by `#[derive(Clone)]` on the struct.
    fn clone(&self) -> Self {
        Response {
            url: self.url.clone(),
            status: self.status,
            headers: self.headers.clone(),
            body: self.body.clone(),
            request_url: self.request_url.clone(),
            request_priority: self.request_priority,
            meta: self.meta.clone(),
            cached: self.cached,
        }
    }
}
1024
1025fn default_link_sources() -> Vec<LinkSource> {
1026 vec![
1027 LinkSource::new("a[href]", "href"),
1028 LinkSource::new("link[href]", "href"),
1029 LinkSource::new("script[src]", "src"),
1030 LinkSource::new("img[src]", "src"),
1031 LinkSource::new("audio[src]", "src"),
1032 LinkSource::new("video[src]", "src"),
1033 LinkSource::new("source[src]", "src"),
1034 ]
1035}
1036
1037fn infer_link_type(element: &ElementRef<'_>) -> LinkType {
1038 match element.value().name() {
1039 "a" => LinkType::Page,
1040 "link" => {
1041 if let Some(rel) = element.value().attr("rel") {
1042 if rel.eq_ignore_ascii_case("stylesheet") {
1043 LinkType::Stylesheet
1044 } else {
1045 LinkType::Other(rel.to_string())
1046 }
1047 } else {
1048 LinkType::Other("link".to_string())
1049 }
1050 }
1051 "script" => LinkType::Script,
1052 "img" => LinkType::Image,
1053 "audio" | "video" | "source" => LinkType::Media,
1054 _ => LinkType::Other(element.value().name().to_string()),
1055 }
1056}
1057
/// Normalizes a user-supplied domain filter: trims surrounding whitespace,
/// strips any leading dots (`.example.com` → `example.com`), and lowercases
/// the result so comparisons in `domain_matches` are case-insensitive.
fn normalize_domain_filter(domain: impl Into<String>) -> String {
    let raw: String = domain.into();
    let stripped = raw.trim().trim_start_matches('.');
    stripped.to_ascii_lowercase()
}
1065
/// Normalizes a user-supplied path prefix so it always begins with `/`.
///
/// Empty or whitespace-only input (and a bare `/`) collapses to `"/"`;
/// prefixes already rooted at `/` pass through; anything else gets a
/// leading slash prepended.
fn normalize_path_prefix(prefix: impl Into<String>) -> String {
    let owned: String = prefix.into();
    let trimmed = owned.trim();
    match trimmed {
        "" | "/" => String::from("/"),
        rooted if rooted.starts_with('/') => rooted.to_string(),
        relative => format!("/{relative}"),
    }
}
1077
/// Returns `true` when `host` equals `filter` or is a subdomain of it.
///
/// Comparison is ASCII-case-insensitive. The subdomain check requires a `.`
/// immediately before the suffix, so `badexample.com` does NOT match the
/// filter `example.com`. Uses `str::strip_suffix` instead of building a
/// `".{filter}"` string, avoiding a per-call allocation in the link-filter
/// hot path.
fn domain_matches(host: &str, filter: &str) -> bool {
    let host = host.to_ascii_lowercase();
    let filter = filter.to_ascii_lowercase();
    match host.strip_suffix(filter.as_str()) {
        // Exact host match.
        Some("") => true,
        // Suffix match only counts on a label boundary (trailing dot).
        Some(prefix) => prefix.ends_with('.'),
        None => false,
    }
}
1083
/// Matches `input` against a glob-style `pattern` where `*` matches any run
/// of bytes (including none) and `?` matches exactly one byte.
///
/// Implemented as a single-row dynamic program over the input: `reachable[j]`
/// records whether the pattern consumed so far can match the first `j` bytes
/// of the input. Matching operates on raw bytes, so `?` matches one byte
/// rather than one Unicode scalar.
fn glob_matches(pattern: &str, input: &str) -> bool {
    let pattern = pattern.as_bytes();
    let input = input.as_bytes();

    let mut reachable = vec![false; input.len() + 1];
    reachable[0] = true; // Empty pattern matches empty input.

    for &pat_byte in pattern {
        if pat_byte == b'*' {
            // '*' extends any existing match to every longer input prefix;
            // reachable[0] is untouched because '*' also matches nothing.
            for j in 1..=input.len() {
                reachable[j] = reachable[j] || reachable[j - 1];
            }
        } else {
            // A literal byte (or '?') consumes exactly one input byte.
            // Iterate right-to-left so each step reads the previous row.
            for j in (1..=input.len()).rev() {
                reachable[j] = reachable[j - 1] && (pat_byte == b'?' || pat_byte == input[j - 1]);
            }
            reachable[0] = false;
        }
    }

    reachable[input.len()]
}