spider_util/response.rs
1//! Response types and response-side helpers.
2//!
3//! [`Response`] wraps the downloaded body together with the final URL, status,
4//! headers, and request metadata. It also provides convenience methods for
5//! parsing HTML or JSON and for extracting links.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::response::Response;
11//! use reqwest::StatusCode;
12//! use bytes::Bytes;
13//! use url::Url;
14//!
15//! // Create a response (typically done internally by the downloader)
16//! let response = Response {
17//! url: Url::parse("https://example.com").unwrap(),
18//! status: StatusCode::OK,
19//! headers: http::header::HeaderMap::new(),
20//! body: Bytes::from("<html><body>Hello</body></html>"),
21//! request_url: Url::parse("https://example.com").unwrap(),
22//! request_priority: 0,
23//! meta: None,
24//! cached: false,
25//! };
26//!
27//! // Parse as HTML
28//! let html = response.to_html().unwrap();
29//!
30//! // Extract links from the response
31//! let links = response.links();
32//! ```
33//!
34//! In the crawler lifecycle, a [`Response`] is produced by the downloader,
35//! optionally rewritten by middleware, and then handed to
36//! [`Spider::parse`](spider_core::Spider::parse).
37
38use crate::request::Request;
39use crate::selector::get_cached_selector;
40use crate::util;
41use dashmap::{DashMap, DashSet};
42use linkify::{LinkFinder, LinkKind};
43use reqwest::StatusCode;
44use scraper::{ElementRef, Html};
45use seahash::SeaHasher;
46use serde::de::DeserializeOwned;
47use serde::{Deserialize, Serialize};
48use serde_json;
49use std::cell::RefCell;
50use std::collections::HashMap;
51use std::hash::{Hash, Hasher};
52use std::{str::Utf8Error, str::from_utf8, sync::Arc};
53use url::Url;
54
thread_local! {
    // Per-thread cache of parsed HTML documents, keyed by
    // `Response::html_cache_key` (URL + request URL + body hash), so repeated
    // `to_html` calls on the same response avoid re-parsing the body.
    static HTML_CACHE: RefCell<HashMap<u64, Html>> = RefCell::new(HashMap::new());
}

/// Meta key under which the runtime stores the name of the discovery rule
/// that produced a response; read back via [`Response::discovery_rule_name`].
const DISCOVERY_RULE_META_KEY: &str = "__discovery_rule";
60
/// Classification for links discovered in a response.
///
/// The type is inferred from the HTML element a URL was found on (see
/// `infer_link_type`), or fixed per extraction source via
/// [`LinkSource::with_link_type`]. Extraction options can filter on it with
/// allow/deny lists.
///
/// ## Variants
///
/// - `Page`: Links to other web pages (typically `<a>` tags)
/// - `Script`: Links to JavaScript files (`<script>` tags)
/// - `Stylesheet`: Links to CSS stylesheets (`<link rel="stylesheet">`)
/// - `Image`: Links to images (`<img>` tags)
/// - `Media`: Links to audio/video files (`<audio>`, `<video>`, `<source>`)
/// - `Other`: Any other type of resource with a custom identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to another type of resource; the payload holds the element name
    /// or `rel` value that produced it.
    Other(String),
}
86
/// A link discovered while extracting URLs from a response.
///
/// URLs are always absolute: relative hrefs are resolved against the
/// response's final URL during extraction.
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::{Link, LinkType};
/// use url::Url;
///
/// let link = Link {
///     url: Url::parse("https://example.com/page").unwrap(),
///     link_type: LinkType::Page,
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Link {
    /// The URL of the discovered link.
    pub url: Url,
    /// The type of the discovered link.
    pub link_type: LinkType,
}
107
/// One selector/attribute pair used during link extraction.
///
/// This is useful when the default HTML link sources are not enough for the
/// target site and you need to teach the extractor about custom attributes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkSource {
    /// CSS selector used to find candidate elements.
    pub selector: String,
    /// Attribute name that contains the URL. Compared against
    /// [`LinkExtractOptions::allowed_attributes`] when that filter is set.
    pub attribute: String,
    /// Optional fixed link type for matches from this source; when `None`,
    /// the type is inferred from the matched element.
    pub link_type: Option<LinkType>,
}
121
122impl LinkSource {
123 /// Creates a new source definition.
124 pub fn new(selector: impl Into<String>, attribute: impl Into<String>) -> Self {
125 Self {
126 selector: selector.into(),
127 attribute: attribute.into(),
128 link_type: None,
129 }
130 }
131
132 /// Overrides the inferred link type for this source.
133 pub fn with_link_type(mut self, link_type: LinkType) -> Self {
134 self.link_type = Some(link_type);
135 self
136 }
137}
138
/// Options that control link extraction from a [`Response`].
///
/// The defaults are intentionally conservative for crawler use: same-site
/// filtering is enabled, text links are included, and common HTML elements are
/// scanned for navigable URLs.
///
/// Filter semantics: empty allow-lists allow everything; deny-lists always
/// exclude. Patterns are matched against the absolute URL string; domains
/// match exactly or as a dot-delimited suffix of the host.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkExtractOptions {
    /// Restrict discovered links to the same registered domain.
    pub same_site_only: bool,
    /// Include URLs found in text content.
    pub include_text_links: bool,
    /// HTML sources used to discover attribute-based links.
    pub sources: Vec<LinkSource>,
    /// Optional allow-list of link types to include.
    pub allowed_link_types: Option<Vec<LinkType>>,
    /// Optional deny-list of link types to exclude.
    pub denied_link_types: Vec<LinkType>,
    /// Optional allow-list of glob-style URL patterns (`*` and `?` supported).
    pub allow_patterns: Vec<String>,
    /// Optional deny-list of glob-style URL patterns (`*` and `?` supported).
    pub deny_patterns: Vec<String>,
    /// Optional allow-list of domains or registered-domain suffixes.
    pub allow_domains: Vec<String>,
    /// Optional deny-list of domains or registered-domain suffixes.
    pub deny_domains: Vec<String>,
    /// Optional allow-list of URL path prefixes.
    pub allow_path_prefixes: Vec<String>,
    /// Optional deny-list of URL path prefixes.
    pub deny_path_prefixes: Vec<String>,
    /// Optional allow-list of HTML tag names used for attribute extraction
    /// (stored lowercase).
    pub allowed_tags: Option<Vec<String>>,
    /// Optional allow-list of attribute names used for attribute extraction
    /// (stored lowercase).
    pub allowed_attributes: Option<Vec<String>>,
}
173
174impl Default for LinkExtractOptions {
175 fn default() -> Self {
176 Self {
177 same_site_only: true,
178 include_text_links: true,
179 sources: default_link_sources(),
180 allowed_link_types: None,
181 denied_link_types: Vec::new(),
182 allow_patterns: Vec::new(),
183 deny_patterns: Vec::new(),
184 allow_domains: Vec::new(),
185 deny_domains: Vec::new(),
186 allow_path_prefixes: Vec::new(),
187 deny_path_prefixes: Vec::new(),
188 allowed_tags: None,
189 allowed_attributes: None,
190 }
191 }
192}
193
194impl LinkExtractOptions {
195 /// Sets whether only same-site URLs should be returned.
196 pub fn same_site_only(mut self, same_site_only: bool) -> Self {
197 self.same_site_only = same_site_only;
198 self
199 }
200
201 /// Sets whether URLs found in text content should be returned.
202 pub fn include_text_links(mut self, include_text_links: bool) -> Self {
203 self.include_text_links = include_text_links;
204 self
205 }
206
207 /// Replaces the configured HTML extraction sources.
208 pub fn with_sources(mut self, sources: impl IntoIterator<Item = LinkSource>) -> Self {
209 self.sources = sources.into_iter().collect();
210 self
211 }
212
213 /// Adds an HTML extraction source.
214 pub fn add_source(mut self, source: LinkSource) -> Self {
215 self.sources.push(source);
216 self
217 }
218
219 /// Restricts extraction to the provided link types.
220 pub fn with_allowed_link_types(
221 mut self,
222 allowed_link_types: impl IntoIterator<Item = LinkType>,
223 ) -> Self {
224 self.allowed_link_types = Some(allowed_link_types.into_iter().collect());
225 self
226 }
227
228 /// Adds link types that should be excluded even if discovered.
229 pub fn with_denied_link_types(
230 mut self,
231 denied_link_types: impl IntoIterator<Item = LinkType>,
232 ) -> Self {
233 self.denied_link_types = denied_link_types.into_iter().collect();
234 self
235 }
236
237 /// Adds a glob-style allow pattern that URLs must match.
238 pub fn allow_pattern(mut self, pattern: impl Into<String>) -> Self {
239 self.allow_patterns.push(pattern.into());
240 self
241 }
242
243 /// Replaces the glob-style allow patterns.
244 pub fn with_allow_patterns(
245 mut self,
246 patterns: impl IntoIterator<Item = impl Into<String>>,
247 ) -> Self {
248 self.allow_patterns = patterns.into_iter().map(Into::into).collect();
249 self
250 }
251
252 /// Adds a glob-style deny pattern that excludes matching URLs.
253 pub fn deny_pattern(mut self, pattern: impl Into<String>) -> Self {
254 self.deny_patterns.push(pattern.into());
255 self
256 }
257
258 /// Replaces the glob-style deny patterns.
259 pub fn with_deny_patterns(
260 mut self,
261 patterns: impl IntoIterator<Item = impl Into<String>>,
262 ) -> Self {
263 self.deny_patterns = patterns.into_iter().map(Into::into).collect();
264 self
265 }
266
267 /// Adds a domain or registered-domain suffix to allow.
268 pub fn allow_domain(mut self, domain: impl Into<String>) -> Self {
269 self.allow_domains.push(normalize_domain_filter(domain));
270 self
271 }
272
273 /// Replaces the allowed domains.
274 pub fn with_allow_domains(
275 mut self,
276 domains: impl IntoIterator<Item = impl Into<String>>,
277 ) -> Self {
278 self.allow_domains = domains.into_iter().map(normalize_domain_filter).collect();
279 self
280 }
281
282 /// Adds a domain or registered-domain suffix to deny.
283 pub fn deny_domain(mut self, domain: impl Into<String>) -> Self {
284 self.deny_domains.push(normalize_domain_filter(domain));
285 self
286 }
287
288 /// Replaces the denied domains.
289 pub fn with_deny_domains(
290 mut self,
291 domains: impl IntoIterator<Item = impl Into<String>>,
292 ) -> Self {
293 self.deny_domains = domains.into_iter().map(normalize_domain_filter).collect();
294 self
295 }
296
297 /// Adds a URL path prefix that links must match.
298 pub fn allow_path_prefix(mut self, prefix: impl Into<String>) -> Self {
299 self.allow_path_prefixes.push(normalize_path_prefix(prefix));
300 self
301 }
302
303 /// Replaces the allowed URL path prefixes.
304 pub fn with_allow_path_prefixes(
305 mut self,
306 prefixes: impl IntoIterator<Item = impl Into<String>>,
307 ) -> Self {
308 self.allow_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
309 self
310 }
311
312 /// Adds a URL path prefix that should be excluded.
313 pub fn deny_path_prefix(mut self, prefix: impl Into<String>) -> Self {
314 self.deny_path_prefixes.push(normalize_path_prefix(prefix));
315 self
316 }
317
318 /// Replaces the denied URL path prefixes.
319 pub fn with_deny_path_prefixes(
320 mut self,
321 prefixes: impl IntoIterator<Item = impl Into<String>>,
322 ) -> Self {
323 self.deny_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
324 self
325 }
326
327 /// Restricts attribute-based extraction to specific HTML tag names.
328 pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
329 self.allowed_tags = Some(
330 tags.into_iter()
331 .map(Into::into)
332 .map(|tag: String| tag.to_ascii_lowercase())
333 .collect(),
334 );
335 self
336 }
337
338 /// Restricts attribute-based extraction to specific attribute names.
339 pub fn with_allowed_attributes(
340 mut self,
341 attributes: impl IntoIterator<Item = impl Into<String>>,
342 ) -> Self {
343 self.allowed_attributes = Some(
344 attributes
345 .into_iter()
346 .map(Into::into)
347 .map(|attr: String| attr.to_ascii_lowercase())
348 .collect(),
349 );
350 self
351 }
352}
353
/// Structured page metadata extracted from an HTML response.
///
/// Produced by [`Response::page_metadata`]; all fields are best-effort and
/// default to empty when the corresponding tags are missing.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageMetadata {
    /// Contents of the `<title>` element (trimmed; `None` when empty).
    pub title: Option<String>,
    /// Contents of `<meta name="description">` (first non-empty occurrence).
    pub description: Option<String>,
    /// Canonical URL from `<link rel="canonical">`, resolved against the
    /// response URL.
    pub canonical_url: Option<Url>,
    /// Open Graph metadata such as `og:title` or `og:image`, keyed by the
    /// full property name.
    pub open_graph: HashMap<String, String>,
    /// Feed URLs discovered from alternate RSS/Atom link tags (deduplicated).
    pub feed_urls: Vec<Url>,
}
368
369impl PageMetadata {
370 /// Returns `true` when no metadata fields were extracted.
371 pub fn is_empty(&self) -> bool {
372 self.title.is_none()
373 && self.description.is_none()
374 && self.canonical_url.is_none()
375 && self.open_graph.is_empty()
376 && self.feed_urls.is_empty()
377 }
378}
379
/// Represents an HTTP response received from a server.
///
/// [`Response`] contains all information about an HTTP response, including
/// the final URL (after redirects), status code, headers, body content,
/// and metadata carried over from the original request.
///
/// The type is designed for parse-time ergonomics:
/// - [`Response::to_html`] parses the body as HTML
/// - [`Response::json`] deserializes JSON payloads
/// - [`Response::links`] and related helpers extract follow-up links
/// - [`Response::request_from_response`] reconstructs the originating request context
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::Response;
/// use reqwest::StatusCode;
/// use bytes::Bytes;
/// use url::Url;
///
/// let response = Response {
///     url: Url::parse("https://example.com").unwrap(),
///     status: StatusCode::OK,
///     headers: http::header::HeaderMap::new(),
///     body: Bytes::from("<html><body>Hello</body></html>"),
///     request_url: Url::parse("https://example.com").unwrap(),
///     request_priority: 0,
///     meta: None,
///     cached: false,
/// };
///
/// // Parse the response body as HTML
/// if let Ok(html) = response.to_html() {
///     // Process HTML...
/// }
/// ```
#[derive(Debug)]
pub struct Response {
    /// The final URL of the response after any redirects.
    pub url: Url,
    /// The HTTP status code of the response.
    pub status: StatusCode,
    /// The headers of the response.
    pub headers: http::header::HeaderMap,
    /// The body of the response.
    pub body: bytes::Bytes,
    /// The original URL of the request that led to this response.
    pub request_url: Url,
    /// The scheduling priority of the original request.
    pub request_priority: i32,
    /// Metadata associated with the response, carried over from the request.
    /// Uses Option to allow lazy initialization; see [`Response::insert_meta`].
    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
    /// Indicates if the response was served from a cache.
    pub cached: bool,
}
435
436impl Response {
437 /// Creates a new response with an empty HTML cache.
438 ///
439 /// Most application code receives responses from the runtime rather than
440 /// constructing them directly. This constructor is mainly useful for custom
441 /// downloaders and lower-level integrations.
442 pub fn new(
443 url: Url,
444 status: StatusCode,
445 headers: http::header::HeaderMap,
446 body: bytes::Bytes,
447 request_url: Url,
448 ) -> Self {
449 Self {
450 url,
451 status,
452 headers,
453 body,
454 request_url,
455 request_priority: 0,
456 meta: None,
457 cached: false,
458 }
459 }
460
461 /// Reconstructs the original [`Request`] that led to this response.
462 ///
463 /// This method creates a new [`Request`] with the same URL and metadata
464 /// as the request that produced this response. Useful for retry scenarios
465 /// or when you need to re-request the same resource.
466 ///
467 /// ## Example
468 ///
469 /// ```rust,ignore
470 /// # use spider_util::response::Response;
471 /// # use reqwest::StatusCode;
472 /// # use bytes::Bytes;
473 /// # use url::Url;
474 /// # let response = Response {
475 /// # url: Url::parse("https://example.com").unwrap(),
476 /// # status: StatusCode::OK,
477 /// # headers: http::header::HeaderMap::new(),
478 /// # body: Bytes::from("hello"),
479 /// # request_url: Url::parse("https://example.com").unwrap(),
480 /// # request_priority: 0,
481 /// # meta: None,
482 /// # cached: false,
483 /// # };
484 /// let original_request = response.request_from_response();
485 /// ```
486 pub fn request_from_response(&self) -> Request {
487 let mut request =
488 Request::new(self.request_url.clone()).with_priority(self.request_priority);
489 request.set_meta_from_option(self.meta.clone());
490 request
491 }
492
493 /// Returns a cloned metadata value by key.
494 pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
495 self.meta
496 .as_ref()
497 .and_then(|m| m.get(key).map(|entry| entry.value().clone()))
498 }
499
500 /// Deserializes a metadata value into the requested type.
501 pub fn meta_value<T>(&self, key: &str) -> Result<Option<T>, serde_json::Error>
502 where
503 T: DeserializeOwned,
504 {
505 self.get_meta(key).map(serde_json::from_value).transpose()
506 }
507
508 /// Returns the runtime discovery rule name attached to this response, if any.
509 pub fn discovery_rule_name(&self) -> Option<String> {
510 self.get_meta(DISCOVERY_RULE_META_KEY)
511 .and_then(|value| value.as_str().map(ToOwned::to_owned))
512 }
513
514 /// Returns `true` when the response was reached through the named discovery rule.
515 pub fn matches_discovery_rule(&self, rule_name: &str) -> bool {
516 self.discovery_rule_name().as_deref() == Some(rule_name)
517 }
518
519 /// Inserts a metadata value, lazily allocating the map if needed.
520 pub fn insert_meta(&mut self, key: impl Into<String>, value: serde_json::Value) {
521 self.meta
522 .get_or_insert_with(|| Arc::new(DashMap::new()))
523 .insert(key.into(), value);
524 }
525
526 /// Returns a clone of the internal metadata map, if present.
527 pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
528 self.meta.clone()
529 }
530
    /// Deserializes the response body as JSON.
    ///
    /// The body bytes are parsed as-is; the `Content-Type` header is not
    /// consulted.
    ///
    /// # Type Parameters
    ///
    /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
    ///
    /// # Errors
    ///
    /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
    /// or if it cannot be deserialized into type `T`.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # use serde::Deserialize;
    /// # #[derive(Deserialize)]
    /// # struct Data { value: String }
    /// # let response = Response {
    /// #     url: Url::parse("https://api.example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"{"value": "test"}"#),
    /// #     request_url: Url::parse("https://api.example.com").unwrap(),
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let data: Data = response.json()?;
    /// # Ok::<(), serde_json::Error>(())
    /// ```
    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
        serde_json::from_slice(&self.body)
    }
567
568 /// Parses the response body as HTML.
569 ///
570 /// Returns a [`scraper::Html`] document that can be queried using CSS selectors.
571 ///
572 /// # Errors
573 ///
574 /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
575 ///
576 /// ## Example
577 ///
578 /// ```rust,ignore
579 /// # use spider_util::response::Response;
580 /// # use reqwest::StatusCode;
581 /// # use bytes::Bytes;
582 /// # use url::Url;
583 /// # let response = Response {
584 /// # url: Url::parse("https://example.com").unwrap(),
585 /// # status: StatusCode::OK,
586 /// # headers: http::header::HeaderMap::new(),
587 /// # body: Bytes::from("<html><body>Hello</body></html>"),
588 /// # request_url: Url::parse("https://example.com").unwrap(),
589 /// # meta: None,
590 /// # cached: false,
591 /// # };
592 /// let html = response.to_html()?;
593 /// # Ok::<(), std::str::Utf8Error>(())
594 /// ```
595 pub fn to_html(&self) -> Result<Html, Utf8Error> {
596 let cache_key = self.html_cache_key();
597
598 HTML_CACHE.with(|cache| {
599 if let Some(html) = cache.borrow().get(&cache_key).cloned() {
600 return Ok(html);
601 }
602
603 let body_str = from_utf8(&self.body)?;
604 let html = Html::parse_document(body_str);
605 cache.borrow_mut().insert(cache_key, html.clone());
606 Ok(html)
607 })
608 }
609
    /// Lazily parses the response body as HTML.
    ///
    /// Returns a closure that can be called when the HTML is actually needed.
    /// This avoids parsing HTML for responses where it may not be used.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] if the response body is not valid UTF-8.
    ///
    /// NOTE(review): the outer `Result` never fails here — the UTF-8 check
    /// only runs inside the returned closure. It is kept for signature
    /// compatibility with existing callers.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from("<html><body>Hello</body></html>"),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let html_fn = response.lazy_html()?;
    /// // Parse HTML only when needed
    /// let html = html_fn()?;
    /// # Ok::<(), std::str::Utf8Error>(())
    /// ```
    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
        Ok(move || self.to_html())
    }
643
644 /// Returns the response body as UTF-8 text.
645 pub fn text(&self) -> Result<&str, Utf8Error> {
646 from_utf8(&self.body)
647 }
648
649 /// Extracts structured page metadata from HTML responses.
650 pub fn page_metadata(&self) -> Result<PageMetadata, Utf8Error> {
651 let html = self.to_html()?;
652 let mut metadata = PageMetadata::default();
653
654 if let Some(selector) = get_cached_selector("title") {
655 metadata.title = html
656 .select(&selector)
657 .next()
658 .map(|node| node.text().collect::<String>().trim().to_string())
659 .filter(|value| !value.is_empty());
660 }
661
662 if let Some(selector) = get_cached_selector("meta[name], meta[property], meta[content]") {
663 for element in html.select(&selector) {
664 let Some(content) = element.value().attr("content") else {
665 continue;
666 };
667 let content = content.trim();
668 if content.is_empty() {
669 continue;
670 }
671
672 if let Some(name) = element.value().attr("name")
673 && name.eq_ignore_ascii_case("description")
674 && metadata.description.is_none()
675 {
676 metadata.description = Some(content.to_string());
677 }
678
679 if let Some(property) = element.value().attr("property")
680 && property.len() >= 3
681 && property[..3].eq_ignore_ascii_case("og:")
682 {
683 metadata
684 .open_graph
685 .entry(property.to_string())
686 .or_insert_with(|| content.to_string());
687 }
688 }
689 }
690
691 if let Some(selector) = get_cached_selector("link[href]") {
692 for element in html.select(&selector) {
693 let Some(href) = element.value().attr("href") else {
694 continue;
695 };
696 let rel = element.value().attr("rel").unwrap_or_default();
697
698 if rel
699 .split_ascii_whitespace()
700 .any(|token| token.eq_ignore_ascii_case("canonical"))
701 && metadata.canonical_url.is_none()
702 && let Ok(url) = self.url.join(href)
703 {
704 metadata.canonical_url = Some(url);
705 }
706
707 let is_alternate = rel
708 .split_ascii_whitespace()
709 .any(|token| token.eq_ignore_ascii_case("alternate"));
710 let ty = element.value().attr("type").unwrap_or_default();
711 let is_feed = ty.eq_ignore_ascii_case("application/rss+xml")
712 || ty.eq_ignore_ascii_case("application/atom+xml")
713 || ty.eq_ignore_ascii_case("application/xml")
714 || ty.eq_ignore_ascii_case("text/xml");
715
716 if is_alternate
717 && is_feed
718 && let Ok(url) = self.url.join(href)
719 && !metadata.feed_urls.contains(&url)
720 {
721 metadata.feed_urls.push(url);
722 }
723 }
724 }
725
726 Ok(metadata)
727 }
728
    /// Returns a customizable iterator of links discovered in the response body.
    ///
    /// Unlike [`Response::links`], this method does not deduplicate results.
    /// Callers that need uniqueness can collect into a set or use [`Response::links`].
    ///
    /// If the body is not valid UTF-8 the error is swallowed and the iterator
    /// is empty.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::{LinkExtractOptions, Response};
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # let response = Response {
    /// #     url: Url::parse("https://example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
    /// #     request_url: Url::parse("https://example.com").unwrap(),
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let links: Vec<_> = response
    ///     .links_iter(LinkExtractOptions::default())
    ///     .collect();
    /// assert!(!links.is_empty());
    /// ```
    pub fn links_iter(&self, options: LinkExtractOptions) -> impl Iterator<Item = Link> {
        self.parse_links(options).unwrap_or_default().into_iter()
    }
758
759 /// Extracts all unique, same-site links from the response body.
760 ///
761 /// This method discovers links from:
762 /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
763 /// - URLs found in text content (using link detection)
764 ///
765 /// Only links pointing to the same site (same registered domain) are included.
766 ///
767 /// ## Returns
768 ///
769 /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
770 ///
771 /// ## Example
772 ///
773 /// ```rust,ignore
774 /// # use spider_util::response::Response;
775 /// # use reqwest::StatusCode;
776 /// # use bytes::Bytes;
777 /// # use url::Url;
778 /// # let response = Response {
779 /// # url: Url::parse("https://example.com").unwrap(),
780 /// # status: StatusCode::OK,
781 /// # headers: http::header::HeaderMap::new(),
782 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
783 /// # request_url: Url::parse("https://example.com").unwrap(),
784 /// # meta: None,
785 /// # cached: false,
786 /// # };
787 /// let links = response.links();
788 /// for link in links.iter() {
789 /// println!("Found {:?} link: {}", link.link_type, link.url);
790 /// }
791 /// ```
792 pub fn links(&self) -> DashSet<Link> {
793 let links = DashSet::new();
794
795 for link in self.links_iter(LinkExtractOptions::default()) {
796 links.insert(link);
797 }
798
799 links
800 }
801
802 fn parse_links(&self, options: LinkExtractOptions) -> Result<Vec<Link>, Utf8Error> {
803 let html_fn = self.lazy_html()?;
804 let html = html_fn()?;
805 let mut links = Vec::new();
806
807 self.collect_attribute_links(&html, &options, &mut links);
808
809 if options.include_text_links {
810 self.collect_text_links(&html, &options, &mut links);
811 }
812
813 Ok(links)
814 }
815
816 fn collect_attribute_links(
817 &self,
818 html: &Html,
819 options: &LinkExtractOptions,
820 links: &mut Vec<Link>,
821 ) {
822 for source in &options.sources {
823 if !options
824 .allowed_attributes
825 .as_ref()
826 .is_none_or(|allowed| allowed.iter().any(|attr| attr == &source.attribute))
827 {
828 continue;
829 }
830
831 let Some(selector) = get_cached_selector(&source.selector) else {
832 continue;
833 };
834
835 for element in html.select(&selector) {
836 let tag_name = element.value().name();
837 if !options
838 .allowed_tags
839 .as_ref()
840 .is_none_or(|allowed| allowed.iter().any(|tag| tag == tag_name))
841 {
842 continue;
843 }
844
845 let Some(attr_value) = element.value().attr(&source.attribute) else {
846 continue;
847 };
848
849 let link_type = source
850 .link_type
851 .clone()
852 .unwrap_or_else(|| infer_link_type(&element));
853
854 if let Some(link) = self.build_link(attr_value, link_type, options) {
855 links.push(link);
856 }
857 }
858 }
859 }
860
861 fn collect_text_links(&self, html: &Html, options: &LinkExtractOptions, links: &mut Vec<Link>) {
862 let finder = LinkFinder::new();
863
864 for text_node in html.tree.values().filter_map(|node| node.as_text()) {
865 for link in finder.links(text_node) {
866 if link.kind() != &LinkKind::Url {
867 continue;
868 }
869
870 if let Some(link) = self.build_link(link.as_str(), LinkType::Page, options) {
871 links.push(link);
872 }
873 }
874 }
875 }
876
    /// Resolves `raw_url` against the response URL and runs the configured
    /// filter chain, returning the accepted [`Link`] or `None`.
    ///
    /// Filter order: same-site check, link-type allow/deny lists, glob
    /// patterns on the absolute URL string, domain allow/deny lists, then
    /// path-prefix allow/deny lists. Empty allow-lists accept everything.
    /// Unparseable URLs are silently dropped.
    fn build_link(
        &self,
        raw_url: &str,
        link_type: LinkType,
        options: &LinkExtractOptions,
    ) -> Option<Link> {
        // Relative URLs become absolute here; join failures drop the link.
        let url = self.url.join(raw_url).ok()?;

        if options.same_site_only && !util::is_same_site(&url, &self.url) {
            return None;
        }

        // `None` allow-list means "allow every link type".
        if !options
            .allowed_link_types
            .as_ref()
            .is_none_or(|allowed| allowed.contains(&link_type))
        {
            return None;
        }

        if options.denied_link_types.contains(&link_type) {
            return None;
        }

        // Glob patterns apply to the full absolute URL string.
        let absolute_url = url.as_str();
        if !options.allow_patterns.is_empty()
            && !options
                .allow_patterns
                .iter()
                .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        if options
            .deny_patterns
            .iter()
            .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        // URLs without a host are filtered with an empty host string.
        let host = url.host_str().unwrap_or_default();
        if !options.allow_domains.is_empty()
            && !options
                .allow_domains
                .iter()
                .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        if options
            .deny_domains
            .iter()
            .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        let path = url.path();
        if !options.allow_path_prefixes.is_empty()
            && !options
                .allow_path_prefixes
                .iter()
                .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        if options
            .deny_path_prefixes
            .iter()
            .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        Some(Link { url, link_type })
    }
957
958 fn html_cache_key(&self) -> u64 {
959 let mut hasher = SeaHasher::new();
960 self.url.as_str().hash(&mut hasher);
961 self.request_url.as_str().hash(&mut hasher);
962 self.body.hash(&mut hasher);
963 hasher.finish()
964 }
965}
966
/// Field-by-field clone of the response.
///
/// NOTE(review): every field implements `Clone`, so `#[derive(Clone)]` on
/// `Response` would be equivalent to this manual impl — consider replacing it.
impl Clone for Response {
    fn clone(&self) -> Self {
        Response {
            url: self.url.clone(),
            status: self.status,
            headers: self.headers.clone(),
            body: self.body.clone(),
            request_url: self.request_url.clone(),
            request_priority: self.request_priority,
            meta: self.meta.clone(),
            cached: self.cached,
        }
    }
}
981
982fn default_link_sources() -> Vec<LinkSource> {
983 vec![
984 LinkSource::new("a[href]", "href"),
985 LinkSource::new("link[href]", "href"),
986 LinkSource::new("script[src]", "src"),
987 LinkSource::new("img[src]", "src"),
988 LinkSource::new("audio[src]", "src"),
989 LinkSource::new("video[src]", "src"),
990 LinkSource::new("source[src]", "src"),
991 ]
992}
993
994fn infer_link_type(element: &ElementRef<'_>) -> LinkType {
995 match element.value().name() {
996 "a" => LinkType::Page,
997 "link" => {
998 if let Some(rel) = element.value().attr("rel") {
999 if rel.eq_ignore_ascii_case("stylesheet") {
1000 LinkType::Stylesheet
1001 } else {
1002 LinkType::Other(rel.to_string())
1003 }
1004 } else {
1005 LinkType::Other("link".to_string())
1006 }
1007 }
1008 "script" => LinkType::Script,
1009 "img" => LinkType::Image,
1010 "audio" | "video" | "source" => LinkType::Media,
1011 _ => LinkType::Other(element.value().name().to_string()),
1012 }
1013}
1014
/// Normalizes a user-supplied domain filter: trims whitespace, strips any
/// leading dots, and lowercases for case-insensitive matching.
fn normalize_domain_filter(domain: impl Into<String>) -> String {
    let raw: String = domain.into();
    let stripped = raw.trim().trim_start_matches('.');
    stripped.to_ascii_lowercase()
}
1022
/// Normalizes a user-supplied path prefix: trims whitespace, maps empty
/// input to `"/"`, and guarantees a leading slash.
fn normalize_path_prefix(prefix: impl Into<String>) -> String {
    let owned = prefix.into();
    let trimmed = owned.trim();

    match trimmed {
        "" | "/" => String::from("/"),
        rooted if rooted.starts_with('/') => rooted.to_string(),
        bare => format!("/{bare}"),
    }
}
1034
/// Returns `true` when `host` equals `filter` or is a subdomain of it
/// (i.e. ends with `".{filter}"`), compared case-insensitively.
///
/// Allocation-free: the previous implementation lowercased both strings and
/// built a `format!(".{filter}")` suffix on every call, which is a hot path
/// during link filtering.
fn domain_matches(host: &str, filter: &str) -> bool {
    if host.eq_ignore_ascii_case(filter) {
        return true;
    }

    // For a suffix match the host must be at least one byte (the dot) longer
    // than the filter.
    let Some(dot_index) = host.len().checked_sub(filter.len() + 1) else {
        return false;
    };

    // The byte before the suffix must be a literal '.', which is ASCII, so
    // `dot_index + 1` is guaranteed to be a char boundary when it matches.
    host.as_bytes()[dot_index] == b'.' && host[dot_index + 1..].eq_ignore_ascii_case(filter)
}
1040
/// Matches `input` against a glob-style `pattern` supporting `*` (any run of
/// bytes, possibly empty) and `?` (exactly one byte).
///
/// Matching is byte-wise, so one multi-byte UTF-8 character counts as several
/// `?` positions. Uses the classic two-pointer algorithm that backtracks to
/// the most recent `*`; no allocation, worst case O(|pattern| · |input|).
fn glob_matches(pattern: &str, input: &str) -> bool {
    let pattern = pattern.as_bytes();
    let input = input.as_bytes();
    let (mut p, mut s) = (0usize, 0usize);
    let mut last_star = None;
    let mut match_after_star = 0usize;

    while s < input.len() {
        // `*` must be handled BEFORE the literal comparison: a `*` in the
        // pattern is always a wildcard, even when the current input byte is
        // also `*`. The previous ordering compared literals first, so a
        // literal `*` in the input consumed the pattern star and left nothing
        // to backtrack to (e.g. pattern "*" failed to match input "*y").
        if p < pattern.len() && pattern[p] == b'*' {
            last_star = Some(p);
            p += 1;
            match_after_star = s;
        } else if p < pattern.len() && (pattern[p] == b'?' || pattern[p] == input[s]) {
            p += 1;
            s += 1;
        } else if let Some(star_idx) = last_star {
            // Backtrack: let the most recent `*` swallow one more input byte.
            p = star_idx + 1;
            match_after_star += 1;
            s = match_after_star;
        } else {
            return false;
        }
    }

    // Trailing `*`s may match the empty remainder.
    while p < pattern.len() && pattern[p] == b'*' {
        p += 1;
    }

    p == pattern.len()
}