Skip to main content

spider_util/
response.rs

//! Data structures and utilities for handling HTTP responses in `spider-lib`.
//!
//! This module defines the `Response` struct, which represents an HTTP response
//! received from a web server. It encapsulates crucial information such as
//! the URL, status code, headers, and body of the response, along with any
//! associated metadata.
//!
//! Additionally, this module provides:
//! - Helper methods for `Response` to facilitate common tasks like parsing
//!   the body as HTML or JSON, and reconstructing the original `Request`.
//! - The `Link` struct and `LinkType` enum for structured representation and
//!   extraction of hyperlinks found within the response content.
13
14use crate::request::Request;
15use crate::selector_cache::get_cached_selector;
16use crate::utils;
17use bytes::Bytes;
18use dashmap::{DashMap, DashSet};
19use linkify::{LinkFinder, LinkKind};
20use reqwest::StatusCode;
21use reqwest::header::HeaderMap;
22use scraper::Html;
23use serde::de::DeserializeOwned;
24use serde_json::{self, Value};
25use std::{borrow::Cow, str::Utf8Error, str::from_utf8};
26use url::Url;
27
/// Classifies what kind of resource a discovered link points to.
///
/// The value is comparable, hashable and cloneable so it can be stored in
/// hash-based collections as part of a [`Link`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to any other kind of resource; the string carries a free-form
    /// label describing it.
    Other(String),
}
44
45/// Represents a link discovered on a web page.
46#[derive(Debug, Clone, PartialEq, Eq, Hash)]
47pub struct Link {
48    /// The URL of the discovered link.
49    pub url: Url,
50    /// The type of the discovered link.
51    pub link_type: LinkType,
52}
53
54/// Represents an HTTP response received from a server.
55#[derive(Debug)]
56pub struct Response {
57    /// The final URL of the response after any redirects.
58    pub url: Url,
59    /// The HTTP status code of the response.
60    pub status: StatusCode,
61    /// The headers of the response.
62    pub headers: HeaderMap,
63    /// The body of the response.
64    pub body: Bytes,
65    /// The original URL of the request that led to this response.
66    pub request_url: Url,
67    /// Metadata associated with the response, carried over from the request.
68    pub meta: DashMap<Cow<'static, str>, Value>,
69    /// Indicates if the response was served from a cache.
70    pub cached: bool,
71}
72
73
74impl Response {
75    /// Reconstructs the original `Request` that led to this response.
76    pub fn request_from_response(&self) -> Request {
77        let mut request = Request::new(self.request_url.clone());
78        request.meta = self.meta.clone();
79        request
80    }
81
82    /// Deserializes the response body as JSON.
83    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
84        serde_json::from_slice(&self.body)
85    }
86
87    /// Parses the response body as HTML.
88    pub fn to_html(&self) -> Result<Html, Utf8Error> {
89        let body_str = from_utf8(&self.body)?;
90        Ok(Html::parse_document(body_str))
91    }
92
93    /// Lazily parses the response body as HTML, returning a closure that can be called when needed.
94    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
95        let body_bytes = &self.body;
96        Ok(move || {
97            let body_str = from_utf8(body_bytes)?;
98            Ok(Html::parse_document(body_str))
99        })
100    }
101
102    /// Extracts all unique, same-site links from the response body.
103    pub fn links(&self) -> DashSet<Link> {
104        let links = DashSet::new();
105
106        if let Ok(html_fn) = self.lazy_html()
107            && let Ok(html) = html_fn()
108        {
109            let selectors = vec![
110                ("a[href]", "href"),
111                ("link[href]", "href"),
112                ("script[src]", "src"),
113                ("img[src]", "src"),
114                ("audio[src]", "src"),
115                ("video[src]", "src"),
116                ("source[src]", "src"),
117            ];
118
119            for (selector_str, attr_name) in selectors {
120                if let Some(selector) = get_cached_selector(selector_str) {
121                    for element in html.select(&selector) {
122                        if let Some(attr_value) = element.value().attr(attr_name)
123                            && let Ok(url) = self.url.join(attr_value)
124                            && utils::is_same_site(&url, &self.url)
125                        {
126                            let link_type = match element.value().name() {
127                                "a" => LinkType::Page,
128                                "link" => {
129                                    if let Some(rel) = element.value().attr("rel") {
130                                        if rel.eq_ignore_ascii_case("stylesheet") {
131                                            LinkType::Stylesheet
132                                        } else {
133                                            LinkType::Other(rel.to_string())
134                                        }
135                                    } else {
136                                        LinkType::Other("link".to_string())
137                                    }
138                                }
139                                "script" => LinkType::Script,
140                                "img" => LinkType::Image,
141                                "audio" | "video" | "source" => LinkType::Media,
142                                _ => LinkType::Other(element.value().name().to_string()),
143                            };
144                            links.insert(Link { url, link_type });
145                        }
146                    }
147                }
148            }
149
150            let finder = LinkFinder::new();
151            for text_node in html.tree.values().filter_map(|node| node.as_text()) {
152                for link in finder.links(text_node) {
153                    if link.kind() == &LinkKind::Url
154                        && let Ok(url) = self.url.join(link.as_str())
155                        && utils::is_same_site(&url, &self.url)
156                    {
157                        links.insert(Link {
158                            url,
159                            link_type: LinkType::Page,
160                        });
161                    }
162                }
163            }
164        }
165
166        links
167    }
168
169}
170
171impl Clone for Response {
172    fn clone(&self) -> Self {
173        Response {
174            url: self.url.clone(),
175            status: self.status,
176            headers: self.headers.clone(),
177            body: self.body.clone(),
178            request_url: self.request_url.clone(),
179            meta: self.meta.clone(),
180            cached: self.cached,
181        }
182    }
183}