// spider_util/response.rs

//! Data structures and utilities for handling HTTP responses in `spider-lib`.
//!
//! This module defines the `Response` struct, which represents an HTTP response
//! received from a web server. It encapsulates crucial information such as
//! the URL, status code, headers, and body of the response, along with any
//! associated metadata.
//!
//! Additionally, this module provides:
//! - Helper methods for `Response` to facilitate common tasks like parsing
//!   the body as HTML or JSON, and reconstructing the original `Request`.
//! - The `Link` struct and the `LinkType` enum for structured representation
//!   and extraction of hyperlinks found within the response content.

14use crate::request::Request;
15use crate::selector_cache::get_cached_selector;
16use crate::utils;
17use bytes::Bytes;
18use dashmap::{DashMap, DashSet};
19use linkify::{LinkFinder, LinkKind};
20use reqwest::StatusCode;
21use reqwest::header::HeaderMap;
22use scraper::Html;
23use serde::de::DeserializeOwned;
24use serde_json::{self, Value};
25use std::{borrow::Cow, str::Utf8Error, str::from_utf8};
26use url::Url;
27
/// Classifies a discovered link by the kind of resource it points to.
///
/// The variant is chosen from the HTML element the link was found on; anything
/// that does not fit a dedicated variant is carried in [`LinkType::Other`]
/// together with a free-form description.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to another type of resource; the string describes which.
    Other(String),
}
44
45/// Represents a link discovered on a web page.
46#[derive(Debug, Clone, PartialEq, Eq, Hash)]
47pub struct Link {
48    /// The URL of the discovered link.
49    pub url: Url,
50    /// The type of the discovered link.
51    pub link_type: LinkType,
52}
53
54/// Represents an HTTP response received from a server.
55#[derive(Debug)]
56pub struct Response {
57    /// The final URL of the response after any redirects.
58    pub url: Url,
59    /// The HTTP status code of the response.
60    pub status: StatusCode,
61    /// The headers of the response.
62    pub headers: HeaderMap,
63    /// The body of the response.
64    pub body: Bytes,
65    /// The original URL of the request that led to this response.
66    pub request_url: Url,
67    /// Metadata associated with the response, carried over from the request.
68    pub meta: DashMap<Cow<'static, str>, Value>,
69    /// Indicates if the response was served from a cache.
70    pub cached: bool,
71}
72
73impl Clone for Response {
74    fn clone(&self) -> Self {
75        Response {
76            url: self.url.clone(),
77            status: self.status,
78            headers: self.headers.clone(),
79            body: self.body.clone(),
80            request_url: self.request_url.clone(),
81            meta: self.meta.clone(),
82            cached: self.cached,
83        }
84    }
85}
86
87impl Response {
88    /// Reconstructs the original `Request` that led to this response.
89    pub fn request_from_response(&self) -> Request {
90        let mut request = Request::new(self.request_url.clone());
91        request.meta = self.meta.clone();
92        request
93    }
94
95    /// Deserializes the response body as JSON.
96    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
97        serde_json::from_slice(&self.body)
98    }
99
100    /// Parses the response body as HTML.
101    pub fn to_html(&self) -> Result<Html, Utf8Error> {
102        let body_str = from_utf8(&self.body)?;
103        Ok(Html::parse_document(body_str))
104    }
105
106    /// Lazily parses the response body as HTML, returning a closure that can be called when needed.
107    pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
108        let body_bytes = &self.body;
109        Ok(move || {
110            let body_str = from_utf8(body_bytes)?;
111            Ok(Html::parse_document(body_str))
112        })
113    }
114
115    /// Extracts all unique, same-site links from the response body.
116    pub fn links(&self) -> DashSet<Link> {
117        let links = DashSet::new();
118
119        if let Ok(html_fn) = self.lazy_html()
120            && let Ok(html) = html_fn()
121        {
122            let selectors = vec![
123                ("a[href]", "href"),
124                ("link[href]", "href"),
125                ("script[src]", "src"),
126                ("img[src]", "src"),
127                ("audio[src]", "src"),
128                ("video[src]", "src"),
129                ("source[src]", "src"),
130            ];
131
132            for (selector_str, attr_name) in selectors {
133                if let Some(selector) = get_cached_selector(selector_str) {
134                    for element in html.select(&selector) {
135                        if let Some(attr_value) = element.value().attr(attr_name)
136                            && let Ok(url) = self.url.join(attr_value)
137                            && utils::is_same_site(&url, &self.url)
138                        {
139                            let link_type = match element.value().name() {
140                                "a" => LinkType::Page,
141                                "link" => {
142                                    if let Some(rel) = element.value().attr("rel") {
143                                        if rel.eq_ignore_ascii_case("stylesheet") {
144                                            LinkType::Stylesheet
145                                        } else {
146                                            LinkType::Other(rel.to_string())
147                                        }
148                                    } else {
149                                        LinkType::Other("link".to_string())
150                                    }
151                                }
152                                "script" => LinkType::Script,
153                                "img" => LinkType::Image,
154                                "audio" | "video" | "source" => LinkType::Media,
155                                _ => LinkType::Other(element.value().name().to_string()),
156                            };
157                            links.insert(Link { url, link_type });
158                        }
159                    }
160                }
161            }
162
163            let finder = LinkFinder::new();
164            for text_node in html.tree.values().filter_map(|node| node.as_text()) {
165                for link in finder.links(text_node) {
166                    if link.kind() == &LinkKind::Url
167                        && let Ok(url) = self.url.join(link.as_str())
168                        && utils::is_same_site(&url, &self.url)
169                    {
170                        links.insert(Link {
171                            url,
172                            link_type: LinkType::Page,
173                        });
174                    }
175                }
176            }
177        }
178
179        links
180    }
181
182    /// Converts this response to a stream response.
183    #[cfg(feature = "stream")]
184    pub async fn to_stream_response(
185        &self,
186    ) -> Result<crate::stream_response::StreamResponse, std::io::Error> {
187        use futures_util::stream;
188        use std::io;
189
190        let body_chunk = self.body.clone();
191        let body_stream = stream::iter(vec![Ok::<bytes::Bytes, io::Error>(body_chunk)]);
192
193        Ok(crate::stream_response::StreamResponse {
194            url: self.url.clone(),
195            status: self.status,
196            headers: self.headers.clone(),
197            body_stream: Box::pin(body_stream),
198            request_url: self.request_url.clone(),
199            meta: self.meta.clone(),
200            cached: self.cached,
201        })
202    }
203}