Skip to main content

spider_lib/
response.rs

1use crate::{request::Request, utils};
2use bytes::Bytes;
3use dashmap::{DashMap, DashSet};
4use linkify::{LinkFinder, LinkKind};
5use reqwest::StatusCode;
6use reqwest::header::HeaderMap;
7use scraper::{Html, Selector};
8use serde::de::DeserializeOwned;
9use serde_json;
10use std::borrow::Cow;
11use url::Url;
12
/// Classifies what kind of resource a discovered link points at.
///
/// The unit variants cover the common cases; anything else is carried in
/// [`LinkType::Other`] together with a free-form label.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to some other kind of resource; the string labels what it was
    /// (for example a `<link>` element's `rel` value).
    Other(String),
}
29
30/// Represents a link discovered on a web page.
31#[derive(Debug, Clone, PartialEq, Eq, Hash)]
32pub struct Link {
33    /// The URL of the discovered link.
34    pub url: Url,
35    /// The type of the discovered link.
36    pub link_type: LinkType,
37}
38
39/// Represents an HTTP response received from a server.
40#[derive(Debug, Clone)]
41pub struct Response {
42    /// The final URL of the response after any redirects.
43    pub url: Url,
44    /// The HTTP status code of the response.
45    pub status: StatusCode,
46    /// The headers of the response.
47    pub headers: HeaderMap,
48    /// The body of the response.
49    pub body: Bytes,
50    /// The original URL of the request that led to this response.
51    pub request_url: Url,
52    /// Metadata associated with the response, carried over from the request.
53    pub meta: DashMap<Cow<'static, str>, serde_json::Value>,
54}
55
56impl Response {
57    /// Reconstructs the original `Request` that led to this response.
58    pub fn request_from_response(&self) -> Request {
59        let mut request = Request::new(self.request_url.clone());
60        request.meta = self.meta.clone();
61        request
62    }
63
64    /// Deserializes the response body as JSON.
65    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
66        serde_json::from_slice(&self.body)
67    }
68
69    /// Parses the response body as HTML.
70    pub fn to_html(&self) -> Result<Html, std::str::Utf8Error> {
71        let body_str = std::str::from_utf8(&self.body)?;
72        Ok(Html::parse_document(body_str))
73    }
74
75    /// Extracts all unique, same-site links from the response body.
76    pub fn links(&self) -> DashSet<Link> {
77        let links = DashSet::new();
78
79        if let Ok(html) = self.to_html() {
80            let selectors = vec![
81                ("a[href]", "href"),
82                ("link[href]", "href"),
83                ("script[src]", "src"),
84                ("img[src]", "src"),
85                ("audio[src]", "src"),
86                ("video[src]", "src"),
87                ("source[src]", "src"),
88            ];
89
90            for (selector_str, attr_name) in selectors {
91                if let Ok(selector) = Selector::parse(selector_str) {
92                    for element in html.select(&selector) {
93                        if let Some(attr_value) = element.value().attr(attr_name)
94                            && let Ok(url) = self.url.join(attr_value)
95                            && utils::is_same_site(&url, &self.url)
96                        {
97                            let link_type = match element.value().name() {
98                                "a" => LinkType::Page,
99                                "link" => {
100                                    if let Some(rel) = element.value().attr("rel") {
101                                        if rel.eq_ignore_ascii_case("stylesheet") {
102                                            LinkType::Stylesheet
103                                        } else {
104                                            LinkType::Other(rel.to_string())
105                                        }
106                                    } else {
107                                        LinkType::Other("link".to_string())
108                                    }
109                                }
110                                "script" => LinkType::Script,
111                                "img" => LinkType::Image,
112                                "audio" | "video" | "source" => LinkType::Media,
113                                _ => LinkType::Other(element.value().name().to_string()),
114                            };
115                            links.insert(Link { url, link_type });
116                        }
117                    }
118                }
119            }
120
121            let finder = LinkFinder::new();
122            for text_node in html.tree.values().filter_map(|node| node.as_text()) {
123                for link in finder.links(text_node) {
124                    if link.kind() == &LinkKind::Url
125                        && let Ok(url) = self.url.join(link.as_str())
126                        && utils::is_same_site(&url, &self.url)
127                    {
128                        links.insert(Link {
129                            url,
130                            link_type: LinkType::Page,
131                        });
132                    }
133                }
134            }
135        }
136
137        links
138    }
139}