// spider_util/response.rs

//! Data structures and utilities for handling HTTP responses in `spider-lib`.
//!
//! This module defines the `Response` struct, which represents an HTTP response
//! received from a web server. It encapsulates crucial information such as
//! the URL, status code, headers, and body of the response, along with any
//! associated metadata.
//!
//! Additionally, this module provides:
//! - Helper methods for `Response` to facilitate common tasks like parsing
//!   the body as HTML or JSON, and reconstructing the original `Request`.
//! - The `Link` struct and the `LinkType` enum for structured representation
//!   and extraction of hyperlinks found within the response content.

use crate::request::Request;
use crate::utils;
use bytes::Bytes;
use dashmap::{DashMap, DashSet};
use linkify::{LinkFinder, LinkKind};
use reqwest::StatusCode;
use reqwest::header::HeaderMap;
use scraper::{Html, Selector};
use serde::de::DeserializeOwned;
use serde_json::{self, Value};
use std::{borrow::Cow, str::Utf8Error, str::from_utf8};
use url::Url;

/// Classifies the kind of resource a discovered link points to.
///
/// Values are comparable and hashable, so they can be stored in sets or used
/// as map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to any other type of resource; the payload is a free-form
    /// label describing it.
    Other(String),
}

44/// Represents a link discovered on a web page.
45#[derive(Debug, Clone, PartialEq, Eq, Hash)]
46pub struct Link {
47    /// The URL of the discovered link.
48    pub url: Url,
49    /// The type of the discovered link.
50    pub link_type: LinkType,
51}
52
53/// Represents an HTTP response received from a server.
54#[derive(Debug, Clone)]
55pub struct Response {
56    /// The final URL of the response after any redirects.
57    pub url: Url,
58    /// The HTTP status code of the response.
59    pub status: StatusCode,
60    /// The headers of the response.
61    pub headers: HeaderMap,
62    /// The body of the response.
63    pub body: Bytes,
64    /// The original URL of the request that led to this response.
65    pub request_url: Url,
66    /// Metadata associated with the response, carried over from the request.
67    pub meta: DashMap<Cow<'static, str>, Value>,
68    /// Indicates if the response was served from a cache.
69    pub cached: bool,
70}
71
72impl Response {
73    /// Reconstructs the original `Request` that led to this response.
74    pub fn request_from_response(&self) -> Request {
75        let mut request = Request::new(self.request_url.clone());
76        request.meta = self.meta.clone();
77        request
78    }
79
80    /// Deserializes the response body as JSON.
81    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
82        serde_json::from_slice(&self.body)
83    }
84
85    /// Parses the response body as HTML.
86    pub fn to_html(&self) -> Result<Html, Utf8Error> {
87        let body_str = from_utf8(&self.body)?;
88        Ok(Html::parse_document(body_str))
89    }
90
91    /// Extracts all unique, same-site links from the response body.
92    pub fn links(&self) -> DashSet<Link> {
93        let links = DashSet::new();
94
95        if let Ok(html) = self.to_html() {
96            let selectors = vec![
97                ("a[href]", "href"),
98                ("link[href]", "href"),
99                ("script[src]", "src"),
100                ("img[src]", "src"),
101                ("audio[src]", "src"),
102                ("video[src]", "src"),
103                ("source[src]", "src"),
104            ];
105
106            for (selector_str, attr_name) in selectors {
107                if let Ok(selector) = Selector::parse(selector_str) {
108                    for element in html.select(&selector) {
109                        if let Some(attr_value) = element.value().attr(attr_name)
110                            && let Ok(url) = self.url.join(attr_value)
111                            && utils::is_same_site(&url, &self.url)
112                        {
113                            let link_type = match element.value().name() {
114                                "a" => LinkType::Page,
115                                "link" => {
116                                    if let Some(rel) = element.value().attr("rel") {
117                                        if rel.eq_ignore_ascii_case("stylesheet") {
118                                            LinkType::Stylesheet
119                                        } else {
120                                            LinkType::Other(rel.to_string())
121                                        }
122                                    } else {
123                                        LinkType::Other("link".to_string())
124                                    }
125                                }
126                                "script" => LinkType::Script,
127                                "img" => LinkType::Image,
128                                "audio" | "video" | "source" => LinkType::Media,
129                                _ => LinkType::Other(element.value().name().to_string()),
130                            };
131                            links.insert(Link { url, link_type });
132                        }
133                    }
134                }
135            }
136
137            let finder = LinkFinder::new();
138            for text_node in html.tree.values().filter_map(|node| node.as_text()) {
139                for link in finder.links(text_node) {
140                    if link.kind() == &LinkKind::Url
141                        && let Ok(url) = self.url.join(link.as_str())
142                        && utils::is_same_site(&url, &self.url)
143                    {
144                        links.insert(Link {
145                            url,
146                            link_type: LinkType::Page,
147                        });
148                    }
149                }
150            }
151        }
152
153        links
154    }
155}
156