Skip to main content

spider_lib/
response.rs

//! Data structures and utilities for handling HTTP responses in `spider-lib`.
//!
//! This module defines the `Response` struct, which represents an HTTP response
//! received from a web server. It encapsulates crucial information such as
//! the URL, status code, headers, and body of the response, along with any
//! associated metadata.
//!
//! Additionally, this module provides:
//! - Helper methods for `Response` to facilitate common tasks like parsing
//!   the body as HTML or JSON, and reconstructing the original `Request`.
//! - The `Link` struct and `LinkType` enum for structured representation and
//!   extraction of hyperlinks found within the response content.
13use crate::{request::Request, utils};
14use bytes::Bytes;
15use dashmap::{DashMap, DashSet};
16use linkify::{LinkFinder, LinkKind};
17use reqwest::StatusCode;
18use reqwest::header::HeaderMap;
19use scraper::{Html, Selector};
20use serde::de::DeserializeOwned;
21use serde_json;
22use std::borrow::Cow;
23use url::Url;
24
/// Category assigned to a link found in a response body.
///
/// The five unit variants cover the resource kinds this module recognises;
/// anything else is carried in [`LinkType::Other`] together with a free-form
/// label describing it.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum LinkType {
    /// Hyperlink pointing at another web page.
    Page,
    /// Reference to a script file.
    Script,
    /// Reference to a stylesheet.
    Stylesheet,
    /// Reference to an image.
    Image,
    /// Reference to an audio or video resource.
    Media,
    /// Any other kind of resource; the string labels what it was.
    Other(String),
}
41
42/// Represents a link discovered on a web page.
43#[derive(Debug, Clone, PartialEq, Eq, Hash)]
44pub struct Link {
45    /// The URL of the discovered link.
46    pub url: Url,
47    /// The type of the discovered link.
48    pub link_type: LinkType,
49}
50
51/// Represents an HTTP response received from a server.
52#[derive(Debug, Clone)]
53pub struct Response {
54    /// The final URL of the response after any redirects.
55    pub url: Url,
56    /// The HTTP status code of the response.
57    pub status: StatusCode,
58    /// The headers of the response.
59    pub headers: HeaderMap,
60    /// The body of the response.
61    pub body: Bytes,
62    /// The original URL of the request that led to this response.
63    pub request_url: Url,
64    /// Metadata associated with the response, carried over from the request.
65    pub meta: DashMap<Cow<'static, str>, serde_json::Value>,
66}
67
68impl Response {
69    /// Reconstructs the original `Request` that led to this response.
70    pub fn request_from_response(&self) -> Request {
71        let mut request = Request::new(self.request_url.clone());
72        request.meta = self.meta.clone();
73        request
74    }
75
76    /// Deserializes the response body as JSON.
77    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
78        serde_json::from_slice(&self.body)
79    }
80
81    /// Parses the response body as HTML.
82    pub fn to_html(&self) -> Result<Html, std::str::Utf8Error> {
83        let body_str = std::str::from_utf8(&self.body)?;
84        Ok(Html::parse_document(body_str))
85    }
86
87    /// Extracts all unique, same-site links from the response body.
88    pub fn links(&self) -> DashSet<Link> {
89        let links = DashSet::new();
90
91        if let Ok(html) = self.to_html() {
92            let selectors = vec![
93                ("a[href]", "href"),
94                ("link[href]", "href"),
95                ("script[src]", "src"),
96                ("img[src]", "src"),
97                ("audio[src]", "src"),
98                ("video[src]", "src"),
99                ("source[src]", "src"),
100            ];
101
102            for (selector_str, attr_name) in selectors {
103                if let Ok(selector) = Selector::parse(selector_str) {
104                    for element in html.select(&selector) {
105                        if let Some(attr_value) = element.value().attr(attr_name)
106                            && let Ok(url) = self.url.join(attr_value)
107                            && utils::is_same_site(&url, &self.url)
108                        {
109                            let link_type = match element.value().name() {
110                                "a" => LinkType::Page,
111                                "link" => {
112                                    if let Some(rel) = element.value().attr("rel") {
113                                        if rel.eq_ignore_ascii_case("stylesheet") {
114                                            LinkType::Stylesheet
115                                        } else {
116                                            LinkType::Other(rel.to_string())
117                                        }
118                                    } else {
119                                        LinkType::Other("link".to_string())
120                                    }
121                                }
122                                "script" => LinkType::Script,
123                                "img" => LinkType::Image,
124                                "audio" | "video" | "source" => LinkType::Media,
125                                _ => LinkType::Other(element.value().name().to_string()),
126                            };
127                            links.insert(Link { url, link_type });
128                        }
129                    }
130                }
131            }
132
133            let finder = LinkFinder::new();
134            for text_node in html.tree.values().filter_map(|node| node.as_text()) {
135                for link in finder.links(text_node) {
136                    if link.kind() == &LinkKind::Url
137                        && let Ok(url) = self.url.join(link.as_str())
138                        && utils::is_same_site(&url, &self.url)
139                    {
140                        links.insert(Link {
141                            url,
142                            link_type: LinkType::Page,
143                        });
144                    }
145                }
146            }
147        }
148
149        links
150    }
151}