1use crate::request::Request;
15use crate::selector_cache::get_cached_selector;
16use crate::utils;
17use bytes::Bytes;
18use dashmap::{DashMap, DashSet};
19use linkify::{LinkFinder, LinkKind};
20use reqwest::StatusCode;
21use reqwest::header::HeaderMap;
22use scraper::Html;
23use serde::de::DeserializeOwned;
24use serde_json::{self, Value};
25use std::{borrow::Cow, str::Utf8Error, str::from_utf8};
26use url::Url;
27
/// Category assigned to a [`Link`], derived from where the link was found.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum LinkType {
    /// A navigable page: an `<a href>` target or a URL found in text content.
    Page,
    /// The `src` of a `<script>` element.
    Script,
    /// The `href` of a `<link>` element whose `rel` is `stylesheet`.
    Stylesheet,
    /// The `src` of an `<img>` element.
    Image,
    /// The `src` of an `<audio>`, `<video>` or `<source>` element.
    Media,
    /// Anything else; the payload is a free-form label (for `<link>` elements
    /// the `rel` value, otherwise a tag name).
    Other(String),
}
44
45#[derive(Debug, Clone, PartialEq, Eq, Hash)]
47pub struct Link {
48 pub url: Url,
50 pub link_type: LinkType,
52}
53
54#[derive(Debug)]
56pub struct Response {
57 pub url: Url,
59 pub status: StatusCode,
61 pub headers: HeaderMap,
63 pub body: Bytes,
65 pub request_url: Url,
67 pub meta: DashMap<Cow<'static, str>, Value>,
69 pub cached: bool,
71}
72
73impl Clone for Response {
74 fn clone(&self) -> Self {
75 Response {
76 url: self.url.clone(),
77 status: self.status,
78 headers: self.headers.clone(),
79 body: self.body.clone(),
80 request_url: self.request_url.clone(),
81 meta: self.meta.clone(),
82 cached: self.cached,
83 }
84 }
85}
86
87impl Response {
88 pub fn request_from_response(&self) -> Request {
90 let mut request = Request::new(self.request_url.clone());
91 request.meta = self.meta.clone();
92 request
93 }
94
95 pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
97 serde_json::from_slice(&self.body)
98 }
99
100 pub fn to_html(&self) -> Result<Html, Utf8Error> {
102 let body_str = from_utf8(&self.body)?;
103 Ok(Html::parse_document(body_str))
104 }
105
106 pub fn lazy_html(&self) -> Result<impl Fn() -> Result<Html, Utf8Error> + '_, Utf8Error> {
108 let body_bytes = &self.body;
109 Ok(move || {
110 let body_str = from_utf8(body_bytes)?;
111 Ok(Html::parse_document(body_str))
112 })
113 }
114
115 pub fn links(&self) -> DashSet<Link> {
117 let links = DashSet::new();
118
119 if let Ok(html_fn) = self.lazy_html()
120 && let Ok(html) = html_fn()
121 {
122 let selectors = vec![
123 ("a[href]", "href"),
124 ("link[href]", "href"),
125 ("script[src]", "src"),
126 ("img[src]", "src"),
127 ("audio[src]", "src"),
128 ("video[src]", "src"),
129 ("source[src]", "src"),
130 ];
131
132 for (selector_str, attr_name) in selectors {
133 if let Some(selector) = get_cached_selector(selector_str) {
134 for element in html.select(&selector) {
135 if let Some(attr_value) = element.value().attr(attr_name)
136 && let Ok(url) = self.url.join(attr_value)
137 && utils::is_same_site(&url, &self.url)
138 {
139 let link_type = match element.value().name() {
140 "a" => LinkType::Page,
141 "link" => {
142 if let Some(rel) = element.value().attr("rel") {
143 if rel.eq_ignore_ascii_case("stylesheet") {
144 LinkType::Stylesheet
145 } else {
146 LinkType::Other(rel.to_string())
147 }
148 } else {
149 LinkType::Other("link".to_string())
150 }
151 }
152 "script" => LinkType::Script,
153 "img" => LinkType::Image,
154 "audio" | "video" | "source" => LinkType::Media,
155 _ => LinkType::Other(element.value().name().to_string()),
156 };
157 links.insert(Link { url, link_type });
158 }
159 }
160 }
161 }
162
163 let finder = LinkFinder::new();
164 for text_node in html.tree.values().filter_map(|node| node.as_text()) {
165 for link in finder.links(text_node) {
166 if link.kind() == &LinkKind::Url
167 && let Ok(url) = self.url.join(link.as_str())
168 && utils::is_same_site(&url, &self.url)
169 {
170 links.insert(Link {
171 url,
172 link_type: LinkType::Page,
173 });
174 }
175 }
176 }
177 }
178
179 links
180 }
181
182 #[cfg(feature = "stream")]
184 pub async fn to_stream_response(
185 &self,
186 ) -> Result<crate::stream_response::StreamResponse, std::io::Error> {
187 use futures_util::stream;
188 use std::io;
189
190 let body_chunk = self.body.clone();
191 let body_stream = stream::iter(vec![Ok::<bytes::Bytes, io::Error>(body_chunk)]);
192
193 Ok(crate::stream_response::StreamResponse {
194 url: self.url.clone(),
195 status: self.status,
196 headers: self.headers.clone(),
197 body_stream: Box::pin(body_stream),
198 request_url: self.request_url.clone(),
199 meta: self.meta.clone(),
200 cached: self.cached,
201 })
202 }
203}