scrapling_fetch/response.rs
1//! HTTP response type with lazy HTML parsing.
2//!
3//! The [`Response`] struct is what you get back from every request made through
4//! [`Fetcher`](crate::Fetcher) or [`FetcherSession`](crate::FetcherSession). It holds
5//! the raw response bytes, headers, cookies, status code, and metadata. The HTML body
6//! is not parsed until you first call [`selector()`](Response::selector), [`css()`](Response::css),
7//! or any other method that needs the DOM -- this keeps simple status-code checks fast.
8//!
9//! The internal [`build_response_async`] function converts a raw `wreq::Response` into
10//! this type and is used by the client module.
11
12use std::cell::OnceCell;
13use std::collections::HashMap;
14
15use bytes::Bytes;
16use serde_json::Value;
17
18use scrapling::selector::Selector;
19
20use crate::status::status_text;
21
/// HTTP response with lazy-parsed HTML selector.
///
/// The response body is stored as raw bytes. The HTML [`Selector`] is parsed
/// lazily on first access via [`selector()`](Response::selector) and cached, so
/// creating and inspecting a `Response` (checking status, reading headers) never
/// touches the HTML parser.
///
/// `Response` is marked `Send` so it can be moved across threads. The cache is a
/// single-threaded [`OnceCell`], which is not `Sync`, so a `&Response` cannot be
/// shared between threads: move the whole response to the thread that needs it.
pub struct Response {
    /// The HTTP status code.
    pub status: u16,
    /// The reason phrase for the status code.
    pub reason: String,
    /// Cookies received in the response, keyed by cookie name.
    pub cookies: HashMap<String, String>,
    /// Response headers, keyed by header name.
    pub headers: HashMap<String, String>,
    /// Headers that were sent with the request.
    pub request_headers: HashMap<String, String>,
    /// Redirect history leading to this response.
    pub history: Vec<Response>,
    /// The character encoding reported for the response body.
    ///
    /// This is metadata only: [`selector()`](Response::selector) always decodes the
    /// body as lossy UTF-8 and does not consult this field.
    pub encoding: String,
    /// The HTTP method used for the request.
    pub method: String,
    /// Arbitrary metadata associated with this response.
    pub meta: HashMap<String, Value>,
    /// The raw response body bytes.
    pub body: Bytes,
    /// URL of this response; read-only, exposed through [`url()`](Response::url).
    url: String,
    /// Cache for the parsed selector; empty until
    /// [`selector()`](Response::selector) is first called.
    parsed: OnceCell<Selector>,
}
56
// SAFETY: NOTE(review) -- this impl asserts that every field of `Response`, including
// the `Selector` cached inside the `OnceCell`, may be moved to another thread.
// `Selector` is defined in another crate, so that cannot be verified from this file.
// If `Selector` is itself `Send`, this impl is presumably redundant (the auto trait
// would apply); if it is not (e.g. it holds an `Rc`), this impl is unsound. TODO confirm.
unsafe impl Send for Response {}
58
59impl Response {
60 /// Creates a new response from its constituent parts. This is primarily used
61 /// internally by [`build_response_async`]. Most callers will receive `Response`
62 /// objects from [`Fetcher::get()`](crate::Fetcher::get) and similar methods.
63 #[allow(clippy::too_many_arguments)]
64 pub fn new(
65 url: &str,
66 body: Bytes,
67 status: u16,
68 reason: Option<String>,
69 cookies: HashMap<String, String>,
70 headers: HashMap<String, String>,
71 request_headers: HashMap<String, String>,
72 encoding: String,
73 method: String,
74 history: Vec<Response>,
75 meta: HashMap<String, Value>,
76 ) -> Self {
77 Self {
78 status,
79 reason: reason.unwrap_or_else(|| status_text(status).to_owned()),
80 cookies,
81 headers,
82 request_headers,
83 history,
84 encoding,
85 method,
86 meta,
87 body,
88 url: url.to_owned(),
89 parsed: OnceCell::new(),
90 }
91 }
92
93 /// Returns the final URL of the response after any redirects have been followed.
94 /// If no redirects occurred, this is the same as the original request URL.
95 pub fn url(&self) -> &str {
96 &self.url
97 }
98
99 /// Returns the parsed HTML selector, parsing the body on first call.
100 ///
101 /// The parse result is cached, so subsequent calls return immediately. The body
102 /// is decoded as UTF-8 (with lossy replacement for invalid sequences) before parsing.
103 pub fn selector(&self) -> &Selector {
104 self.parsed.get_or_init(|| {
105 let html = String::from_utf8_lossy(&self.body);
106 Selector::from_html_with_url(&html, &self.url)
107 })
108 }
109
110 /// Runs a CSS selector query against the parsed HTML and returns matching elements.
111 /// This is a convenience wrapper around `self.selector().css(query)`. Triggers a
112 /// lazy parse if the body has not been parsed yet.
113 pub fn css(&self, query: &str) -> scrapling::selector::Selectors {
114 self.selector().css(query)
115 }
116
117 /// Finds elements whose text content matches the given string. Use `partial` for
118 /// substring matching, `case_sensitive` to control case, and `clean_match` to
119 /// strip whitespace before comparing.
120 pub fn find_by_text(
121 &self,
122 text: &str,
123 partial: bool,
124 case_sensitive: bool,
125 clean_match: bool,
126 ) -> scrapling::selector::Selectors {
127 self.selector()
128 .find_by_text(text, partial, case_sensitive, clean_match)
129 }
130
131 /// Returns a text handler for extracting and manipulating text content from the
132 /// parsed HTML. Useful for getting the full visible text of the page.
133 pub fn text(&self) -> scrapling::TextHandler {
134 self.selector().text()
135 }
136
137 /// Resolves a relative URL against this response's base URL. For example, if the
138 /// response URL is `https://example.com/page` and you pass `"/other"`, this returns
139 /// `"https://example.com/other"`.
140 pub fn urljoin(&self, relative: &str) -> String {
141 self.selector().urljoin(relative)
142 }
143
144 /// Returns `true` if the status code is in the 2xx range (200-299), indicating
145 /// the request was successfully received, understood, and accepted.
146 pub fn is_success(&self) -> bool {
147 (200..300).contains(&self.status)
148 }
149
150 /// Returns `true` if the status code is in the 3xx range (300-399). You will only
151 /// see this when redirect following is disabled, since otherwise the client follows
152 /// redirects automatically.
153 pub fn is_redirect(&self) -> bool {
154 (300..400).contains(&self.status)
155 }
156
157 /// Returns `true` if the status code is in the 4xx range (400-499), indicating
158 /// a client error such as a bad request, unauthorized access, or not found.
159 pub fn is_client_error(&self) -> bool {
160 (400..500).contains(&self.status)
161 }
162
163 /// Returns `true` if the status code is in the 5xx range (500-599), indicating
164 /// a server-side error. These are often transient and may succeed on retry.
165 pub fn is_server_error(&self) -> bool {
166 (500..600).contains(&self.status)
167 }
168
169 /// Resolves a relative URL for following a link. This is a semantic alias for
170 /// [`urljoin()`](Self::urljoin) that makes crawler code read more naturally.
171 pub fn follow_url(&self, relative: &str) -> String {
172 self.urljoin(relative)
173 }
174
175 /// Converts the HTML body to Markdown using the scrapling shell converter. Useful
176 /// for feeding page content to LLMs or for human-readable text extraction.
177 pub fn to_markdown(&self) -> String {
178 scrapling::shell::Convertor::to_markdown(&String::from_utf8_lossy(&self.body))
179 }
180
181 /// Converts the HTML body to plain text, stripping all HTML tags and formatting.
182 /// Useful when you only care about the visible text content of a page.
183 pub fn to_text(&self) -> String {
184 scrapling::shell::Convertor::to_text(&String::from_utf8_lossy(&self.body))
185 }
186}
187
188impl std::fmt::Debug for Response {
189 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190 f.debug_struct("Response")
191 .field("status", &self.status)
192 .field("url", &self.url)
193 .field("method", &self.method)
194 .finish()
195 }
196}
197
198impl std::fmt::Display for Response {
199 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
200 write!(f, "<{} {}>", self.status, self.url)
201 }
202}
203
/// Extracts the charset from the `content-type` entry of `headers`.
///
/// The header value is split on `;` and the first parameter named `charset` with a
/// non-empty value wins. The parameter name is matched case-insensitively
/// (`Charset=`, `CHARSET=`), whitespace around the name and value is ignored, and
/// surrounding single or double quotes are removed from the value. The value's own
/// letter case is preserved.
///
/// Returns `"utf-8"` when the header is absent (only the lowercase key
/// `content-type` is looked up), has no `charset` parameter, or the parameter is
/// empty.
fn extract_encoding(headers: &HashMap<String, String>) -> String {
    headers
        .get("content-type")
        .and_then(|content_type| {
            content_type.split(';').find_map(|param| {
                // Segments without `=` (such as the media type itself) are skipped.
                let (name, value) = param.split_once('=')?;
                if !name.trim().eq_ignore_ascii_case("charset") {
                    return None;
                }
                // Servers may send the value as a quoted string: charset="utf-8".
                let charset = value
                    .trim()
                    .trim_matches(|c| c == '"' || c == '\'')
                    .trim();
                // An empty value carries no information; keep looking / use the default.
                (!charset.is_empty()).then(|| charset.to_owned())
            })
        })
        .unwrap_or_else(|| "utf-8".to_owned())
}
216
217/// Builds a [`Response`] from a raw `wreq::Response` by extracting headers, cookies,
218/// status, encoding, and downloading the body bytes. This is called internally by
219/// [`Fetcher`](crate::Fetcher) and [`FetcherSession`](crate::FetcherSession) after
220/// each successful HTTP exchange.
221pub(crate) async fn build_response_async(
222 resp: wreq::Response,
223 request_headers: HashMap<String, String>,
224 method: &str,
225 meta: HashMap<String, Value>,
226) -> crate::error::Result<Response> {
227 let status = resp.status().as_u16();
228 let reason = resp.status().canonical_reason().map(|s| s.to_owned());
229
230 let headers: HashMap<String, String> = resp
231 .headers()
232 .iter()
233 .filter_map(|(name, value)| {
234 value
235 .to_str()
236 .ok()
237 .map(|v| (name.as_str().to_owned(), v.to_owned()))
238 })
239 .collect();
240
241 let cookies: HashMap<String, String> = resp
242 .cookies()
243 .map(|c| (c.name().to_owned(), c.value().to_owned()))
244 .collect();
245
246 let encoding = extract_encoding(&headers);
247 let url = resp.uri().to_string();
248 let body_bytes = resp.bytes().await?;
249
250 Ok(Response::new(
251 &url,
252 body_bytes,
253 status,
254 reason,
255 cookies,
256 headers,
257 request_headers,
258 encoding,
259 method.to_owned(),
260 Vec::new(),
261 meta,
262 ))
263}