Skip to main content

scrapling_fetch/
response.rs

1//! HTTP response type with lazy HTML parsing.
2//!
3//! The [`Response`] struct is what you get back from every request made through
4//! [`Fetcher`](crate::Fetcher) or [`FetcherSession`](crate::FetcherSession). It holds
5//! the raw response bytes, headers, cookies, status code, and metadata. The HTML body
6//! is not parsed until you first call [`selector()`](Response::selector), [`css()`](Response::css),
7//! or any other method that needs the DOM -- this keeps simple status-code checks fast.
8//!
9//! The internal [`build_response_async`] function converts a raw `wreq::Response` into
10//! this type and is used by the client module.
11
12use std::cell::OnceCell;
13use std::collections::HashMap;
14
15use bytes::Bytes;
16use serde_json::Value;
17
18use scrapling::selector::Selector;
19
20use crate::status::status_text;
21
/// HTTP response with lazy-parsed HTML selector.
///
/// The response body is stored as raw bytes. The HTML [`Selector`] is parsed
/// lazily on first access via [`selector()`](Response::selector). This means
/// creating and inspecting a `Response` (checking status, reading headers) is
/// cheap -- the potentially expensive HTML parse only happens when you need the DOM.
///
/// `Response` is marked `Send` (through a manual `unsafe impl` next to this
/// definition) so it can be moved across threads, but the lazy [`OnceCell`]
/// storing the parsed selector uses interior mutability, so it is not `Sync`.
/// Parse on one thread, then share the results.
pub struct Response {
    /// The HTTP status code.
    pub status: u16,
    /// The reason phrase for the status code. [`Response::new`] fills this from
    /// the status code when the caller does not supply one.
    pub reason: String,
    /// Cookies received in the response, keyed by cookie name.
    pub cookies: HashMap<String, String>,
    /// Response headers, keyed by header name.
    pub headers: HashMap<String, String>,
    /// Headers that were sent with the request.
    pub request_headers: HashMap<String, String>,
    /// Redirect history leading to this response.
    pub history: Vec<Response>,
    /// The character encoding label of the response body. Note that
    /// [`selector()`](Response::selector) does not consult this field; it always
    /// decodes the body as lossy UTF-8.
    pub encoding: String,
    /// The HTTP method used for the request.
    pub method: String,
    /// Arbitrary metadata associated with this response.
    pub meta: HashMap<String, Value>,
    /// The raw response body bytes.
    pub body: Bytes,
    /// URL of this response; private, exposed read-only through [`Response::url`].
    url: String,
    /// Cache for the parsed [`Selector`]; stays empty until
    /// [`selector()`](Response::selector) is first called.
    parsed: OnceCell<Selector>,
}
56
// SAFETY: NOTE(review) -- this asserts that every field of `Response`, including
// the parsed `Selector` cached in the `OnceCell`, may be moved to another thread.
// The compiler would derive `Send` automatically if all fields were `Send`, so
// the need for a manual impl suggests at least one field (presumably `Selector`)
// is not. `Selector` is defined in another crate; confirm it holds no
// thread-affine state (e.g. `Rc`) shared with values handed out by `css()` /
// `find_by_text()` before relying on this impl.
unsafe impl Send for Response {}
58
59impl Response {
60    /// Creates a new response from its constituent parts. This is primarily used
61    /// internally by [`build_response_async`]. Most callers will receive `Response`
62    /// objects from [`Fetcher::get()`](crate::Fetcher::get) and similar methods.
63    #[allow(clippy::too_many_arguments)]
64    pub fn new(
65        url: &str,
66        body: Bytes,
67        status: u16,
68        reason: Option<String>,
69        cookies: HashMap<String, String>,
70        headers: HashMap<String, String>,
71        request_headers: HashMap<String, String>,
72        encoding: String,
73        method: String,
74        history: Vec<Response>,
75        meta: HashMap<String, Value>,
76    ) -> Self {
77        Self {
78            status,
79            reason: reason.unwrap_or_else(|| status_text(status).to_owned()),
80            cookies,
81            headers,
82            request_headers,
83            history,
84            encoding,
85            method,
86            meta,
87            body,
88            url: url.to_owned(),
89            parsed: OnceCell::new(),
90        }
91    }
92
93    /// Returns the final URL of the response after any redirects have been followed.
94    /// If no redirects occurred, this is the same as the original request URL.
95    pub fn url(&self) -> &str {
96        &self.url
97    }
98
99    /// Returns the parsed HTML selector, parsing the body on first call.
100    ///
101    /// The parse result is cached, so subsequent calls return immediately. The body
102    /// is decoded as UTF-8 (with lossy replacement for invalid sequences) before parsing.
103    pub fn selector(&self) -> &Selector {
104        self.parsed.get_or_init(|| {
105            let html = String::from_utf8_lossy(&self.body);
106            Selector::from_html_with_url(&html, &self.url)
107        })
108    }
109
110    /// Runs a CSS selector query against the parsed HTML and returns matching elements.
111    /// This is a convenience wrapper around `self.selector().css(query)`. Triggers a
112    /// lazy parse if the body has not been parsed yet.
113    pub fn css(&self, query: &str) -> scrapling::selector::Selectors {
114        self.selector().css(query)
115    }
116
117    /// Finds elements whose text content matches the given string. Use `partial` for
118    /// substring matching, `case_sensitive` to control case, and `clean_match` to
119    /// strip whitespace before comparing.
120    pub fn find_by_text(
121        &self,
122        text: &str,
123        partial: bool,
124        case_sensitive: bool,
125        clean_match: bool,
126    ) -> scrapling::selector::Selectors {
127        self.selector()
128            .find_by_text(text, partial, case_sensitive, clean_match)
129    }
130
131    /// Returns a text handler for extracting and manipulating text content from the
132    /// parsed HTML. Useful for getting the full visible text of the page.
133    pub fn text(&self) -> scrapling::TextHandler {
134        self.selector().text()
135    }
136
137    /// Resolves a relative URL against this response's base URL. For example, if the
138    /// response URL is `https://example.com/page` and you pass `"/other"`, this returns
139    /// `"https://example.com/other"`.
140    pub fn urljoin(&self, relative: &str) -> String {
141        self.selector().urljoin(relative)
142    }
143
144    /// Returns `true` if the status code is in the 2xx range (200-299), indicating
145    /// the request was successfully received, understood, and accepted.
146    pub fn is_success(&self) -> bool {
147        (200..300).contains(&self.status)
148    }
149
150    /// Returns `true` if the status code is in the 3xx range (300-399). You will only
151    /// see this when redirect following is disabled, since otherwise the client follows
152    /// redirects automatically.
153    pub fn is_redirect(&self) -> bool {
154        (300..400).contains(&self.status)
155    }
156
157    /// Returns `true` if the status code is in the 4xx range (400-499), indicating
158    /// a client error such as a bad request, unauthorized access, or not found.
159    pub fn is_client_error(&self) -> bool {
160        (400..500).contains(&self.status)
161    }
162
163    /// Returns `true` if the status code is in the 5xx range (500-599), indicating
164    /// a server-side error. These are often transient and may succeed on retry.
165    pub fn is_server_error(&self) -> bool {
166        (500..600).contains(&self.status)
167    }
168
169    /// Resolves a relative URL for following a link. This is a semantic alias for
170    /// [`urljoin()`](Self::urljoin) that makes crawler code read more naturally.
171    pub fn follow_url(&self, relative: &str) -> String {
172        self.urljoin(relative)
173    }
174
175    /// Converts the HTML body to Markdown using the scrapling shell converter. Useful
176    /// for feeding page content to LLMs or for human-readable text extraction.
177    pub fn to_markdown(&self) -> String {
178        scrapling::shell::Convertor::to_markdown(&String::from_utf8_lossy(&self.body))
179    }
180
181    /// Converts the HTML body to plain text, stripping all HTML tags and formatting.
182    /// Useful when you only care about the visible text content of a page.
183    pub fn to_text(&self) -> String {
184        scrapling::shell::Convertor::to_text(&String::from_utf8_lossy(&self.body))
185    }
186}
187
188impl std::fmt::Debug for Response {
189    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
190        f.debug_struct("Response")
191            .field("status", &self.status)
192            .field("url", &self.url)
193            .field("method", &self.method)
194            .finish()
195    }
196}
197
198impl std::fmt::Display for Response {
199    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
200        write!(f, "<{} {}>", self.status, self.url)
201    }
202}
203
/// Extracts the charset label from the `content-type` entry of `headers`.
///
/// The header value is split on `;` and the first parameter named `charset`
/// with a non-empty value wins. The parameter name is matched
/// case-insensitively and surrounding quotes are removed, so
/// `text/html; Charset="UTF-8"` yields `UTF-8`. The value's own letter case is
/// preserved.
///
/// Returns `"utf-8"` when the header is absent, carries no `charset`
/// parameter, or the parameter is empty. The header is looked up under the
/// exact lowercase key `content-type`.
fn extract_encoding(headers: &HashMap<String, String>) -> String {
    const DEFAULT_ENCODING: &str = "utf-8";

    headers
        .get("content-type")
        .and_then(|content_type| {
            content_type.split(';').find_map(|param| {
                let (name, value) = param.split_once('=')?;
                if !name.trim().eq_ignore_ascii_case("charset") {
                    return None;
                }
                // Values may be sent as a quoted string; drop the quotes and any
                // whitespace that was inside them.
                let charset = value
                    .trim()
                    .trim_matches(|c: char| c == '"' || c == '\'')
                    .trim();
                // An empty `charset=` carries no information; keep searching and
                // ultimately fall back to the default.
                (!charset.is_empty()).then(|| charset.to_owned())
            })
        })
        .unwrap_or_else(|| DEFAULT_ENCODING.to_owned())
}
216
217/// Builds a [`Response`] from a raw `wreq::Response` by extracting headers, cookies,
218/// status, encoding, and downloading the body bytes. This is called internally by
219/// [`Fetcher`](crate::Fetcher) and [`FetcherSession`](crate::FetcherSession) after
220/// each successful HTTP exchange.
221pub(crate) async fn build_response_async(
222    resp: wreq::Response,
223    request_headers: HashMap<String, String>,
224    method: &str,
225    meta: HashMap<String, Value>,
226) -> crate::error::Result<Response> {
227    let status = resp.status().as_u16();
228    let reason = resp.status().canonical_reason().map(|s| s.to_owned());
229
230    let headers: HashMap<String, String> = resp
231        .headers()
232        .iter()
233        .filter_map(|(name, value)| {
234            value
235                .to_str()
236                .ok()
237                .map(|v| (name.as_str().to_owned(), v.to_owned()))
238        })
239        .collect();
240
241    let cookies: HashMap<String, String> = resp
242        .cookies()
243        .map(|c| (c.name().to_owned(), c.value().to_owned()))
244        .collect();
245
246    let encoding = extract_encoding(&headers);
247    let url = resp.uri().to_string();
248    let body_bytes = resp.bytes().await?;
249
250    Ok(Response::new(
251        &url,
252        body_bytes,
253        status,
254        reason,
255        cookies,
256        headers,
257        request_headers,
258        encoding,
259        method.to_owned(),
260        Vec::new(),
261        meta,
262    ))
263}