Skip to main content

scrapling_browser/
response_factory.rs

1//! Factory for building [`scrapling_fetch::Response`] objects from Playwright pages.
2//!
3//! After a browser session navigates to a URL and the page stabilises, this module
4//! extracts everything needed to construct a unified [`Response`]: the rendered HTML
5//! content, HTTP status code, response headers, cookies from the browser context,
6//! and the character encoding parsed from the `Content-Type` header.
7//!
8//! The main entry point is [`from_browser_page`], which is called internally by
9//! [`DynamicSession`](crate::fetcher::DynamicSession) and
10//! [`StealthySession`](crate::fetcher::StealthySession) at the end of each fetch
11//! cycle. You generally do not need to call it directly.
12//!
13//! The module also defines [`XhrCapture`], a struct for holding intercepted XHR/fetch
14//! responses that were recorded during page navigation (when `capture_xhr` is set).
15
16use std::collections::HashMap;
17
18use bytes::Bytes;
19use serde_json::Value;
20use tracing::debug;
21
22use scrapling_fetch::Response;
23use scrapling_fetch::status_text;
24
25/// Build a [`Response`] from a Playwright page and its navigation responses.
26///
27/// This function extracts the rendered HTML via `page.content()` (retrying up to
28/// `max_retries` times), reads cookies from the browser context, parses the character
29/// encoding from the `Content-Type` header, and assembles everything into a
30/// [`scrapling_fetch::Response`]. The `meta` map and `_captured_xhr` list are passed
31/// through to the response for downstream consumers.
32pub async fn from_browser_page(
33    page: &playwright_rs::Page,
34    first_response: Option<&playwright_rs::Response>,
35    _final_response: Option<&playwright_rs::Response>,
36    meta: HashMap<String, Value>,
37    _captured_xhr: Vec<XhrCapture>,
38) -> crate::error::Result<Response> {
39    let active_response = _final_response.or(first_response);
40
41    let status = active_response.map(|r| r.status()).unwrap_or(200);
42    let reason = active_response
43        .map(|r| {
44            let st = r.status_text();
45            if st.is_empty() {
46                status_text(status).to_owned()
47            } else {
48                st.to_owned()
49            }
50        })
51        .unwrap_or_else(|| status_text(status).to_owned());
52
53    let headers = match first_response {
54        Some(resp) => resp.all_headers().await.unwrap_or_default(),
55        None => HashMap::new(),
56    };
57
58    let encoding = extract_encoding(&headers);
59
60    let content = get_page_content(page, 20).await?;
61    let page_url = page.url();
62    let body = Bytes::from(content.into_bytes());
63
64    let cookies = match page.context() {
65        Ok(ctx) => ctx
66            .cookies(None)
67            .await
68            .map(|c| c.into_iter().map(|ck| (ck.name, ck.value)).collect())
69            .unwrap_or_default(),
70        Err(_) => HashMap::new(),
71    };
72
73    let response = Response::new(
74        &page_url,
75        body,
76        status,
77        Some(reason),
78        cookies,
79        headers,
80        HashMap::new(),
81        encoding,
82        "GET".to_owned(),
83        Vec::new(),
84        meta,
85    );
86
87    Ok(response)
88}
89
90async fn get_page_content(
91    page: &playwright_rs::Page,
92    max_retries: u32,
93) -> crate::error::Result<String> {
94    for attempt in 0..max_retries {
95        match page.content().await {
96            Ok(content) => return Ok(content),
97            Err(e) => {
98                if attempt < max_retries - 1 {
99                    debug!(attempt = attempt + 1, "page.content() failed, retrying");
100                    tokio::time::sleep(std::time::Duration::from_millis(500)).await;
101                } else {
102                    return Err(crate::error::BrowserError::Navigation(format!(
103                        "page.content() failed after {max_retries} attempts: {e}"
104                    )));
105                }
106            }
107        }
108    }
109    unreachable!()
110}
111
/// Extract the character encoding from the `Content-Type` header.
///
/// The header is looked up as `content-type` first and, failing that, by a
/// case-insensitive scan of all header names. Its `;`-separated parameters are
/// searched for `charset`, matched case-insensitively; surrounding whitespace
/// and quotes around the value are removed. The value's own letter case is
/// preserved.
///
/// Returns `"utf-8"` when the header is absent, carries no `charset`
/// parameter, or the parameter value is empty.
fn extract_encoding(headers: &HashMap<String, String>) -> String {
    const DEFAULT_ENCODING: &str = "utf-8";
    const HEADER_NAME: &str = "content-type";

    headers
        .get(HEADER_NAME)
        // Header names are case-insensitive; tolerate maps that were not
        // normalised to lower case.
        .or_else(|| {
            headers
                .iter()
                .find(|(name, _)| name.eq_ignore_ascii_case(HEADER_NAME))
                .map(|(_, value)| value)
        })
        .and_then(|content_type| {
            content_type.split(';').find_map(|param| {
                let (name, value) = param.split_once('=')?;
                if !name.trim().eq_ignore_ascii_case("charset") {
                    return None;
                }
                // Parameter values may be quoted, e.g. `charset="utf-8"`.
                let value = value
                    .trim()
                    .trim_matches(|c: char| c == '"' || c == '\'')
                    .trim();
                // An empty value carries no information; keep searching.
                (!value.is_empty()).then(|| value.to_owned())
            })
        })
        .unwrap_or_else(|| DEFAULT_ENCODING.to_owned())
}
123
124/// A captured XHR/fetch response recorded during page navigation.
125///
126/// When [`BrowserConfig::capture_xhr`] is set with a URL pattern, matching
127/// XHR/fetch responses are intercepted and stored in this struct. This lets you
128/// extract API data that the page fetches in the background without having to
129/// parse it out of the rendered HTML.
130#[derive(Debug)]
131pub struct XhrCapture {
132    /// URL of the captured XHR/fetch request.
133    /// This is the full URL including query parameters.
134    pub url: String,
135
136    /// HTTP status code of the captured response (e.g. `200`, `404`).
137    pub status: u16,
138
139    /// Response headers of the captured request.
140    /// Useful for checking `Content-Type` or pagination headers.
141    pub headers: HashMap<String, String>,
142
143    /// Raw response body bytes.
144    /// For JSON APIs you can deserialize this with `serde_json::from_slice`.
145    pub body: Bytes,
146}