scrapling_browser/response_factory.rs
1//! Factory for building [`scrapling_fetch::Response`] objects from Playwright pages.
2//!
3//! After a browser session navigates to a URL and the page stabilises, this module
4//! extracts everything needed to construct a unified [`Response`]: the rendered HTML
5//! content, HTTP status code, response headers, cookies from the browser context,
6//! and the character encoding parsed from the `Content-Type` header.
7//!
8//! The main entry point is [`from_browser_page`], which is called internally by
9//! [`DynamicSession`](crate::fetcher::DynamicSession) and
10//! [`StealthySession`](crate::fetcher::StealthySession) at the end of each fetch
11//! cycle. You generally do not need to call it directly.
12//!
13//! The module also defines [`XhrCapture`], a struct for holding intercepted XHR/fetch
14//! responses that were recorded during page navigation (when `capture_xhr` is set).
15
16use std::collections::HashMap;
17
18use bytes::Bytes;
19use serde_json::Value;
20use tracing::debug;
21
22use scrapling_fetch::Response;
23use scrapling_fetch::status_text;
24
25/// Build a [`Response`] from a Playwright page and its navigation responses.
26///
27/// This function extracts the rendered HTML via `page.content()` (retrying up to
28/// `max_retries` times), reads cookies from the browser context, parses the character
29/// encoding from the `Content-Type` header, and assembles everything into a
30/// [`scrapling_fetch::Response`]. The `meta` map and `_captured_xhr` list are passed
31/// through to the response for downstream consumers.
32pub async fn from_browser_page(
33 page: &playwright_rs::Page,
34 first_response: Option<&playwright_rs::Response>,
35 _final_response: Option<&playwright_rs::Response>,
36 meta: HashMap<String, Value>,
37 _captured_xhr: Vec<XhrCapture>,
38) -> crate::error::Result<Response> {
39 let active_response = _final_response.or(first_response);
40
41 let status = active_response.map(|r| r.status()).unwrap_or(200);
42 let reason = active_response
43 .map(|r| {
44 let st = r.status_text();
45 if st.is_empty() {
46 status_text(status).to_owned()
47 } else {
48 st.to_owned()
49 }
50 })
51 .unwrap_or_else(|| status_text(status).to_owned());
52
53 let headers = match first_response {
54 Some(resp) => resp.all_headers().await.unwrap_or_default(),
55 None => HashMap::new(),
56 };
57
58 let encoding = extract_encoding(&headers);
59
60 let content = get_page_content(page, 20).await?;
61 let page_url = page.url();
62 let body = Bytes::from(content.into_bytes());
63
64 let cookies = match page.context() {
65 Ok(ctx) => ctx
66 .cookies(None)
67 .await
68 .map(|c| c.into_iter().map(|ck| (ck.name, ck.value)).collect())
69 .unwrap_or_default(),
70 Err(_) => HashMap::new(),
71 };
72
73 let response = Response::new(
74 &page_url,
75 body,
76 status,
77 Some(reason),
78 cookies,
79 headers,
80 HashMap::new(),
81 encoding,
82 "GET".to_owned(),
83 Vec::new(),
84 meta,
85 );
86
87 Ok(response)
88}
89
90async fn get_page_content(
91 page: &playwright_rs::Page,
92 max_retries: u32,
93) -> crate::error::Result<String> {
94 for attempt in 0..max_retries {
95 match page.content().await {
96 Ok(content) => return Ok(content),
97 Err(e) => {
98 if attempt < max_retries - 1 {
99 debug!(attempt = attempt + 1, "page.content() failed, retrying");
100 tokio::time::sleep(std::time::Duration::from_millis(500)).await;
101 } else {
102 return Err(crate::error::BrowserError::Navigation(format!(
103 "page.content() failed after {max_retries} attempts: {e}"
104 )));
105 }
106 }
107 }
108 }
109 unreachable!()
110}
111
/// Extract the character encoding from the `Content-Type` entry of `headers`.
///
/// The header is looked up under the exact key `content-type` first and then, as
/// a fallback, under any key that matches it ignoring ASCII case. Its value is
/// split on `;` and the first `charset` parameter with a non-empty value wins.
/// The parameter name is matched case-insensitively and surrounding single or
/// double quotes are removed from the value; the value's own letter case is kept.
///
/// Returns `"utf-8"` when the header is missing or carries no usable `charset`.
fn extract_encoding(headers: &HashMap<String, String>) -> String {
    const DEFAULT_ENCODING: &str = "utf-8";

    let content_type = headers.get("content-type").or_else(|| {
        headers
            .iter()
            .find(|(name, _)| name.eq_ignore_ascii_case("content-type"))
            .map(|(_, value)| value)
    });

    content_type
        .and_then(|ct| {
            ct.split(';').find_map(|part| {
                let (name, value) = part.split_once('=')?;
                if !name.trim().eq_ignore_ascii_case("charset") {
                    return None;
                }
                // Media-type parameter values may be quoted: charset="utf-8".
                let value = value
                    .trim()
                    .trim_matches(|c: char| matches!(c, '"' | '\''))
                    .trim();
                // An empty value is unusable; keep looking / fall back to default.
                (!value.is_empty()).then(|| value.to_owned())
            })
        })
        .unwrap_or_else(|| DEFAULT_ENCODING.to_owned())
}
123
124/// A captured XHR/fetch response recorded during page navigation.
125///
126/// When [`BrowserConfig::capture_xhr`] is set with a URL pattern, matching
127/// XHR/fetch responses are intercepted and stored in this struct. This lets you
128/// extract API data that the page fetches in the background without having to
129/// parse it out of the rendered HTML.
130#[derive(Debug)]
131pub struct XhrCapture {
132 /// URL of the captured XHR/fetch request.
133 /// This is the full URL including query parameters.
134 pub url: String,
135
136 /// HTTP status code of the captured response (e.g. `200`, `404`).
137 pub status: u16,
138
139 /// Response headers of the captured request.
140 /// Useful for checking `Content-Type` or pagination headers.
141 pub headers: HashMap<String, String>,
142
143 /// Raw response body bytes.
144 /// For JSON APIs you can deserialize this with `serde_json::from_slice`.
145 pub body: Bytes,
146}