1#[cfg(feature = "auto-browser")]
36pub mod browser;
37#[cfg(feature = "cdp")]
38pub mod cdp;
39pub mod detector;
40pub mod http_only;
41pub mod traits;
42
43use crw_core::config::{BUILTIN_UA_POOL, RendererConfig, StealthConfig};
44use crw_core::error::{CrwError, CrwResult};
45use crw_core::types::FetchResult;
46use std::collections::HashMap;
47use std::sync::Arc;
48use traits::PageFetcher;
49
50fn pick_ua<'a>(default_ua: &'a str, stealth: &'a StealthConfig) -> String {
52 if stealth.enabled {
53 let pool: &[&str] = if stealth.user_agents.is_empty() {
54 BUILTIN_UA_POOL
55 } else {
56 return stealth.user_agents[rand::random::<usize>() % stealth.user_agents.len()]
58 .clone();
59 };
60 pool[rand::random::<usize>() % pool.len()].to_string()
61 } else {
62 default_ua.to_string()
63 }
64}
65
/// Page fetcher that combines a plain HTTP fetcher with an ordered list of
/// JS-capable renderers.
pub struct FallbackRenderer {
    /// Fetcher used for plain HTTP requests; always present.
    http: Arc<dyn PageFetcher>,
    /// JS renderers, tried in the order they were registered. May be empty.
    js_renderers: Vec<Arc<dyn PageFetcher>>,
}
71
72impl FallbackRenderer {
73 pub fn new(
74 config: &RendererConfig,
75 user_agent: &str,
76 proxy: Option<&str>,
77 stealth: &StealthConfig,
78 ) -> Self {
79 let effective_ua = pick_ua(user_agent, stealth);
80 let inject_headers = stealth.enabled && stealth.inject_headers;
81 let http = Arc::new(http_only::HttpFetcher::new(
82 &effective_ua,
83 proxy,
84 inject_headers,
85 )) as Arc<dyn PageFetcher>;
86
87 #[allow(unused_mut)]
88 let mut js_renderers: Vec<Arc<dyn PageFetcher>> = Vec::new();
89
90 if config.mode == "none" {
91 return Self { http, js_renderers };
92 }
93
94 #[cfg(feature = "cdp")]
95 {
96 if let Some(lp) = &config.lightpanda {
97 js_renderers.push(Arc::new(cdp::CdpRenderer::new(
98 "lightpanda",
99 &lp.ws_url,
100 config.page_timeout_ms,
101 config.pool_size,
102 )));
103 }
104 if let Some(pw) = &config.playwright {
105 js_renderers.push(Arc::new(cdp::CdpRenderer::new(
106 "playwright",
107 &pw.ws_url,
108 config.page_timeout_ms,
109 config.pool_size,
110 )));
111 }
112 if let Some(ch) = &config.chrome {
113 js_renderers.push(Arc::new(cdp::CdpRenderer::new(
114 "chrome",
115 &ch.ws_url,
116 config.page_timeout_ms,
117 config.pool_size,
118 )));
119 }
120 }
121
122 #[cfg(not(feature = "cdp"))]
123 if config.lightpanda.is_some() || config.playwright.is_some() || config.chrome.is_some() {
124 tracing::warn!(
125 "CDP renderers configured but 'cdp' feature not enabled. JS rendering disabled."
126 );
127 }
128
129 Self { http, js_renderers }
130 }
131
132 pub async fn fetch(
140 &self,
141 url: &str,
142 headers: &HashMap<String, String>,
143 render_js: Option<bool>,
144 wait_for_ms: Option<u64>,
145 ) -> CrwResult<FetchResult> {
146 match render_js {
147 Some(false) => self.http.fetch(url, headers, None).await,
148 Some(true) => {
149 let http_result = self.http.fetch(url, headers, None).await?;
151 if http_result.content_type.as_deref() == Some("application/pdf") {
152 return Ok(http_result);
153 }
154
155 if self.js_renderers.is_empty() {
156 tracing::warn!(
157 url,
158 "JS rendering requested but no renderer available — falling back to HTTP"
159 );
160 let mut result = http_result;
161 result.rendered_with = Some("http_only_fallback".to_string());
162 result.warning = Some("JS rendering was requested but no renderer is available. Content was fetched via HTTP only.".to_string());
163 Ok(result)
164 } else {
165 self.fetch_with_js(url, headers, wait_for_ms).await
166 }
167 }
168 None => {
169 let result = self.http.fetch(url, headers, None).await?;
170
171 if result.content_type.as_deref() == Some("application/pdf") {
173 return Ok(result);
174 }
175
176 let needs_js = detector::needs_js_rendering(&result.html);
177 let is_blocked = Self::looks_like_challenge(&result.html);
178 let is_auth_blocked = matches!(result.status_code, 401 | 403);
179
180 if !self.js_renderers.is_empty() && (needs_js || is_blocked || is_auth_blocked) {
181 if is_auth_blocked {
182 tracing::info!(
183 url,
184 status_code = result.status_code,
185 "HTTP {} received, escalating to JS renderer",
186 result.status_code
187 );
188 } else if is_blocked {
189 tracing::info!(
190 url,
191 "Anti-bot challenge detected in HTTP response, escalating to JS renderer"
192 );
193 } else {
194 tracing::info!(url, "SPA shell detected, retrying with JS renderer");
195 }
196 match self.fetch_with_js(url, headers, wait_for_ms).await {
197 Ok(js_result) => Ok(js_result),
198 Err(e) => {
199 tracing::warn!("JS rendering failed, falling back to HTTP result: {e}");
200 Ok(result)
201 }
202 }
203 } else {
204 Ok(result)
205 }
206 }
207 }
208 }
209
/// Heuristically detects an anti-bot challenge page in `html`.
///
/// Documents larger than 50,000 bytes are never treated as challenges.
/// Otherwise the lowercased document is searched for known challenge
/// markers; the "attention required" marker only counts together with
/// "cloudflare".
fn looks_like_challenge(html: &str) -> bool {
    const MAX_CHALLENGE_PAGE_BYTES: usize = 50_000;
    const STANDALONE_MARKERS: [&str; 4] = [
        "just a moment",
        "cf-browser-verification",
        "cf-challenge-running",
        "challenge-platform",
    ];

    if html.len() > MAX_CHALLENGE_PAGE_BYTES {
        return false;
    }

    let haystack = html.to_lowercase();
    let has_standalone_marker = STANDALONE_MARKERS
        .iter()
        .any(|&marker| haystack.contains(marker));

    has_standalone_marker
        || (haystack.contains("attention required") && haystack.contains("cloudflare"))
}
222
/// Minimum body-text length (as counted by `html_body_text_len`) a JS-rendered
/// page must reach to be accepted outright; shorter results are treated as
/// thin content and the next renderer is tried.
const MIN_RENDERED_TEXT_LEN: usize = 50;
227
228 async fn fetch_with_js(
229 &self,
230 url: &str,
231 headers: &HashMap<String, String>,
232 wait_for_ms: Option<u64>,
233 ) -> CrwResult<FetchResult> {
234 let mut last_error = None;
235 let mut thin_result: Option<FetchResult> = None;
236 for renderer in &self.js_renderers {
237 match renderer.fetch(url, headers, wait_for_ms).await {
238 Ok(result) => {
239 let text_len = html_body_text_len(&result.html);
240 if text_len >= Self::MIN_RENDERED_TEXT_LEN {
241 return Ok(result);
242 }
243 tracing::info!(
244 renderer = renderer.name(),
245 text_len,
246 "JS renderer returned thin content, trying next renderer"
247 );
248 if thin_result.is_none() {
249 thin_result = Some(result);
250 }
251 }
252 Err(e) => {
253 tracing::warn!(renderer = renderer.name(), "JS renderer failed: {e}");
254 last_error = Some(e);
255 continue;
256 }
257 }
258 }
259 if let Some(result) = thin_result {
261 Ok(result)
262 } else {
263 Err(last_error
264 .unwrap_or_else(|| CrwError::RendererError("No JS renderer available".to_string())))
265 }
266 }
267
268 pub async fn check_health(&self) -> HashMap<String, bool> {
270 let mut health = HashMap::new();
271 health.insert("http".to_string(), self.http.is_available().await);
272 for r in &self.js_renderers {
273 health.insert(r.name().to_string(), r.is_available().await);
274 }
275 health
276 }
277}
278
/// Returns the approximate length of the visible text in an HTML document.
///
/// Only the content between the opening `<body ...>` tag and the first
/// `</body>` that follows it is considered; when no `<body` tag is found the
/// whole input is scanned. Everything between `<` and `>` is skipped, and
/// runs of whitespace count as a single character (leading whitespace is not
/// counted). The result is a count of characters, not bytes.
///
/// This is a cheap heuristic, not an HTML parser: text inside elements such
/// as `<script>` is counted like any other text.
fn html_body_text_len(html: &str) -> usize {
    let body = match html.find("<body") {
        Some(tag_start) => {
            // Skip past the end of the opening tag. If it is never closed,
            // fall back to the start of the document.
            let content_start = html[tag_start..]
                .find('>')
                .map_or(0, |i| tag_start + i + 1);
            // Look for the closing tag only after the body starts: a
            // `</body>` earlier in the document (e.g. inside a script string
            // in `<head>`) would otherwise yield an end before the start and
            // make the slice below panic.
            let content_end = html[content_start..]
                .find("</body>")
                .map_or(html.len(), |i| content_start + i);
            &html[content_start..content_end]
        }
        None => html,
    };

    let mut in_tag = false;
    let mut text_len = 0;
    // Starts as `true` so leading whitespace is not counted.
    let mut prev_ws = true;
    for ch in body.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            c if c.is_whitespace() => {
                if !prev_ws {
                    text_len += 1;
                    prev_ws = true;
                }
            }
            _ => {
                text_len += 1;
                prev_ws = false;
            }
        }
    }
    text_len
}