1#[cfg(feature = "cdp")]
36pub mod cdp;
37pub mod detector;
38pub mod http_only;
39pub mod traits;
40
41use crw_core::config::{BUILTIN_UA_POOL, RendererConfig, StealthConfig};
42use crw_core::error::{CrwError, CrwResult};
43use crw_core::types::FetchResult;
44use std::collections::HashMap;
45use std::sync::Arc;
46use traits::PageFetcher;
47
48fn pick_ua<'a>(default_ua: &'a str, stealth: &'a StealthConfig) -> String {
50 if stealth.enabled {
51 let pool: &[&str] = if stealth.user_agents.is_empty() {
52 BUILTIN_UA_POOL
53 } else {
54 return stealth.user_agents[rand::random::<usize>() % stealth.user_agents.len()]
56 .clone();
57 };
58 pool[rand::random::<usize>() % pool.len()].to_string()
59 } else {
60 default_ua.to_string()
61 }
62}
63
/// Page fetcher that combines a plain HTTP fetcher with zero or more
/// JS-capable renderers and decides per request which of them to use.
pub struct FallbackRenderer {
    /// Plain HTTP fetcher; `fetch` always goes through it first.
    http: Arc<dyn PageFetcher>,
    /// JS renderers, tried in insertion order by `fetch_with_js`.
    /// Empty when the mode is `"none"` or no renderer was registered.
    js_renderers: Vec<Arc<dyn PageFetcher>>,
}
69
70impl FallbackRenderer {
71 pub fn new(
72 config: &RendererConfig,
73 user_agent: &str,
74 proxy: Option<&str>,
75 stealth: &StealthConfig,
76 ) -> Self {
77 let effective_ua = pick_ua(user_agent, stealth);
78 let inject_headers = stealth.enabled && stealth.inject_headers;
79 let http = Arc::new(http_only::HttpFetcher::new(
80 &effective_ua,
81 proxy,
82 inject_headers,
83 )) as Arc<dyn PageFetcher>;
84
85 #[allow(unused_mut)]
86 let mut js_renderers: Vec<Arc<dyn PageFetcher>> = Vec::new();
87
88 if config.mode == "none" {
89 return Self { http, js_renderers };
90 }
91
92 #[cfg(feature = "cdp")]
93 {
94 if let Some(lp) = &config.lightpanda {
95 js_renderers.push(Arc::new(cdp::CdpRenderer::new(
96 "lightpanda",
97 &lp.ws_url,
98 config.page_timeout_ms,
99 config.pool_size,
100 )));
101 }
102 if let Some(pw) = &config.playwright {
103 js_renderers.push(Arc::new(cdp::CdpRenderer::new(
104 "playwright",
105 &pw.ws_url,
106 config.page_timeout_ms,
107 config.pool_size,
108 )));
109 }
110 if let Some(ch) = &config.chrome {
111 js_renderers.push(Arc::new(cdp::CdpRenderer::new(
112 "chrome",
113 &ch.ws_url,
114 config.page_timeout_ms,
115 config.pool_size,
116 )));
117 }
118 }
119
120 #[cfg(not(feature = "cdp"))]
121 if config.lightpanda.is_some() || config.playwright.is_some() || config.chrome.is_some() {
122 tracing::warn!(
123 "CDP renderers configured but 'cdp' feature not enabled. JS rendering disabled."
124 );
125 }
126
127 Self { http, js_renderers }
128 }
129
130 pub async fn fetch(
138 &self,
139 url: &str,
140 headers: &HashMap<String, String>,
141 render_js: Option<bool>,
142 wait_for_ms: Option<u64>,
143 ) -> CrwResult<FetchResult> {
144 match render_js {
145 Some(false) => self.http.fetch(url, headers, None).await,
146 Some(true) => {
147 let http_result = self.http.fetch(url, headers, None).await?;
149 if http_result.content_type.as_deref() == Some("application/pdf") {
150 return Ok(http_result);
151 }
152
153 if self.js_renderers.is_empty() {
154 tracing::warn!(
155 url,
156 "JS rendering requested but no renderer available — falling back to HTTP"
157 );
158 let mut result = http_result;
159 result.rendered_with = Some("http_only_fallback".to_string());
160 result.warning = Some("JS rendering was requested but no renderer is available. Content was fetched via HTTP only.".to_string());
161 Ok(result)
162 } else {
163 self.fetch_with_js(url, headers, wait_for_ms).await
164 }
165 }
166 None => {
167 let result = self.http.fetch(url, headers, None).await?;
168
169 if result.content_type.as_deref() == Some("application/pdf") {
171 return Ok(result);
172 }
173
174 let needs_js = detector::needs_js_rendering(&result.html);
175 let is_blocked = Self::looks_like_challenge(&result.html);
176 let is_auth_blocked = matches!(result.status_code, 401 | 403);
177
178 if !self.js_renderers.is_empty() && (needs_js || is_blocked || is_auth_blocked) {
179 if is_auth_blocked {
180 tracing::info!(
181 url,
182 status_code = result.status_code,
183 "HTTP {} received, escalating to JS renderer",
184 result.status_code
185 );
186 } else if is_blocked {
187 tracing::info!(
188 url,
189 "Anti-bot challenge detected in HTTP response, escalating to JS renderer"
190 );
191 } else {
192 tracing::info!(url, "SPA shell detected, retrying with JS renderer");
193 }
194 match self.fetch_with_js(url, headers, wait_for_ms).await {
195 Ok(js_result) => Ok(js_result),
196 Err(e) => {
197 tracing::warn!("JS rendering failed, falling back to HTTP result: {e}");
198 Ok(result)
199 }
200 }
201 } else {
202 Ok(result)
203 }
204 }
205 }
206 }
207
208 fn looks_like_challenge(html: &str) -> bool {
210 if html.len() > 50_000 {
211 return false;
212 }
213 let lower = html.to_lowercase();
214 lower.contains("just a moment")
215 || lower.contains("cf-browser-verification")
216 || lower.contains("cf-challenge-running")
217 || lower.contains("challenge-platform")
218 || (lower.contains("attention required") && lower.contains("cloudflare"))
219 }
220
221 const MIN_RENDERED_TEXT_LEN: usize = 50;
225
226 async fn fetch_with_js(
227 &self,
228 url: &str,
229 headers: &HashMap<String, String>,
230 wait_for_ms: Option<u64>,
231 ) -> CrwResult<FetchResult> {
232 let mut last_error = None;
233 let mut thin_result: Option<FetchResult> = None;
234 for renderer in &self.js_renderers {
235 match renderer.fetch(url, headers, wait_for_ms).await {
236 Ok(result) => {
237 let text_len = html_body_text_len(&result.html);
238 if text_len >= Self::MIN_RENDERED_TEXT_LEN {
239 return Ok(result);
240 }
241 tracing::info!(
242 renderer = renderer.name(),
243 text_len,
244 "JS renderer returned thin content, trying next renderer"
245 );
246 if thin_result.is_none() {
247 thin_result = Some(result);
248 }
249 }
250 Err(e) => {
251 tracing::warn!(renderer = renderer.name(), "JS renderer failed: {e}");
252 last_error = Some(e);
253 continue;
254 }
255 }
256 }
257 if let Some(result) = thin_result {
259 Ok(result)
260 } else {
261 Err(last_error
262 .unwrap_or_else(|| CrwError::RendererError("No JS renderer available".to_string())))
263 }
264 }
265
266 pub async fn check_health(&self) -> HashMap<String, bool> {
268 let mut health = HashMap::new();
269 health.insert("http".to_string(), self.http.is_available().await);
270 for r in &self.js_renderers {
271 health.insert(r.name().to_string(), r.is_available().await);
272 }
273 health
274 }
275}
276
/// Estimates how much visible text an HTML document contains.
///
/// The region between the end of the first `<body…>` opening tag and the
/// next `</body>` is scanned (the whole input if there is no `<body`).
/// Everything between `<` and `>` is skipped, and each run of whitespace
/// counts as a single character; leading whitespace is not counted.
///
/// This is a cheap heuristic, not an HTML parser: tag matching is
/// case-sensitive and the contents of `<script>`/`<style>` elements are
/// counted like any other text.
///
/// Returns the number of counted characters (Unicode scalar values).
fn html_body_text_len(html: &str) -> usize {
    let body = match html.find("<body") {
        Some(tag_start) => {
            // Skip past the `>` that closes the opening tag. If the tag is
            // never closed, fall back to scanning from the beginning.
            let start = html[tag_start..]
                .find('>')
                .map(|i| tag_start + i + 1)
                .unwrap_or(0);
            // Look for the closing tag only *after* the body starts, so a
            // stray `</body>` earlier in the document cannot yield an
            // inverted range and panic when slicing.
            let end = html[start..]
                .find("</body>")
                .map_or(html.len(), |i| start + i);
            &html[start..end]
        }
        None => html,
    };

    let mut in_tag = false;
    // Starts as `true` so leading whitespace is not counted.
    let mut prev_ws = true;
    let mut text_len = 0;
    for ch in body.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if in_tag => {}
            c if c.is_whitespace() => {
                // Collapse each whitespace run to a single counted character.
                if !prev_ws {
                    text_len += 1;
                    prev_ws = true;
                }
            }
            _ => {
                text_len += 1;
                prev_ws = false;
            }
        }
    }
    text_len
}