1use crate::{
2 engines::{browser::BrowserEngine, http::HttpEngine, RawScrapeResult, ScrapeEngine},
3 error::Result,
4 types::ScrapeRequest,
5};
6use std::time::{Duration, Instant};
7use tokio::select;
8use tracing::{debug, info, warn};
9
/// Summary of one waterfall race, returned alongside the scrape result by
/// `EngineRacer::race_scrape_with_metrics`.
#[derive(Debug, Clone)]
pub struct RacerMetrics {
    /// Label of the engine whose result was returned. The racer in this file
    /// emits `"http"`, `"http_late"`, `"browser"` or `"browser_fallback"`.
    pub winning_engine: String,
    /// Wall-clock time of the whole race, in milliseconds.
    pub elapsed_ms: u64,
    /// `true` when the browser engine was launched during the race.
    pub browser_started: bool,
    /// Outcome recorded for the HTTP engine.
    pub http_status: EngineStatus,
    /// Outcome recorded for the browser engine; `None` when it was never launched.
    pub browser_status: Option<EngineStatus>,
}
24
/// Outcome of a single engine within a waterfall race.
///
/// `PartialEq`/`Eq` are derived so that metrics can be compared and asserted
/// on directly (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EngineStatus {
    /// The engine returned `Ok` after `duration_ms` milliseconds.
    Success { duration_ms: u64 },
    /// The engine returned `Err` after `duration_ms` milliseconds; `error` is
    /// the error rendered with `to_string()`.
    Failed { duration_ms: u64, error: String },
    /// The engine was not started.
    NotStarted,
    /// The engine was started but abandoned before it produced a result.
    Cancelled,
}
32
/// Runs the HTTP engine first and brings in the browser engine when HTTP is
/// too slow, fails, or returns a result that is rejected.
pub struct EngineRacer {
    /// Engine that is always started first.
    http_engine: HttpEngine,
    /// Engine started once the waterfall delay elapses or the HTTP result is rejected.
    browser_engine: BrowserEngine,
    /// How long the HTTP engine runs alone before the browser engine is started.
    waterfall_delay: Duration,
    /// When `true`, `race_scrape` only accepts an HTTP result whose extracted
    /// text is long enough; otherwise the browser engine is consulted.
    validate_quality: bool,
}
47
48impl EngineRacer {
49 pub async fn new() -> Result<Self> {
51 let waterfall_delay = Duration::from_millis(
52 std::env::var("ENGINE_WATERFALL_DELAY_MS")
53 .ok()
54 .and_then(|s| s.parse().ok())
55 .unwrap_or(5000), );
57
58 Ok(Self {
59 http_engine: HttpEngine::new()?,
60 browser_engine: BrowserEngine::new().await?,
61 waterfall_delay,
62 validate_quality: true,
63 })
64 }
65
66 pub async fn with_delay(delay_ms: u64) -> Result<Self> {
68 Ok(Self {
69 http_engine: HttpEngine::new()?,
70 browser_engine: BrowserEngine::new().await?,
71 waterfall_delay: Duration::from_millis(delay_ms),
72 validate_quality: true,
73 })
74 }
75
76 pub async fn with_options(delay_ms: u64, validate_quality: bool) -> Result<Self> {
78 Ok(Self {
79 http_engine: HttpEngine::new()?,
80 browser_engine: BrowserEngine::new().await?,
81 waterfall_delay: Duration::from_millis(delay_ms),
82 validate_quality,
83 })
84 }
85
86 pub async fn race_scrape(&self, request: &ScrapeRequest) -> Result<RawScrapeResult> {
96 let start_time = Instant::now();
97
98 debug!(
99 "Starting waterfall race with {}ms delay for URL: {}",
100 self.waterfall_delay.as_millis(),
101 request.url
102 );
103
104 let http_start = Instant::now();
106 let http_future = self.http_engine.scrape(request);
107 tokio::pin!(http_future);
108
109 let http_result = select! {
111 result = &mut http_future => {
112 let http_duration = http_start.elapsed();
113 debug!("HTTP engine completed in {}ms", http_duration.as_millis());
114 Some((result, http_duration))
115 }
116 _ = tokio::time::sleep(self.waterfall_delay) => {
117 debug!("HTTP engine timeout ({}ms), starting browser engine", self.waterfall_delay.as_millis());
118 None
119 }
120 };
121
122 let http_completed = http_result.is_some();
124
125 if let Some((result, http_duration)) = http_result {
127 match result {
128 Ok(raw) => {
129 if should_fallback_to_browser(&raw) {
131 warn!(
132 "HTTP returned blocking/error status {} in {}ms, racing with browser",
133 raw.status_code,
134 http_duration.as_millis()
135 );
136 } else if self.validate_quality {
138 let html_text = scraper::Html::parse_document(&raw.html)
141 .root_element()
142 .text()
143 .collect::<String>();
144
145 if html_text.trim().len() > 100 {
146 info!(
147 "HTTP engine won the race ({}ms) with good quality",
148 http_duration.as_millis()
149 );
150 return Ok(raw);
151 } else {
152 warn!(
153 "HTTP result has low quality (text length: {}), racing with browser",
154 html_text.trim().len()
155 );
156 }
158 } else {
159 info!("HTTP engine won the race ({}ms)", http_duration.as_millis());
160 return Ok(raw);
161 }
162 }
163 Err(e) => {
164 warn!("HTTP engine failed in {}ms: {}, racing with browser", http_duration.as_millis(), e);
165 }
167 }
168 }
169
170 let browser_start = Instant::now();
176 let browser_future = self.browser_engine.scrape(request);
177
178 let (winning_result, winning_engine) = if !http_completed {
179 select! {
181 result = http_future => {
182 let duration = http_start.elapsed();
183 info!("HTTP engine completed after waterfall ({}ms)", duration.as_millis());
184 (result, "http_late")
185 }
186 result = browser_future => {
187 let duration = browser_start.elapsed();
188 info!("Browser engine won the race ({}ms)", duration.as_millis());
189 (result, "browser")
190 }
191 }
192 } else {
193 let result = browser_future.await;
195 let duration = browser_start.elapsed();
196 info!("Browser engine used as fallback ({}ms)", duration.as_millis());
197 (result, "browser_fallback")
198 };
199
200 let total_elapsed = start_time.elapsed();
201 debug!(
202 "Race completed in {}ms, winner: {}",
203 total_elapsed.as_millis(),
204 winning_engine
205 );
206
207 winning_result
208 }
209
210 pub async fn race_scrape_with_metrics(
212 &self,
213 request: &ScrapeRequest,
214 ) -> Result<(RawScrapeResult, RacerMetrics)> {
215 let start_time = Instant::now();
216 let mut http_status = EngineStatus::NotStarted;
217
218 debug!(
219 "Starting waterfall race with metrics for URL: {}",
220 request.url
221 );
222
223 let http_start = Instant::now();
225 let http_future = self.http_engine.scrape(request);
226 tokio::pin!(http_future);
227
228 let http_result = select! {
230 result = &mut http_future => {
231 let duration = http_start.elapsed();
232 http_status = match &result {
233 Ok(_) => EngineStatus::Success { duration_ms: duration.as_millis() as u64 },
234 Err(e) => EngineStatus::Failed {
235 duration_ms: duration.as_millis() as u64,
236 error: e.to_string()
237 },
238 };
239 Some(result)
240 }
241 _ = tokio::time::sleep(self.waterfall_delay) => None
242 };
243
244 let should_continue_to_browser = if let Some(Ok(ref raw)) = http_result {
246 if should_fallback_to_browser(raw) {
248 warn!(
249 "HTTP returned blocking/error status {}, falling back to browser engine",
250 raw.status_code
251 );
252 true } else {
254 false
256 }
257 } else {
258 true
260 };
261
262 if !should_continue_to_browser {
263 if let Some(Ok(raw)) = http_result {
265 let metrics = RacerMetrics {
266 winning_engine: "http".to_string(),
267 elapsed_ms: start_time.elapsed().as_millis() as u64,
268 browser_started: false,
269 http_status,
270 browser_status: None,
271 };
272 return Ok((raw, metrics));
273 }
274 }
275
276 let browser_start = Instant::now();
278 let browser_future = self.browser_engine.scrape(request);
279
280 let (result, winning_engine, browser_status) = if http_result.is_none() {
284 select! {
286 result = http_future => {
287 let duration = http_start.elapsed();
288 http_status = match &result {
289 Ok(_) => EngineStatus::Success { duration_ms: duration.as_millis() as u64 },
290 Err(e) => EngineStatus::Failed {
291 duration_ms: duration.as_millis() as u64,
292 error: e.to_string()
293 },
294 };
295 (result, "http_late", Some(EngineStatus::Cancelled))
296 }
297 result = browser_future => {
298 let duration = browser_start.elapsed();
299 let status = match &result {
300 Ok(_) => EngineStatus::Success { duration_ms: duration.as_millis() as u64 },
301 Err(e) => EngineStatus::Failed {
302 duration_ms: duration.as_millis() as u64,
303 error: e.to_string()
304 },
305 };
306 (result, "browser", Some(status))
307 }
308 }
309 } else {
310 let result = browser_future.await;
311 let duration = browser_start.elapsed();
312 let status = match &result {
313 Ok(_) => EngineStatus::Success { duration_ms: duration.as_millis() as u64 },
314 Err(e) => EngineStatus::Failed {
315 duration_ms: duration.as_millis() as u64,
316 error: e.to_string()
317 },
318 };
319 (result, "browser_fallback", Some(status))
320 };
321
322 let metrics = RacerMetrics {
323 winning_engine: winning_engine.to_string(),
324 elapsed_ms: start_time.elapsed().as_millis() as u64,
325 browser_started: true,
326 http_status,
327 browser_status,
328 };
329
330 result.map(|r| (r, metrics))
331 }
332}
333
334fn should_fallback_to_browser(raw: &RawScrapeResult) -> bool {
336 match raw.status_code {
338 401 | 403 => {
339 info!("Detected blocking status code {}, will try browser fallback", raw.status_code);
341 true
342 }
343 429 => {
344 info!("Detected rate limit (429), will try browser fallback");
346 true
347 }
348 503 => {
349 info!("Detected service unavailable (503), will try browser fallback");
351 true
352 }
353 _ if raw.status_code >= 400 => {
354 let html_lower = raw.html.to_lowercase();
356 let is_blocking_page = html_lower.contains("access denied")
357 || html_lower.contains("blocked")
358 || html_lower.contains("captcha")
359 || html_lower.contains("cloudflare")
360 || html_lower.contains("challenge")
361 || html_lower.contains("please verify")
362 || html_lower.contains("bot detection");
363
364 if is_blocking_page {
365 info!("Detected anti-bot page content, will try browser fallback");
366 }
367 is_blocking_page
368 }
369 _ => false,
370 }
371}
372
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::ScrapeRequest;

    /// Builds an `"auto"`-engine request for `url` asking for markdown, with
    /// every other field left at its default.
    fn markdown_request(url: &str) -> ScrapeRequest {
        ScrapeRequest {
            url: url.to_string(),
            engine: "auto".to_string(),
            formats: vec!["markdown".to_string()],
            ..Default::default()
        }
    }

    /// A plain static page should come back with content and a 200 status.
    #[tokio::test]
    #[ignore = "performs a live network request"]
    async fn test_http_wins_race() {
        let racer = EngineRacer::new().await.unwrap();
        let request = markdown_request("https://example.com");

        let result = racer.race_scrape(&request).await;
        assert!(result.is_ok(), "HTTP-friendly site should succeed");

        let raw = result.unwrap();
        assert!(!raw.html.is_empty(), "Should return HTML content");
        assert_eq!(raw.status_code, 200, "Should return 200 status");
    }

    /// A JavaScript-heavy site should still produce content.
    #[tokio::test]
    #[ignore = "performs a live network request"]
    async fn test_browser_wins_race() {
        let racer = EngineRacer::new().await.unwrap();
        let request = markdown_request("https://react.dev");

        let result = racer.race_scrape(&request).await;
        assert!(result.is_ok(), "SPA site should succeed with browser");

        let raw = result.unwrap();
        assert!(!raw.html.is_empty(), "Should return HTML content");
    }

    /// With a short waterfall delay the whole race should still finish quickly.
    #[tokio::test]
    #[ignore = "performs a live network request"]
    async fn test_waterfall_timing() {
        let racer = EngineRacer::with_delay(100).await.unwrap();
        let request = markdown_request("https://example.com");

        let start = std::time::Instant::now();
        let result = racer.race_scrape(&request).await;
        let elapsed = start.elapsed();

        assert!(result.is_ok(), "Request should succeed");

        assert!(
            elapsed.as_secs() < 5,
            "HTTP should complete quickly, took: {}ms",
            elapsed.as_millis()
        );
    }

    /// A domain that does not resolve should surface an error from the race.
    #[tokio::test]
    #[ignore = "performs a live network request"]
    async fn test_http_failure_fallback() {
        let racer = EngineRacer::new().await.unwrap();
        let request =
            markdown_request("https://this-domain-does-not-exist-essence-test-12345.com");

        let result = racer.race_scrape(&request).await;
        assert!(result.is_err(), "Should fail for non-existent domain");
    }

    #[tokio::test]
    async fn test_racer_creation() {
        let racer = EngineRacer::new().await;
        assert!(racer.is_ok(), "Racer creation should succeed");
    }

    #[tokio::test]
    async fn test_racer_with_custom_delay() {
        let racer = EngineRacer::with_delay(3000).await;
        assert!(racer.is_ok(), "Racer creation with custom delay should succeed");

        let racer = racer.unwrap();
        assert_eq!(
            racer.waterfall_delay.as_millis(),
            3000,
            "Should use custom delay"
        );
    }
}