Skip to main content

essence/engines/
racer.rs

1use crate::{
2    engines::{browser::BrowserEngine, http::HttpEngine, RawScrapeResult, ScrapeEngine},
3    error::Result,
4    types::ScrapeRequest,
5};
6use std::time::{Duration, Instant};
7use tokio::select;
8use tracing::{debug, info, warn};
9
/// Summary of a single waterfall race, intended for debugging and tuning.
///
/// Implements `PartialEq`/`Eq` so that callers and tests can compare whole
/// metric snapshots with `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RacerMetrics {
    /// Label of the engine whose result was returned
    /// (`"http"`, `"http_late"`, `"browser"` or `"browser_fallback"`).
    pub winning_engine: String,
    /// Wall-clock time of the whole race, in milliseconds.
    pub elapsed_ms: u64,
    /// Whether the browser engine was started at all.
    pub browser_started: bool,
    /// Outcome of the HTTP engine.
    pub http_status: EngineStatus,
    /// Outcome of the browser engine; `None` when it was never started.
    pub browser_status: Option<EngineStatus>,
}

/// Outcome of one engine taking part in a race.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EngineStatus {
    /// The engine returned a result after `duration_ms` milliseconds.
    Success { duration_ms: u64 },
    /// The engine returned an error after `duration_ms` milliseconds;
    /// `error` is the rendered error message.
    Failed { duration_ms: u64, error: String },
    /// The engine was never started.
    NotStarted,
    /// The engine was started but abandoned before it produced a result.
    Cancelled,
}
32
33/// Engine waterfall racer - races engines with staggered starts
34///
35/// This implements a Firecrawl-style waterfall racing strategy:
36/// 1. Start HTTP engine immediately
37/// 2. If HTTP doesn't complete in `waterfall_delay_ms`, start browser
38/// 3. Return first successful result (with quality validation)
39/// 4. Cancel slower engines automatically via tokio::select!
40/// 5. Track metrics for debugging and optimization
41pub struct EngineRacer {
42    http_engine: HttpEngine,
43    browser_engine: BrowserEngine,
44    waterfall_delay: Duration,
45    validate_quality: bool,
46}
47
48impl EngineRacer {
49    /// Create a new engine racer with default settings
50    pub async fn new() -> Result<Self> {
51        let waterfall_delay = Duration::from_millis(
52            std::env::var("ENGINE_WATERFALL_DELAY_MS")
53                .ok()
54                .and_then(|s| s.parse().ok())
55                .unwrap_or(5000), // 5s default
56        );
57
58        Ok(Self {
59            http_engine: HttpEngine::new()?,
60            browser_engine: BrowserEngine::new().await?,
61            waterfall_delay,
62            validate_quality: true,
63        })
64    }
65
66    /// Create a new engine racer with custom delay
67    pub async fn with_delay(delay_ms: u64) -> Result<Self> {
68        Ok(Self {
69            http_engine: HttpEngine::new()?,
70            browser_engine: BrowserEngine::new().await?,
71            waterfall_delay: Duration::from_millis(delay_ms),
72            validate_quality: true,
73        })
74    }
75
76    /// Create racer with custom options
77    pub async fn with_options(delay_ms: u64, validate_quality: bool) -> Result<Self> {
78        Ok(Self {
79            http_engine: HttpEngine::new()?,
80            browser_engine: BrowserEngine::new().await?,
81            waterfall_delay: Duration::from_millis(delay_ms),
82            validate_quality,
83        })
84    }
85
86    /// Race engines with waterfall timeout
87    ///
88    /// Strategy:
89    /// 1. Start HTTP engine immediately
90    /// 2. If HTTP doesn't complete in `waterfall_delay`, start browser in parallel
91    /// 3. Return first successful result (with quality validation if enabled)
92    /// 4. If HTTP fails early, still race both engines for best result
93    /// 5. Slower engines are automatically cancelled by tokio::select!
94    /// 6. Track detailed metrics for debugging
95    pub async fn race_scrape(&self, request: &ScrapeRequest) -> Result<RawScrapeResult> {
96        let start_time = Instant::now();
97
98        debug!(
99            "Starting waterfall race with {}ms delay for URL: {}",
100            self.waterfall_delay.as_millis(),
101            request.url
102        );
103
104        // Start HTTP engine immediately
105        let http_start = Instant::now();
106        let http_future = self.http_engine.scrape(request);
107        tokio::pin!(http_future);
108
109        // Wait for either HTTP to complete or waterfall timeout
110        let http_result = select! {
111            result = &mut http_future => {
112                let http_duration = http_start.elapsed();
113                debug!("HTTP engine completed in {}ms", http_duration.as_millis());
114                Some((result, http_duration))
115            }
116            _ = tokio::time::sleep(self.waterfall_delay) => {
117                debug!("HTTP engine timeout ({}ms), starting browser engine", self.waterfall_delay.as_millis());
118                None
119            }
120        };
121
122        // Check if we should return the HTTP result early
123        let http_completed = http_result.is_some();
124
125        // If HTTP completed before timeout, validate and potentially return it
126        if let Some((result, http_duration)) = http_result {
127            match result {
128                Ok(raw) => {
129                    // Check for blocking/error status codes first
130                    if should_fallback_to_browser(&raw) {
131                        warn!(
132                            "HTTP returned blocking/error status {} in {}ms, racing with browser",
133                            raw.status_code,
134                            http_duration.as_millis()
135                        );
136                        // Fall through to race with browser
137                    } else if self.validate_quality {
138                        // Validate quality if enabled
139                        // We need markdown to validate, so do a quick conversion
140                        let html_text = scraper::Html::parse_document(&raw.html)
141                            .root_element()
142                            .text()
143                            .collect::<String>();
144
145                        if html_text.trim().len() > 100 {
146                            info!(
147                                "HTTP engine won the race ({}ms) with good quality",
148                                http_duration.as_millis()
149                            );
150                            return Ok(raw);
151                        } else {
152                            warn!(
153                                "HTTP result has low quality (text length: {}), racing with browser",
154                                html_text.trim().len()
155                            );
156                            // Fall through to race with browser
157                        }
158                    } else {
159                        info!("HTTP engine won the race ({}ms)", http_duration.as_millis());
160                        return Ok(raw);
161                    }
162                }
163                Err(e) => {
164                    warn!("HTTP engine failed in {}ms: {}, racing with browser", http_duration.as_millis(), e);
165                    // Fall through to race with browser
166                }
167            }
168        }
169
170        // At this point, either:
171        // 1. HTTP timed out (still running)
172        // 2. HTTP failed or had low quality
173        // Race both engines and take the first successful result
174
175        let browser_start = Instant::now();
176        let browser_future = self.browser_engine.scrape(request);
177
178        let (winning_result, winning_engine) = if !http_completed {
179            // HTTP is still running, race it with browser
180            select! {
181                result = http_future => {
182                    let duration = http_start.elapsed();
183                    info!("HTTP engine completed after waterfall ({}ms)", duration.as_millis());
184                    (result, "http_late")
185                }
186                result = browser_future => {
187                    let duration = browser_start.elapsed();
188                    info!("Browser engine won the race ({}ms)", duration.as_millis());
189                    (result, "browser")
190                }
191            }
192        } else {
193            // HTTP already completed but failed/low quality, just use browser
194            let result = browser_future.await;
195            let duration = browser_start.elapsed();
196            info!("Browser engine used as fallback ({}ms)", duration.as_millis());
197            (result, "browser_fallback")
198        };
199
200        let total_elapsed = start_time.elapsed();
201        debug!(
202            "Race completed in {}ms, winner: {}",
203            total_elapsed.as_millis(),
204            winning_engine
205        );
206
207        winning_result
208    }
209
210    /// Race engines and return result with metrics
211    pub async fn race_scrape_with_metrics(
212        &self,
213        request: &ScrapeRequest,
214    ) -> Result<(RawScrapeResult, RacerMetrics)> {
215        let start_time = Instant::now();
216        let mut http_status = EngineStatus::NotStarted;
217
218        debug!(
219            "Starting waterfall race with metrics for URL: {}",
220            request.url
221        );
222
223        // Start HTTP engine
224        let http_start = Instant::now();
225        let http_future = self.http_engine.scrape(request);
226        tokio::pin!(http_future);
227
228        // Wait for HTTP or timeout
229        let http_result = select! {
230            result = &mut http_future => {
231                let duration = http_start.elapsed();
232                http_status = match &result {
233                    Ok(_) => EngineStatus::Success { duration_ms: duration.as_millis() as u64 },
234                    Err(e) => EngineStatus::Failed {
235                        duration_ms: duration.as_millis() as u64,
236                        error: e.to_string()
237                    },
238                };
239                Some(result)
240            }
241            _ = tokio::time::sleep(self.waterfall_delay) => None
242        };
243
244        // Early HTTP success check - but validate status code first
245        let should_continue_to_browser = if let Some(Ok(ref raw)) = http_result {
246            // Check if we should fallback to browser for error/blocking status codes
247            if should_fallback_to_browser(raw) {
248                warn!(
249                    "HTTP returned blocking/error status {}, falling back to browser engine",
250                    raw.status_code
251                );
252                true // Continue to browser fallback
253            } else {
254                // HTTP succeeded with good status code, return it
255                false
256            }
257        } else {
258            // HTTP failed or timed out, need browser
259            true
260        };
261
262        if !should_continue_to_browser {
263            // HTTP succeeded with good status, return it
264            if let Some(Ok(raw)) = http_result {
265                let metrics = RacerMetrics {
266                    winning_engine: "http".to_string(),
267                    elapsed_ms: start_time.elapsed().as_millis() as u64,
268                    browser_started: false,
269                    http_status,
270                    browser_status: None,
271                };
272                return Ok((raw, metrics));
273            }
274        }
275
276        // Start browser
277        let browser_start = Instant::now();
278        let browser_future = self.browser_engine.scrape(request);
279
280        // Race remaining futures
281        // If HTTP timed out (http_result is None), race both futures
282        // If HTTP completed but we're falling back, just use browser
283        let (result, winning_engine, browser_status) = if http_result.is_none() {
284            // HTTP is still running, race it with browser
285            select! {
286                result = http_future => {
287                    let duration = http_start.elapsed();
288                    http_status = match &result {
289                        Ok(_) => EngineStatus::Success { duration_ms: duration.as_millis() as u64 },
290                        Err(e) => EngineStatus::Failed {
291                            duration_ms: duration.as_millis() as u64,
292                            error: e.to_string()
293                        },
294                    };
295                    (result, "http_late", Some(EngineStatus::Cancelled))
296                }
297                result = browser_future => {
298                    let duration = browser_start.elapsed();
299                    let status = match &result {
300                        Ok(_) => EngineStatus::Success { duration_ms: duration.as_millis() as u64 },
301                        Err(e) => EngineStatus::Failed {
302                            duration_ms: duration.as_millis() as u64,
303                            error: e.to_string()
304                        },
305                    };
306                    (result, "browser", Some(status))
307                }
308            }
309        } else {
310            let result = browser_future.await;
311            let duration = browser_start.elapsed();
312            let status = match &result {
313                Ok(_) => EngineStatus::Success { duration_ms: duration.as_millis() as u64 },
314                Err(e) => EngineStatus::Failed {
315                    duration_ms: duration.as_millis() as u64,
316                    error: e.to_string()
317                },
318            };
319            (result, "browser_fallback", Some(status))
320        };
321
322        let metrics = RacerMetrics {
323            winning_engine: winning_engine.to_string(),
324            elapsed_ms: start_time.elapsed().as_millis() as u64,
325            browser_started: true,
326            http_status,
327            browser_status,
328        };
329
330        result.map(|r| (r, metrics))
331    }
332}
333
334/// Check if we should fallback to browser engine based on HTTP response
335fn should_fallback_to_browser(raw: &RawScrapeResult) -> bool {
336    // Status codes that indicate blocking, authentication, or anti-bot protection
337    match raw.status_code {
338        401 | 403 => {
339            // Unauthorized or Forbidden - likely anti-bot or auth required
340            info!("Detected blocking status code {}, will try browser fallback", raw.status_code);
341            true
342        }
343        429 => {
344            // Rate limited - browser might help with different fingerprint
345            info!("Detected rate limit (429), will try browser fallback");
346            true
347        }
348        503 => {
349            // Service unavailable - might be anti-bot protection
350            info!("Detected service unavailable (503), will try browser fallback");
351            true
352        }
353        _ if raw.status_code >= 400 => {
354            // Other client/server errors - check if page looks like anti-bot
355            let html_lower = raw.html.to_lowercase();
356            let is_blocking_page = html_lower.contains("access denied")
357                || html_lower.contains("blocked")
358                || html_lower.contains("captcha")
359                || html_lower.contains("cloudflare")
360                || html_lower.contains("challenge")
361                || html_lower.contains("please verify")
362                || html_lower.contains("bot detection");
363
364            if is_blocking_page {
365                info!("Detected anti-bot page content, will try browser fallback");
366            }
367            is_blocking_page
368        }
369        _ => false,
370    }
371}
372
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::ScrapeRequest;

    /// Build the request shape shared by every test: `auto` engine, markdown
    /// output, all other fields left at their defaults.
    fn markdown_request(url: &str) -> ScrapeRequest {
        ScrapeRequest {
            url: url.to_string(),
            engine: "auto".to_string(),
            formats: vec!["markdown".to_string()],
            ..Default::default()
        }
    }

    #[tokio::test]
    #[ignore] // Requires network
    async fn test_http_wins_race() {
        let racer = EngineRacer::new().await.unwrap();
        let request = markdown_request("https://example.com");

        let raw = racer
            .race_scrape(&request)
            .await
            .expect("HTTP-friendly site should succeed");

        assert!(!raw.html.is_empty(), "Should return HTML content");
        assert_eq!(raw.status_code, 200, "Should return 200 status");
    }

    #[tokio::test]
    #[ignore] // Requires network and browser
    async fn test_browser_wins_race() {
        // Use a SPA-heavy site that needs browser rendering
        let racer = EngineRacer::new().await.unwrap();
        let request = markdown_request("https://react.dev"); // React docs are a SPA

        let raw = racer
            .race_scrape(&request)
            .await
            .expect("SPA site should succeed with browser");

        assert!(!raw.html.is_empty(), "Should return HTML content");
    }

    #[tokio::test]
    #[ignore] // Requires network
    async fn test_waterfall_timing() {
        // Set a very short waterfall delay to test the mechanism
        let racer = EngineRacer::with_delay(100).await.unwrap(); // 100ms delay
        let request = markdown_request("https://example.com");

        let start = std::time::Instant::now();
        let result = racer.race_scrape(&request).await;
        let elapsed = start.elapsed();

        assert!(result.is_ok(), "Request should succeed");

        // HTTP should win quickly (< 5s for example.com)
        assert!(
            elapsed.as_secs() < 5,
            "HTTP should complete quickly, took: {}ms",
            elapsed.as_millis()
        );
    }

    #[tokio::test]
    #[ignore] // Requires network
    async fn test_http_failure_fallback() {
        let racer = EngineRacer::new().await.unwrap();

        // Use an invalid URL that will fail quickly on HTTP
        let request =
            markdown_request("https://this-domain-does-not-exist-essence-test-12345.com");

        let result = racer.race_scrape(&request).await;
        // Both engines should fail for a non-existent domain
        assert!(result.is_err(), "Should fail for non-existent domain");
    }

    #[tokio::test]
    async fn test_racer_creation() {
        let racer = EngineRacer::new().await;
        assert!(racer.is_ok(), "Racer creation should succeed");
    }

    #[tokio::test]
    async fn test_racer_with_custom_delay() {
        let racer = EngineRacer::with_delay(3000)
            .await
            .expect("Racer creation with custom delay should succeed");

        assert_eq!(
            racer.waterfall_delay.as_millis(),
            3000,
            "Should use custom delay"
        );
    }
}