1use crate::{
2 engines::{RawScrapeResult, ScrapeEngine, stealth::{apply_stealth_techniques, StealthMode}},
3 error::{Result, ScrapeError},
4 types::{BrowserAction, ScrapeRequest},
5 utils::{url_rewrites::rewrite_url, user_agents::random_user_agent},
6};
7use async_trait::async_trait;
8use base64::{engine::general_purpose, Engine as _};
9use chromiumoxide::{
10 browser::{Browser, BrowserConfig},
11 cdp::browser_protocol::{
12 fetch::{
13 EnableParams, EventRequestPaused, FailRequestParams, ContinueRequestParams,
14 },
15 network::ErrorReason,
16 page::CaptureScreenshotFormat,
17 },
18 Page,
19};
20use futures::StreamExt;
21use std::{env, path::PathBuf, sync::Arc, time::{Duration, Instant}};
22use tokio::sync::{Mutex, OwnedSemaphorePermit, Semaphore};
23use tracing::{debug, info, warn};
24
/// Advertising, analytics and tracking domains.
///
/// Consulted by `BrowserEngine::setup_request_blocking` to decide which
/// intercepted requests are failed instead of being allowed to continue.
const BLOCKED_DOMAINS: &[&str] = &[
    "googletagmanager.com",
    "google-analytics.com",
    "analytics.google.com",
    "facebook.net",
    "connect.facebook.net",
    "doubleclick.net",
    "googlesyndication.com",
    "adservice.google.com",
    "static.ads-twitter.com",
    "ads-twitter.com",
    "ads.linkedin.com",
    "bat.bing.com",
    "stats.wp.com",
    "scorecardresearch.com",
    "quantserve.com",
    "chartbeat.com",
    "hotjar.com",
    "mouseflow.com",
    "mixpanel.com",
    "segment.io",
    "segment.com",
    "analytics.tiktok.com",
    "mktoresp.com",
    "pardot.com",
];
53
/// A bounded pool of reusable Chrome/Chromium browser instances.
///
/// Callers obtain a browser through `get_browser`, which hands back a
/// `BrowserGuard`; dropping the guard puts the browser back on the idle list.
pub struct BrowserPool {
    /// Limits how many `BrowserGuard`s can be outstanding at the same time.
    semaphore: Arc<Semaphore>,
    /// The pool size the semaphore was created with (stored but not read).
    #[allow(dead_code)]
    max_instances: usize,
    /// Whether browsers are launched headless; `new` initialises it from
    /// `BROWSER_HEADLESS` (default `true`).
    headless: bool,
    /// Fixed User-Agent for launched browsers; `new` initialises it from
    /// `BROWSER_USER_AGENT`. When `None`, a random one is chosen per launch.
    user_agent: Option<String>,
    /// Idle browsers waiting to be reused.
    browsers: Arc<Mutex<Vec<Browser>>>,
    /// Path of the Chrome/Chromium executable used for every launch.
    chrome_path: PathBuf,
}
64
65impl BrowserPool {
66 pub async fn new(max_instances: usize) -> Result<Self> {
68 let headless = std::env::var("BROWSER_HEADLESS")
69 .ok()
70 .and_then(|v| v.parse().ok())
71 .unwrap_or(true);
72
73 let user_agent = std::env::var("BROWSER_USER_AGENT").ok();
74
75 let chrome_path = Self::find_chrome_executable()?;
77 info!("Found Chrome at: {}", chrome_path.display());
78
79 Ok(Self {
80 semaphore: Arc::new(Semaphore::new(max_instances)),
81 max_instances,
82 headless,
83 user_agent,
84 browsers: Arc::new(Mutex::new(Vec::new())),
85 chrome_path,
86 })
87 }
88
89 pub async fn get_browser(self: &Arc<Self>) -> Result<BrowserGuard> {
91 let permit = self
93 .semaphore
94 .clone()
95 .acquire_owned()
96 .await
97 .map_err(|e| ScrapeError::Internal(format!("Failed to acquire browser: {}", e)))?;
98
99 let browser = {
101 let mut browsers = self.browsers.lock().await;
102
103 while let Some(mut b) = browsers.pop() {
105 if Self::is_browser_healthy(&b).await {
106 debug!("Reusing existing browser instance");
107 return Ok(BrowserGuard {
108 browser: Some(b),
109 pool: self.browsers.clone(),
110 _permit: permit,
111 });
112 } else {
113 warn!("Discarding unhealthy browser");
115 let _ = b.close().await;
116 }
117 }
118
119 debug!("Creating new browser instance");
121 self.create_browser().await?
122 };
123
124 Ok(BrowserGuard {
125 browser: Some(browser),
126 pool: self.browsers.clone(),
127 _permit: permit,
128 })
129 }
130
131 async fn is_browser_healthy(browser: &Browser) -> bool {
133 match browser.version().await {
135 Ok(_) => true,
136 Err(_) => {
137 warn!("Browser health check failed");
138 false
139 }
140 }
141 }
142
143 async fn create_browser(&self) -> Result<Browser> {
145 info!("Launching new browser instance (headless: {})", self.headless);
146
147 let mut browser_config = BrowserConfig::builder()
148 .chrome_executable(&self.chrome_path);
149
150 if self.headless {
152 browser_config = browser_config.no_sandbox().disable_default_args();
153 }
154
155 let mut args = vec![
157 "--disable-blink-features=AutomationControlled",
158 "--disable-dev-shm-usage",
159 "--disable-web-security",
160 "--disable-features=IsolateOrigins,site-per-process",
161 "--allow-running-insecure-content",
162 "--disable-setuid-sandbox",
163 "--no-first-run",
164 "--no-default-browser-check",
165 "--disable-popup-blocking",
166 ];
167
168 if self.headless {
169 args.push("--headless=new");
170 }
171
172 args.push("--window-size=1920,1080");
174
175 for arg in args {
176 browser_config = browser_config.arg(arg);
177 }
178
179 let unique_dir = std::env::temp_dir()
182 .join(format!("essence-browser-{}", uuid::Uuid::new_v4()));
183 browser_config = browser_config.arg(format!(
184 "--user-data-dir={}",
185 unique_dir.display()
186 ));
187
188 let user_agent = match &self.user_agent {
190 Some(ua) => ua.as_str(),
191 None => random_user_agent(),
192 };
193 debug!("Using User-Agent for browser: {}", user_agent);
194 browser_config = browser_config.arg(format!("--user-agent={}", user_agent));
195
196 let (browser, mut handler) = Browser::launch(browser_config.build().map_err(|e| {
197 ScrapeError::BrowserLaunchFailed(format!("Failed to build browser config: {}", e))
198 })?)
199 .await
200 .map_err(|e| {
201 ScrapeError::BrowserLaunchFailed(format!("Failed to launch browser: {}", e))
202 })?;
203
204 tokio::spawn(async move {
206 while let Some(event) = handler.next().await {
207 if let Err(e) = event {
208 warn!("Browser handler error: {}", e);
209 }
210 }
211 debug!("Browser handler task finished");
212 });
213
214 info!("Browser instance created successfully");
215 Ok(browser)
216 }
217
218 fn find_chrome_executable() -> Result<PathBuf> {
220 if let Ok(path) = std::env::var("CHROME_PATH") {
222 let path_buf = PathBuf::from(&path);
223 if path_buf.exists() {
224 info!("Using Chrome from CHROME_PATH: {}", path);
225 return Ok(path_buf);
226 }
227 }
228
229 let paths: Vec<&str> = if cfg!(target_os = "macos") {
231 vec![
232 "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
233 "/Applications/Chromium.app/Contents/MacOS/Chromium",
234 "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
235 "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
236 ]
237 } else if cfg!(target_os = "linux") {
238 vec![
239 "/usr/bin/google-chrome",
240 "/usr/bin/google-chrome-stable",
241 "/usr/bin/chromium",
242 "/usr/bin/chromium-browser",
243 "/snap/bin/chromium",
244 "/usr/bin/microsoft-edge",
245 "/usr/bin/microsoft-edge-stable",
246 ]
247 } else if cfg!(target_os = "windows") {
248 vec![
249 "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
250 "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
251 "C:\\Program Files\\Chromium\\Application\\chrome.exe",
252 "C:\\Program Files (x86)\\Chromium\\Application\\chrome.exe",
253 "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe",
254 "C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe",
255 ]
256 } else {
257 vec![]
258 };
259
260 for path in paths {
261 if std::fs::metadata(path).is_ok() {
262 info!("Found Chrome at: {}", path);
263 return Ok(PathBuf::from(path));
264 }
265 }
266
267 let install_msg = if cfg!(target_os = "macos") {
269 "brew install --cask chromium"
270 } else if cfg!(target_os = "linux") {
271 "sudo apt-get install chromium-browser # Ubuntu/Debian\nsudo yum install chromium # RHEL/CentOS"
272 } else if cfg!(target_os = "windows") {
273 "Install Chrome from https://www.google.com/chrome/"
274 } else {
275 "Install Chrome or Chromium"
276 };
277
278 Err(ScrapeError::BrowserNotFound(format!(
279 "Chrome/Chromium not found. Please install it:\n{}",
280 install_msg
281 )))
282 }
283}
284
285impl Drop for BrowserPool {
286 fn drop(&mut self) {
287 info!("Shutting down browser pool");
288
289 if let Ok(browsers) = self.browsers.try_lock() {
292 info!("Browser pool had {} instances at shutdown", browsers.len());
293 } else {
294 info!("Browser pool shutting down (browsers still in use)");
295 }
296 }
297}
298
/// A checked-out browser that is handed back to its pool when dropped.
///
/// Dereferences to the wrapped `Browser`.
pub struct BrowserGuard {
    /// The checked-out browser; an `Option` so that `Drop` can move it out.
    browser: Option<Browser>,
    /// The idle list of the owning pool, where the browser is pushed on drop.
    pool: Arc<Mutex<Vec<Browser>>>,
    /// Pool slot held for as long as the guard is alive.
    _permit: OwnedSemaphorePermit,
}
305
306impl Drop for BrowserGuard {
307 fn drop(&mut self) {
308 if let Some(browser) = self.browser.take() {
310 let pool = self.pool.clone();
311
312 tokio::spawn(async move {
313 let mut browsers = pool.lock().await;
314 browsers.push(browser);
315 debug!("Browser returned to pool (total: {})", browsers.len());
316 });
317 }
318 }
319}
320
321impl std::ops::Deref for BrowserGuard {
322 type Target = Browser;
323
324 fn deref(&self) -> &Self::Target {
325 self.browser
326 .as_ref()
327 .expect("Browser guard must have a browser")
328 }
329}
330
/// Scrape engine that renders pages in a pooled headless browser.
pub struct BrowserEngine {
    /// Shared pool from which each scrape borrows a browser.
    pool: Arc<BrowserPool>,
}
335
336impl BrowserEngine {
337 pub async fn new() -> Result<Self> {
339 Self::with_config(BrowserEngineConfig::default()).await
340 }
341
342 pub async fn with_config(config: BrowserEngineConfig) -> Result<Self> {
344 let pool = Arc::new(BrowserPool::new(config.pool_size).await?);
345
346 info!("Browser engine initialized with pool size: {}", config.pool_size);
347
348 Ok(Self { pool })
349 }
350
351 async fn setup_request_blocking(&self, page: &Page) -> Result<()> {
353 let block_ads = std::env::var("BROWSER_BLOCK_ADS")
355 .ok()
356 .and_then(|v| v.parse().ok())
357 .unwrap_or(true); if !block_ads {
360 debug!("Ad blocking disabled");
361 return Ok(());
362 }
363
364 info!("Enabling ad/analytics blocking for {} domains", BLOCKED_DOMAINS.len());
365
366 page.execute(EnableParams::default())
368 .await
369 .map_err(|e| ScrapeError::BrowserError(format!("Failed to enable fetch domain: {}", e)))?;
370
371 let page = page.clone();
373
374 tokio::spawn(async move {
376 let mut event_stream = match page.event_listener::<EventRequestPaused>().await {
378 Ok(stream) => stream,
379 Err(e) => {
380 warn!("Failed to create request event listener: {}", e);
381 return;
382 }
383 };
384
385 while let Some(event) = event_stream.next().await {
386 let url = &event.request.url;
387
388 let should_block = BLOCKED_DOMAINS.iter().any(|domain| url.contains(domain));
390
391 if should_block {
392 debug!("Blocking request to: {}", url);
393 let params = FailRequestParams::new(
395 event.request_id.clone(),
396 ErrorReason::BlockedByClient,
397 );
398 if let Err(e) = page.execute(params).await {
399 warn!("Failed to block request: {}", e);
400 }
401 } else {
402 let params = ContinueRequestParams::new(event.request_id.clone());
404 if let Err(e) = page.execute(params).await {
405 warn!("Failed to continue request: {}", e);
406 }
407 }
408 }
409
410 debug!("Request interception task finished");
411 });
412
413 Ok(())
414 }
415
416 async fn execute_actions(&self, page: &Page, actions: &[BrowserAction]) -> Result<()> {
418 for action in actions {
419 match action {
420 BrowserAction::Click { selector } => {
421 debug!("Clicking element: {}", selector);
422 let element = page.find_element(selector).await.map_err(|e| {
423 ScrapeError::ElementNotFound(format!(
424 "Failed to find element {}: {}",
425 selector, e
426 ))
427 })?;
428
429 element.click().await.map_err(|e| {
430 ScrapeError::BrowserError(format!("Failed to click element: {}", e))
431 })?;
432 }
433 BrowserAction::Type { selector, text } => {
434 debug!("Typing '{}' into element: {}", text, selector);
435 let element = page.find_element(selector).await.map_err(|e| {
436 ScrapeError::ElementNotFound(format!(
437 "Failed to find element {}: {}",
438 selector, e
439 ))
440 })?;
441
442 element.click().await.map_err(|e| {
443 ScrapeError::BrowserError(format!("Failed to focus element: {}", e))
444 })?;
445
446 element.type_str(text).await.map_err(|e| {
447 ScrapeError::BrowserError(format!("Failed to type text: {}", e))
448 })?;
449 }
450 BrowserAction::Scroll { direction } => {
451 debug!("Scrolling: {}", direction);
452 let script = match direction.as_str() {
453 "down" => "window.scrollBy(0, window.innerHeight);",
454 "up" => "window.scrollBy(0, -window.innerHeight);",
455 "bottom" => "window.scrollTo(0, document.body.scrollHeight);",
456 "top" => "window.scrollTo(0, 0);",
457 _ => {
458 return Err(ScrapeError::BrowserError(format!(
459 "Invalid scroll direction: {}",
460 direction
461 )))
462 }
463 };
464
465 page.evaluate(script).await.map_err(|e| {
466 ScrapeError::BrowserError(format!("Failed to scroll: {}", e))
467 })?;
468 }
469 BrowserAction::Wait { milliseconds } => {
470 debug!("Waiting for {} ms", milliseconds);
471 tokio::time::sleep(Duration::from_millis(*milliseconds)).await;
472 }
473 BrowserAction::WaitForSelector { selector } => {
474 debug!("Waiting for selector: {}", selector);
475 page.find_element(selector).await.map_err(|e| {
476 ScrapeError::ElementNotFound(format!(
477 "Element not found after waiting: {}",
478 e
479 ))
480 })?;
481 }
482 }
483 }
484
485 Ok(())
486 }
487
488 #[allow(dead_code)]
491 async fn capture_screenshot(&self, page: &Page, format: &str) -> Result<String> {
492 let screenshot_format = match format {
493 "jpeg" | "jpg" => CaptureScreenshotFormat::Jpeg,
494 _ => CaptureScreenshotFormat::Png,
495 };
496
497 let screenshot_bytes = page
498 .screenshot(
499 chromiumoxide::page::ScreenshotParams::builder()
500 .format(screenshot_format)
501 .full_page(true)
502 .build(),
503 )
504 .await
505 .map_err(|e| {
506 ScrapeError::BrowserError(format!("Failed to capture screenshot: {}", e))
507 })?;
508
509 Ok(general_purpose::STANDARD.encode(&screenshot_bytes))
510 }
511}
512
513#[async_trait]
514impl ScrapeEngine for BrowserEngine {
515 async fn scrape(&self, request: &ScrapeRequest) -> Result<RawScrapeResult> {
516 let start = Instant::now();
517
518 info!("Starting browser scrape for URL: {}", request.url);
519
520 let browser = self.pool.get_browser().await?;
522
523 let timeout = Duration::from_millis(request.timeout);
525
526 let result = tokio::time::timeout(timeout, async {
527 debug!("Creating new page");
529 let page = browser
530 .new_page("about:blank")
531 .await
532 .map_err(|e| ScrapeError::BrowserError(format!("Failed to create page: {}", e)))?;
533
534 let stealth_mode = StealthMode::from_env();
536 apply_stealth_techniques(&page, stealth_mode).await?;
537
538 self.setup_request_blocking(&page).await?;
540
541 let url_to_scrape = rewrite_url(&request.url);
543
544 debug!("Navigating to URL: {}", url_to_scrape);
546 page.goto(&url_to_scrape)
547 .await
548 .map_err(|e| ScrapeError::NavigationFailed(format!("Failed to navigate: {}", e)))?;
549
550 debug!("Waiting for network idle");
552 page.wait_for_navigation()
553 .await
554 .map_err(|e| ScrapeError::NavigationFailed(format!("Navigation timeout: {}", e)))?;
555
556 if let Some(selector) = &request.wait_for_selector {
558 debug!("Waiting for selector: {}", selector);
559 page.find_element(selector)
560 .await
561 .map_err(|e| ScrapeError::ElementNotFound(format!("Selector not found: {}", e)))?;
562 }
563
564 if request.wait_for > 0 {
566 debug!("Additional wait for {} ms", request.wait_for);
567 tokio::time::sleep(Duration::from_millis(request.wait_for)).await;
568 }
569
570 if !request.actions.is_empty() {
572 debug!("Executing {} browser actions", request.actions.len());
573 self.execute_actions(&page, &request.actions).await?;
574 }
575
576 let final_url = page
578 .url()
579 .await
580 .map_err(|e| ScrapeError::BrowserError(format!("Failed to get URL: {}", e)))?
581 .unwrap_or_else(|| request.url.clone());
582
583 debug!("Extracting HTML content");
585 let html = page
586 .content()
587 .await
588 .map_err(|e| ScrapeError::BrowserError(format!("Failed to get HTML: {}", e)))?;
589
590 let max_response_size_mb = std::env::var("MAX_RESPONSE_SIZE_MB")
592 .ok()
593 .and_then(|s| s.parse::<usize>().ok())
594 .unwrap_or(50);
595
596 let max_size_bytes = max_response_size_mb * 1024 * 1024;
597
598 if html.len() > max_size_bytes {
599 return Err(ScrapeError::ResourceLimit(format!(
600 "Response too large: {:.2}MB > {}MB",
601 html.len() as f64 / (1024.0 * 1024.0),
602 max_response_size_mb
603 )));
604 }
605
606 info!("Successfully scraped URL with browser: {}", final_url);
607
608 if let Err(e) = page.close().await {
610 warn!("Failed to close page: {}", e);
611 }
612
613 Ok::<_, ScrapeError>(RawScrapeResult {
614 url: final_url,
615 status_code: 200, content_type: Some("text/html".to_string()),
617 html,
618 headers: vec![],
619 })
620 })
621 .await;
622
623 let _duration = start.elapsed().as_secs_f64();
624
625 match result {
627 Ok(Ok(result)) => Ok(result),
628 Ok(Err(e)) => Err(e),
629 Err(_) => Err(ScrapeError::Timeout),
630 }
631 }
632}
633
/// Settings for constructing a `BrowserEngine`.
#[derive(Debug, Clone)]
pub struct BrowserEngineConfig {
    /// Whether the browser should run headless.
    pub headless: bool,
    /// Number of browsers that may be in use at the same time.
    pub pool_size: usize,
    /// Fixed User-Agent string, if any.
    pub user_agent: Option<String>,
}

/// Defaults come from the environment:
/// `BROWSER_HEADLESS` (default `true`), `BROWSER_POOL_SIZE` (default `5`) and
/// `BROWSER_USER_AGENT` (default unset). Unparsable values fall back to the
/// default.
impl Default for BrowserEngineConfig {
    fn default() -> Self {
        let headless = env::var("BROWSER_HEADLESS")
            .ok()
            .and_then(|raw| raw.parse::<bool>().ok())
            .unwrap_or(true);

        let pool_size = env::var("BROWSER_POOL_SIZE")
            .ok()
            .and_then(|raw| raw.parse::<usize>().ok())
            .unwrap_or(5);

        let user_agent = env::var("BROWSER_USER_AGENT").ok();

        Self {
            headless,
            pool_size,
            user_agent,
        }
    }
}

impl BrowserEngineConfig {
    /// Same as `BrowserEngineConfig::default()`.
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns the config with `headless` replaced.
    pub fn headless(self, headless: bool) -> Self {
        Self { headless, ..self }
    }

    /// Returns the config with `pool_size` replaced by `size`.
    pub fn pool_size(self, size: usize) -> Self {
        Self {
            pool_size: size,
            ..self
        }
    }

    /// Returns the config with `user_agent` set to `agent`.
    pub fn user_agent(self, agent: impl Into<String>) -> Self {
        Self {
            user_agent: Some(agent.into()),
            ..self
        }
    }
}
678
#[cfg(test)]
mod tests {
    use super::*;

    /// The default config is headless with a pool of five browsers.
    #[test]
    fn test_config_creation() {
        let defaults = BrowserEngineConfig::new();
        assert!(defaults.headless);
        assert_eq!(defaults.pool_size, 5);
    }

    /// Every builder method overrides its field.
    #[test]
    fn test_config_builder() {
        let custom = BrowserEngineConfig::new()
            .headless(false)
            .pool_size(10)
            .user_agent("Custom Agent");

        assert!(!custom.headless);
        assert_eq!(custom.pool_size, 10);
        assert_eq!(custom.user_agent.as_deref(), Some("Custom Agent"));
    }

    /// Detection either yields an existing path or a `BrowserNotFound` error.
    #[test]
    fn test_chrome_detection() {
        match BrowserPool::find_chrome_executable() {
            Ok(path) => assert!(path.exists(), "Chrome path should exist: {:?}", path),
            Err(ScrapeError::BrowserNotFound(msg)) => {
                assert!(msg.contains("Chrome/Chromium not found"));
            }
            Err(_) => panic!("Expected BrowserNotFound error"),
        }
    }

    /// Ignored by default.
    #[tokio::test]
    #[ignore]
    async fn test_browser_pool_creation() {
        let created = BrowserPool::new(2).await;
        assert!(created.is_ok(), "Browser pool creation failed: {:?}", created.err());
    }

    /// With both slots taken, a third request waits until a guard is dropped.
    #[tokio::test]
    #[ignore]
    async fn test_browser_pool_limit() {
        let pool = Arc::new(BrowserPool::new(2).await.unwrap());

        let first = pool.get_browser().await.unwrap();
        let second = pool.get_browser().await.unwrap();

        let started = std::time::Instant::now();

        let waiter_pool = pool.clone();

        // Free one slot after half a second.
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(500)).await;
            drop(first);
        });

        let _third = waiter_pool.get_browser().await.unwrap();
        let waited_ms = started.elapsed().as_millis();

        assert!(
            waited_ms >= 400,
            "Should have waited for browser release, got: {}ms",
            waited_ms
        );

        drop(second);
    }

    /// A returned browser is handed out again instead of launching a new one.
    #[tokio::test]
    #[ignore]
    async fn test_browser_reuse() {
        let pool = Arc::new(BrowserPool::new(1).await.unwrap());

        // Check a browser out and give it straight back.
        drop(pool.get_browser().await.unwrap());

        // Give the guard's return path time to run.
        tokio::time::sleep(Duration::from_millis(100)).await;

        let started = std::time::Instant::now();
        let _reused = pool.get_browser().await.unwrap();
        let took_ms = started.elapsed().as_millis();

        assert!(
            took_ms < 500,
            "Browser should be reused (fast), but took: {}ms",
            took_ms
        );
    }

    /// A freshly launched browser passes the health check.
    #[tokio::test]
    #[ignore]
    async fn test_browser_health_check() {
        let pool = Arc::new(BrowserPool::new(1).await.unwrap());
        let browser = pool.get_browser().await.unwrap();

        let healthy = BrowserPool::is_browser_healthy(&browser).await;
        assert!(healthy, "Browser should be healthy after creation");
    }
}