Skip to main content

spider/features/
chrome.rs

1use crate::features::chrome_args::CHROME_ARGS;
2use crate::utils::{detect_chrome::get_detect_chrome_executable, log};
3use crate::{configuration::Configuration, tokio_stream::StreamExt};
4use chromiumoxide::cdp::browser_protocol::browser::{
5    SetDownloadBehaviorBehavior, SetDownloadBehaviorParamsBuilder,
6};
7use chromiumoxide::cdp::browser_protocol::{
8    browser::BrowserContextId, emulation::SetGeolocationOverrideParams, network::CookieParam,
9    target::CreateTargetParams,
10};
11use chromiumoxide::error::CdpError;
12use chromiumoxide::handler::REQUEST_TIMEOUT;
13use chromiumoxide::serde_json;
14use chromiumoxide::Page;
15use chromiumoxide::{handler::HandlerConfig, Browser, BrowserConfig};
16use lazy_static::lazy_static;
17#[cfg(feature = "cookies")]
18use std::sync::Arc;
19use std::time::Duration;
20use tokio::task::JoinHandle;
21use url::Url;
22
23lazy_static! {
24    /// Enable loopback for proxy.
25    static ref LOOP_BACK_PROXY: bool = std::env::var("LOOP_BACK_PROXY").unwrap_or_default() == "true";
26}
27
/// Convert the cookies stored in `jar` for `url` into CDP [`CookieParam`] values.
///
/// Every cookie is bound to `url`, to the URL's domain (when it has one) and to the
/// URL's path (`/` when the path is empty). `cookie_str` is only inspected for the
/// `Secure` / `HttpOnly` markers.
///
/// # Errors
///
/// Returns an error when the jar holds no cookies for `url` (`"No cookies found"`),
/// when the cookie header is not representable as a string, when a non-empty segment
/// has no `=` separator, or when the CDP builder rejects a cookie.
#[cfg(feature = "cookies")]
pub fn parse_cookies_with_jar(
    jar: &Arc<crate::client::cookie::Jar>,
    cookie_str: &str,
    url: &Url,
) -> Result<Vec<CookieParam>, String> {
    use crate::client::cookie::CookieStore;

    // Retrieve cookies stored in the jar.
    let Some(header_value) = jar.cookies(url) else {
        return Err("No cookies found".to_string());
    };
    let cookie_header_str = header_value.to_str().map_err(|e| e.to_string())?;

    // None of these depend on the individual cookie, so compute them once.
    // NOTE(review): the flags are a substring match on the whole configured cookie
    // string and therefore apply to every cookie — confirm this is intended.
    let secure = cookie_str.contains("Secure");
    let http_only = cookie_str.contains("HttpOnly");
    let domain = url.domain();
    let path = match url.path() {
        "" => "/",
        other => other,
    };

    let mut cookies = Vec::new();

    for pair in cookie_header_str.split(';') {
        let pair = pair.trim();
        // Skip empty segments (e.g. a trailing `;`) like the sibling parsers do,
        // instead of failing the whole conversion.
        if pair.is_empty() {
            continue;
        }

        let (name, value) = pair
            .split_once('=')
            .ok_or_else(|| format!("Invalid cookie pair: {}", pair))?;

        let mut builder = CookieParam::builder()
            .name(name.trim())
            .value(value.trim())
            .url(url.as_str());

        if let Some(domain) = domain {
            builder = builder.domain(domain.to_string());
        }

        builder = builder.path(path);

        if secure {
            builder = builder.secure(true);
        }

        if http_only {
            builder = builder.http_only(true);
        }

        cookies.push(builder.build()?);
    }

    Ok(cookies)
}
84
85/// Parse a cookie into a jar. This does nothing without the 'cookies' flag.
86#[cfg(not(feature = "cookies"))]
87pub fn parse_cookies_with_jar(cookie_str: &str, url: &Url) -> Result<Vec<CookieParam>, String> {
88    Ok(Default::default())
89}
90
/// Populate `jar` with every `name=value` pair found in a `Cookie`-style header.
///
/// Segments are separated by `;`; empty segments are ignored. Each pair is stored
/// for `url` with `Path=/`.
///
/// # Errors
///
/// Returns an error for the first non-empty segment that has no `=`. Pairs seen
/// before that segment have already been added to the jar.
#[cfg(feature = "cookies")]
pub fn seed_jar_from_cookie_header(
    jar: &std::sync::Arc<crate::client::cookie::Jar>,
    cookie_header: &str,
    url: &url::Url,
) -> Result<(), String> {
    let pairs = cookie_header
        .split(';')
        .map(str::trim)
        .filter(|pair| !pair.is_empty());

    for pair in pairs {
        let Some((name, value)) = pair.split_once('=') else {
            return Err(format!("Invalid cookie pair: {pair}"));
        };

        jar.add_cookie_str(
            &format!("{}={}; Path=/", name.trim(), value.trim()),
            url,
        );
    }

    Ok(())
}
113
/// Install `cookies` on `page` through the CDP `SetCookies` command.
///
/// An empty list is a no-op. A failing command is reported as its string form.
#[cfg(all(feature = "cookies", feature = "chrome"))]
pub async fn set_page_cookies(
    page: &chromiumoxide::Page,
    cookies: Vec<chromiumoxide::cdp::browser_protocol::network::CookieParam>,
) -> Result<(), String> {
    use chromiumoxide::cdp::browser_protocol::network::SetCookiesParams;

    if !cookies.is_empty() {
        if let Err(err) = page.execute(SetCookiesParams::new(cookies)).await {
            return Err(err.to_string());
        }
    }

    Ok(())
}
132
/// Build CDP cookie parameters from the cookies `jar` holds for `url`.
///
/// Returns an empty list when the jar has nothing for `url`. Each cookie carries
/// only its name, value and the URL it belongs to.
///
/// # Errors
///
/// Fails when the cookie header cannot be read as a string, when a non-empty
/// segment has no `=`, or when the CDP builder rejects a cookie.
#[cfg(feature = "cookies")]
pub fn cookie_params_from_jar(
    jar: &std::sync::Arc<crate::client::cookie::Jar>,
    url: &url::Url,
) -> Result<Vec<chromiumoxide::cdp::browser_protocol::network::CookieParam>, String> {
    use crate::client::cookie::CookieStore;
    use chromiumoxide::cdp::browser_protocol::network::CookieParam;

    let header_value = match jar.cookies(url) {
        Some(value) => value,
        None => return Ok(Vec::new()),
    };

    let header = header_value.to_str().map_err(|e| e.to_string())?;

    // Collecting into `Result` stops at the first failing segment.
    header
        .split(';')
        .map(str::trim)
        .filter(|pair| !pair.is_empty())
        .map(|pair| -> Result<CookieParam, String> {
            let (name, value) = pair
                .split_once('=')
                .ok_or_else(|| format!("Invalid cookie pair: {pair}"))?;

            CookieParam::builder()
                .name(name.trim())
                .value(value.trim())
                .url(url.as_str())
                .build()
                .map_err(|e| e.to_string())
        })
        .collect()
}
171
/// Apply the configured cookie string to the browser.
///
/// Does nothing when `config.cookie_str` is empty or no URL is available.
/// Otherwise the jar is seeded from the cookie string, the jar's cookies for the
/// URL are converted to CDP parameters, and a non-empty result is installed on
/// `browser`. Failures at any step are ignored (best effort).
#[cfg(feature = "cookies")]
pub async fn set_cookies(
    jar: &Arc<crate::client::cookie::Jar>,
    config: &Configuration,
    url_parsed: &Option<Box<Url>>,
    browser: &Browser,
) {
    let target = match url_parsed.as_deref() {
        Some(target) if !config.cookie_str.is_empty() => target,
        _ => return,
    };

    let _ = seed_jar_from_cookie_header(jar, &config.cookie_str, target);

    if let Ok(cookies) = parse_cookies_with_jar(jar, &config.cookie_str, target) {
        if !cookies.is_empty() {
            let _ = browser.set_cookies(cookies).await;
        }
    }
}
197
/// Rewrite Chrome launch arguments so the built-in AI (LanguageModel / Gemini Nano)
/// can be used.
///
/// * Every `--disable-features=` argument loses its `OptimizationHints` entry.
/// * Every `--enable-features=` argument gets the AI feature flags appended.
/// * When no `--enable-features=` argument exists, one is added at the end.
fn patch_chrome_ai_args(args: &mut Vec<String>) {
    const AI_FEATURES: &str = "OptimizationGuideOnDeviceModel:BypassPerfRequirement/true,PromptAPIForGeminiNano,PromptAPIForGeminiNanoMultimodalInput";

    let mut saw_enable_features = false;

    for arg in args.iter_mut() {
        if let Some(disabled) = arg.strip_prefix("--disable-features=") {
            // Keep every disabled feature except the one that blocks the on-device model.
            let kept = disabled
                .split(',')
                .filter(|feature| *feature != "OptimizationHints")
                .collect::<Vec<_>>()
                .join(",");
            *arg = format!("--disable-features={}", kept);
        } else if arg.starts_with("--enable-features=") {
            saw_enable_features = true;
            arg.push(',');
            arg.push_str(AI_FEATURES);
        }
    }

    if !saw_enable_features {
        args.push(format!("--enable-features={AI_FEATURES}"));
    }
}
222
223/// get chrome configuration
224#[cfg(not(feature = "chrome_headed"))]
225pub fn get_browser_config(
226    proxies: &Option<Vec<crate::configuration::RequestProxy>>,
227    intercept: bool,
228    cache_enabled: bool,
229    viewport: impl Into<Option<chromiumoxide::handler::viewport::Viewport>>,
230    request_timeout: &Option<core::time::Duration>,
231    use_chrome_ai: bool,
232) -> Option<BrowserConfig> {
233    let builder = BrowserConfig::builder()
234        .disable_default_args()
235        .no_sandbox()
236        .request_timeout(match request_timeout.as_ref() {
237            Some(timeout) => *timeout,
238            _ => Duration::from_millis(REQUEST_TIMEOUT),
239        });
240
241    let builder = if cache_enabled {
242        builder.enable_cache()
243    } else {
244        builder.disable_cache()
245    };
246
247    // request interception is required for all browser.new_page() creations. We also have to use "about:blank" as the base page to setup the listeners and navigate afterwards or the request will hang.
248    let builder = if intercept {
249        builder.enable_request_intercept()
250    } else {
251        builder
252    };
253
254    let builder = match proxies {
255        Some(proxies) => {
256            let mut chrome_args = Vec::from(CHROME_ARGS.map(|e| e.replace("://", "=").to_string()));
257            if use_chrome_ai {
258                patch_chrome_ai_args(&mut chrome_args);
259            }
260            let base_proxies = proxies
261                .iter()
262                .filter_map(|p| {
263                    if p.ignore == crate::configuration::ProxyIgnore::Chrome {
264                        None
265                    } else {
266                        Some(p.addr.to_owned())
267                    }
268                })
269                .collect::<Vec<String>>();
270
271            if !base_proxies.is_empty() {
272                chrome_args.push(string_concat!(r#"--proxy-server="#, base_proxies.join(";")));
273            }
274
275            builder.args(chrome_args)
276        }
277        _ => {
278            if use_chrome_ai {
279                let mut chrome_args: Vec<String> =
280                    CHROME_ARGS.iter().map(|e| e.to_string()).collect();
281                patch_chrome_ai_args(&mut chrome_args);
282                builder.args(chrome_args)
283            } else {
284                builder.args(CHROME_ARGS)
285            }
286        }
287    };
288    let builder = match get_detect_chrome_executable() {
289        Some(v) => builder.chrome_executable(v),
290        _ => builder,
291    };
292
293    match builder.viewport(viewport).build() {
294        Ok(b) => Some(b),
295        Err(error) => {
296            log("", error);
297            None
298        }
299    }
300}
301
/// Build the headful (visible window) Chrome launch configuration.
///
/// * `proxies` - optional proxy list; entries marked to be ignored by Chrome are
///   left out, the rest are joined into a single `--proxy-server=` argument. No
///   proxy argument is added when nothing is left after filtering.
/// * `intercept` - enable request interception.
/// * `cache_enabled` - enable or disable the browser cache.
/// * `viewport` - optional viewport override.
/// * `request_timeout` - request timeout; falls back to `REQUEST_TIMEOUT` milliseconds.
/// * `use_chrome_ai` - patch the arguments for the built-in Chrome AI.
///
/// Returns `None` (after logging the error) when the configuration cannot be built.
#[cfg(feature = "chrome_headed")]
pub fn get_browser_config(
    proxies: &Option<Vec<crate::configuration::RequestProxy>>,
    intercept: bool,
    cache_enabled: bool,
    viewport: impl Into<Option<chromiumoxide::handler::viewport::Viewport>>,
    request_timeout: &Option<core::time::Duration>,
    use_chrome_ai: bool,
) -> Option<BrowserConfig> {
    let builder = BrowserConfig::builder()
        .disable_default_args()
        .no_sandbox()
        .request_timeout(match request_timeout.as_ref() {
            Some(timeout) => *timeout,
            _ => Duration::from_millis(REQUEST_TIMEOUT),
        })
        .with_head();

    let builder = if cache_enabled {
        builder.enable_cache()
    } else {
        builder.disable_cache()
    };

    let builder = if intercept {
        builder.enable_request_intercept()
    } else {
        builder
    };

    // Drop `--headless` altogether rather than handing Chrome an empty
    // positional argument in its place.
    let mut chrome_args: Vec<String> = CHROME_ARGS
        .iter()
        .filter(|arg| **arg != "--headless")
        .map(|arg| arg.replace("://", "="))
        .collect();

    if use_chrome_ai {
        patch_chrome_ai_args(&mut chrome_args);
    }

    if let Some(proxies) = proxies {
        let base_proxies = proxies
            .iter()
            .filter_map(|p| {
                if p.ignore == crate::configuration::ProxyIgnore::Chrome {
                    None
                } else {
                    Some(p.addr.to_owned())
                }
            })
            .collect::<Vec<String>>();

        // Same guard as the headless variant: never emit `--proxy-server=` with
        // an empty value when every proxy was filtered out.
        if !base_proxies.is_empty() {
            chrome_args.push(string_concat!(r#"--proxy-server="#, base_proxies.join(";")));
        }
    }

    let builder = builder.args(chrome_args);

    let builder = match get_detect_chrome_executable() {
        Some(v) => builder.chrome_executable(v),
        _ => builder,
    };

    match builder.viewport(viewport).build() {
        Ok(b) => Some(b),
        Err(error) => {
            log("", error);
            None
        }
    }
}
376
377/// create the browser handler configuration
378pub fn create_handler_config(config: &Configuration) -> HandlerConfig {
379    HandlerConfig {
380        request_timeout: match config.request_timeout.as_ref() {
381            Some(timeout) => *timeout,
382            _ => Duration::from_millis(REQUEST_TIMEOUT),
383        },
384        request_intercept: config.chrome_intercept.enabled,
385        cache_enabled: config.cache,
386        service_worker_enabled: config.service_worker_enabled,
387        viewport: match config.viewport {
388            Some(ref v) => Some(chromiumoxide::handler::viewport::Viewport::from(
389                v.to_owned(),
390            )),
391            _ => default_viewport(),
392        },
393        ignore_visuals: config.chrome_intercept.block_visuals,
394        whitelist_patterns: config.chrome_intercept.whitelist_patterns.clone(),
395        blacklist_patterns: config.chrome_intercept.blacklist_patterns.clone(),
396        ignore_ads: config.chrome_intercept.block_ads,
397        ignore_javascript: config.chrome_intercept.block_javascript,
398        ignore_analytics: config.chrome_intercept.block_analytics,
399        ignore_stylesheets: config.chrome_intercept.block_stylesheets,
400        extra_headers: match &config.headers {
401            Some(headers) => {
402                let mut hm = crate::utils::header_utils::header_map_to_hash_map(headers.inner());
403
404                cleanup_invalid_headers(&mut hm);
405
406                if hm.is_empty() {
407                    None
408                } else {
409                    if cfg!(feature = "real_browser") {
410                        crate::utils::header_utils::rewrite_headers_to_title_case(&mut hm);
411                    }
412                    Some(hm)
413                }
414            }
415            _ => None,
416        },
417        intercept_manager: config.chrome_intercept.intercept_manager,
418        only_html: config.only_html && !config.full_resources,
419        max_bytes_allowed: config.max_bytes_allowed,
420        ..HandlerConfig::default()
421    }
422}
423
424lazy_static! {
425    static ref CHROM_BASE: Option<String> = std::env::var("CHROME_URL").ok();
426}
427
/// Sequential failover across multiple remote Chrome endpoints.
///
/// Endpoints are tried in list order. Each endpoint gets `max_retries + 1`
/// connection attempts before the next endpoint is tried; once every endpoint has
/// failed, connecting yields `None`.
///
/// A consecutive-error counter is kept per endpoint in an atomic, so no lock is
/// needed to update it. The counter is incremented on every failed attempt, reset
/// to zero on a successful connection, and currently only feeds the log output —
/// it does not cause an endpoint to be skipped.
#[derive(Debug)]
pub struct ChromeConnectionFailover {
    /// Endpoint URLs, in the order they are tried.
    urls: Vec<String>,
    /// Per-endpoint consecutive error count (same order and length as `urls`).
    errors: Vec<std::sync::atomic::AtomicU32>,
    /// Max retries per endpoint before moving to the next.
    max_retries: u32,
}
442
443impl ChromeConnectionFailover {
444    /// Create a failover from a list of URLs.
445    pub fn new(urls: Vec<String>, max_retries: u32) -> Self {
446        let errors = urls
447            .iter()
448            .map(|_| std::sync::atomic::AtomicU32::new(0))
449            .collect();
450        Self {
451            urls,
452            errors,
453            max_retries,
454        }
455    }
456
457    /// Try to establish a browser connection, failing over across endpoints.
458    ///
459    /// For each endpoint: retry up to `max_retries` times with backoff.
460    /// If all retries fail, move to the next endpoint. Returns the first
461    /// successful connection or `None` if all endpoints are exhausted.
462    pub async fn connect(
463        &self,
464        config: &Configuration,
465    ) -> Option<(Browser, chromiumoxide::Handler)> {
466        let handler_config_base = create_handler_config(config);
467
468        for (idx, url) in self.urls.iter().enumerate() {
469            let err_count = &self.errors[idx];
470
471            for attempt in 0..=self.max_retries {
472                match Browser::connect_with_config(url.as_str(), handler_config_base.clone()).await
473                {
474                    Ok(pair) => {
475                        // Reset error count on success.
476                        err_count.store(0, std::sync::atomic::Ordering::Relaxed);
477                        if idx > 0 {
478                            log::info!(
479                                "[chrome-failover] connected to endpoint {} ({}) after skipping {}",
480                                idx,
481                                url,
482                                idx
483                            );
484                        }
485                        return Some(pair);
486                    }
487                    Err(e) => {
488                        let n = err_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + 1;
489                        log::warn!(
490                            "[chrome-failover] endpoint {} ({}) attempt {}/{} failed: {:?}",
491                            idx,
492                            url,
493                            attempt + 1,
494                            self.max_retries + 1,
495                            e
496                        );
497                        if attempt < self.max_retries {
498                            let backoff = crate::utils::backoff::backoff_delay(attempt, 100, 5_000);
499                            tokio::time::sleep(backoff).await;
500                        } else {
501                            log::warn!(
502                                "[chrome-failover] endpoint {} exhausted ({} errors), trying next",
503                                idx,
504                                n
505                            );
506                        }
507                    }
508                }
509            }
510        }
511
512        log::error!(
513            "[chrome-failover] all {} endpoints exhausted",
514            self.urls.len()
515        );
516        None
517    }
518
519    /// Number of endpoints.
520    #[inline]
521    pub fn len(&self) -> usize {
522        self.urls.len()
523    }
524
525    /// Whether the failover list is empty.
526    #[inline]
527    pub fn is_empty(&self) -> bool {
528        self.urls.is_empty()
529    }
530}
531
532/// Get the default viewport
533#[cfg(not(feature = "real_browser"))]
534pub fn default_viewport() -> Option<chromiumoxide::handler::viewport::Viewport> {
535    None
536}
537
/// Default viewport when the `real_browser` feature is enabled: a viewport obtained
/// from `get_random_viewport`, converted to the chromiumoxide type.
#[cfg(feature = "real_browser")]
pub fn default_viewport() -> Option<chromiumoxide::handler::viewport::Viewport> {
    let randomized = super::chrome_viewport::get_random_viewport();
    Some(randomized.into())
}
546
/// Remove headers that must not be passed on as extra browser headers.
///
/// Drops `User-Agent`, `Host`, `Connection` and `Content-Length` from `hm`.
/// Header names are matched ASCII case-insensitively, so any spelling
/// (`user-agent`, `User-Agent`, `USER-AGENT`, ...) is removed. All other entries
/// are left untouched.
pub fn cleanup_invalid_headers(hm: &mut std::collections::HashMap<String, String>) {
    // Presumably these are managed by the browser itself — confirm with the callers.
    const REMOVED_HEADERS: [&str; 4] = ["user-agent", "host", "connection", "content-length"];

    hm.retain(|name, _| {
        !REMOVED_HEADERS
            .iter()
            .any(|removed| name.eq_ignore_ascii_case(removed))
    });
}
558
559/// Setup the browser configuration.
560pub async fn setup_browser_configuration(
561    config: &Configuration,
562) -> Option<(Browser, chromiumoxide::Handler)> {
563    let proxies = &config.proxies;
564
565    // ── Multi-endpoint failover path (priority) ──
566    if let Some(ref urls) = config.chrome_connection_urls {
567        if !urls.is_empty() {
568            let failover = ChromeConnectionFailover::new(urls.clone(), 3);
569            return failover.connect(config).await;
570        }
571    }
572
573    // ── Single-endpoint path (unchanged behavior) ──
574    let chrome_connection = if config.chrome_connection_url.is_some() {
575        config.chrome_connection_url.as_ref()
576    } else {
577        CHROM_BASE.as_ref()
578    };
579
580    match chrome_connection {
581        Some(v) => {
582            let mut attempts = 0;
583            let max_retries = 10;
584            let mut browser = None;
585
586            // Attempt reconnections for instances that may be on load balancers (LBs)
587            // experiencing shutdowns or degradation. This logic implements a retry
588            // mechanism to improve robustness by allowing multiple attempts to establish.
589            while attempts <= max_retries {
590                match Browser::connect_with_config(v, create_handler_config(config)).await {
591                    Ok(b) => {
592                        browser = Some(b);
593                        break;
594                    }
595                    Err(err) => {
596                        log::error!("{:?}", err);
597                        attempts += 1;
598                        if attempts > max_retries {
599                            log::error!("Exceeded maximum retry attempts");
600                            break;
601                        }
602                        let backoff = crate::utils::backoff::backoff_delay(attempts, 100, 5_000);
603                        tokio::time::sleep(backoff).await;
604                    }
605                }
606            }
607
608            browser
609        }
610        _ => match get_browser_config(
611            proxies,
612            config.chrome_intercept.enabled,
613            config.cache,
614            match config.viewport {
615                Some(ref v) => Some(chromiumoxide::handler::viewport::Viewport::from(
616                    v.to_owned(),
617                )),
618                _ => default_viewport(),
619            },
620            &config.request_timeout,
621            config
622                .remote_multimodal
623                .as_ref()
624                .map(|m| m.should_use_chrome_ai())
625                .unwrap_or(false),
626        ) {
627            Some(mut browser_config) => {
628                browser_config.ignore_visuals = config.chrome_intercept.block_visuals;
629                browser_config.ignore_javascript = config.chrome_intercept.block_javascript;
630                browser_config.ignore_ads = config.chrome_intercept.block_ads;
631                browser_config.whitelist_patterns =
632                    config.chrome_intercept.whitelist_patterns.clone();
633                browser_config.blacklist_patterns =
634                    config.chrome_intercept.blacklist_patterns.clone();
635                browser_config.ignore_stylesheets = config.chrome_intercept.block_stylesheets;
636                browser_config.ignore_analytics = config.chrome_intercept.block_analytics;
637                browser_config.extra_headers = match &config.headers {
638                    Some(headers) => {
639                        let mut hm =
640                            crate::utils::header_utils::header_map_to_hash_map(headers.inner());
641
642                        cleanup_invalid_headers(&mut hm);
643
644                        if hm.is_empty() {
645                            None
646                        } else {
647                            if cfg!(feature = "real_browser") {
648                                crate::utils::header_utils::rewrite_headers_to_title_case(&mut hm);
649                            }
650                            Some(hm)
651                        }
652                    }
653                    _ => None,
654                };
655                browser_config.intercept_manager = config.chrome_intercept.intercept_manager;
656                browser_config.only_html = config.only_html && !config.full_resources;
657
658                match Browser::launch(browser_config).await {
659                    Ok(browser) => Some(browser),
660                    Err(e) => {
661                        log::error!("Browser::launch() failed: {:?}", e);
662                        None
663                    }
664                }
665            }
666            _ => None,
667        },
668    }
669}
670
671/// Launch a chromium browser with configurations and wait until the instance is up.
672pub async fn launch_browser_base(
673    config: &Configuration,
674    url_parsed: &Option<Box<Url>>,
675    jar: Option<&std::sync::Arc<crate::client::cookie::Jar>>,
676) -> Option<(
677    Browser,
678    tokio::task::JoinHandle<()>,
679    Option<BrowserContextId>,
680    std::sync::Arc<std::sync::atomic::AtomicBool>,
681)> {
682    use chromiumoxide::{
683        cdp::browser_protocol::target::CreateBrowserContextParams, error::CdpError,
684    };
685
686    let browser_configuration = setup_browser_configuration(config).await;
687
688    match browser_configuration {
689        Some(c) => {
690            let (mut browser, mut handler) = c;
691            let mut context_id = None;
692
693            let browser_dead = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
694            let browser_dead_signal = browser_dead.clone();
695
696            // Spawn a new task that continuously polls the handler
697            // we might need a select with closing in case handler stalls.
698            let handle = tokio::task::spawn(async move {
699                while let Some(k) = handler.next().await {
700                    if let Err(e) = k {
701                        match e {
702                            CdpError::Ws(_)
703                            | CdpError::LaunchExit(_, _)
704                            | CdpError::LaunchTimeout(_)
705                            | CdpError::LaunchIo(_, _) => {
706                                browser_dead_signal
707                                    .store(true, std::sync::atomic::Ordering::Release);
708                                log::error!("Browser handler fatal error: {:?}", e);
709                                break;
710                            }
711                            _ => {
712                                continue;
713                            }
714                        }
715                    }
716                }
717                // Handler stream ended — browser is gone.
718                browser_dead_signal.store(true, std::sync::atomic::Ordering::Release);
719            });
720
721            let mut create_content = CreateBrowserContextParams::default();
722            create_content.dispose_on_detach = Some(true);
723
724            if let Some(ref proxies) = config.proxies {
725                let use_plain_http = proxies.len() >= 2;
726
727                for proxie in proxies.iter() {
728                    if proxie.ignore == crate::configuration::ProxyIgnore::Chrome {
729                        continue;
730                    }
731
732                    let proxie = &proxie.addr;
733
734                    if !proxie.is_empty() {
735                        // pick the socks:// proxy over http if found.
736                        if proxie.starts_with("socks://") {
737                            create_content.proxy_server =
738                                Some(proxie.replacen("socks://", "http://", 1));
739                            // pref this connection
740                            if use_plain_http {
741                                break;
742                            }
743                        }
744
745                        if *LOOP_BACK_PROXY && proxie.starts_with("http://localhost") {
746                            create_content.proxy_bypass_list =
747                                    // https://source.chromium.org/chromium/chromium/src/+/main:net/proxy_resolution/proxy_bypass_rules.cc
748                                    Some("<-loopback>;localhost;[::1]".into());
749                        }
750
751                        create_content.proxy_server = Some(proxie.into());
752                    }
753                }
754            }
755
756            if let Ok(c) = browser.create_browser_context(create_content).await {
757                let _ = browser.send_new_context(c.clone()).await;
758                let _ = context_id.insert(c);
759                if let Some(jar) = jar {
760                    set_cookies(jar, config, url_parsed, &browser).await;
761                }
762                if let Some(id) = &browser.browser_context.id {
763                    let cmd = SetDownloadBehaviorParamsBuilder::default();
764
765                    if let Ok(cmd) = cmd
766                        .behavior(SetDownloadBehaviorBehavior::Deny)
767                        .events_enabled(false)
768                        .browser_context_id(id.clone())
769                        .build()
770                    {
771                        let _ = browser.execute(cmd).await;
772                    }
773                }
774            } else {
775                handle.abort();
776            }
777
778            Some((browser, handle, context_id, browser_dead))
779        }
780        _ => None,
781    }
782}
783
784/// Launch a chromium browser with configurations and wait until the instance is up.
785pub async fn launch_browser(
786    config: &Configuration,
787    url_parsed: &Option<Box<Url>>,
788) -> Option<(
789    Browser,
790    tokio::task::JoinHandle<()>,
791    Option<BrowserContextId>,
792    std::sync::Arc<std::sync::atomic::AtomicBool>,
793)> {
794    launch_browser_base(config, url_parsed, None).await
795}
796
797/// Launch a chromium browser with configurations and wait until the instance is up.
798pub async fn launch_browser_cookies(
799    config: &Configuration,
800    url_parsed: &Option<Box<Url>>,
801    jar: Option<&Arc<crate::client::cookie::Jar>>,
802) -> Option<(
803    Browser,
804    tokio::task::JoinHandle<()>,
805    Option<BrowserContextId>,
806    std::sync::Arc<std::sync::atomic::AtomicBool>,
807)> {
808    launch_browser_base(config, url_parsed, jar).await
809}
810
/// Represents IP-based geolocation and network metadata.
///
/// Every field is optional so that partial responses can still be represented;
/// `GeoInfo::default()` yields a value with all fields unset.
#[derive(Debug, Clone, Default, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct GeoInfo {
    /// The public IP address detected.
    pub ip: Option<String>,
    /// The CIDR network range of the IP.
    pub network: Option<String>,
    /// IP version (e.g., "IPv4" or "IPv6").
    pub version: Option<String>,
    /// The city associated with the IP.
    pub city: Option<String>,
    /// The region (e.g., state or province).
    pub region: Option<String>,
    /// Short regional code (e.g., "CA").
    pub region_code: Option<String>,
    /// Two-letter country code (e.g., "US").
    pub country: Option<String>,
    /// Full country name.
    pub country_name: Option<String>,
    /// Same as `country`, often redundant.
    pub country_code: Option<String>,
    /// ISO 3166-1 alpha-3 country code (e.g., "USA").
    pub country_code_iso3: Option<String>,
    /// Capital of the country.
    pub country_capital: Option<String>,
    /// Top-level domain of the country (e.g., ".us").
    pub country_tld: Option<String>,
    /// Continent code (e.g., "NA").
    pub continent_code: Option<String>,
    /// Whether the country is in the European Union.
    pub in_eu: Option<bool>,
    /// Postal or ZIP code.
    pub postal: Option<String>,
    /// Approximate latitude of the IP location.
    pub latitude: Option<f64>,
    /// Approximate longitude of the IP location.
    pub longitude: Option<f64>,
    /// Timezone identifier (e.g., "America/New_York").
    pub timezone: Option<String>,
    /// UTC offset string (e.g., "-0400").
    pub utc_offset: Option<String>,
    /// Country calling code (e.g., "+1").
    pub country_calling_code: Option<String>,
    /// ISO 4217 currency code (e.g., "USD").
    pub currency: Option<String>,
    /// Currency name (e.g., "Dollar").
    pub currency_name: Option<String>,
    /// Comma-separated preferred language codes.
    pub languages: Option<String>,
    /// Country surface area in square kilometers.
    pub country_area: Option<f64>,
    /// Approximate country population.
    pub country_population: Option<u64>,
    /// ASN (Autonomous System Number) of the IP.
    pub asn: Option<String>,
    /// ISP or organization name.
    pub org: Option<String>,
}
870
/// Extract the trimmed text content of the first `<pre>` element in `html`.
///
/// The opening tag may carry attributes (for example `<pre style="...">`), and
/// the closing `</pre>` is searched for only after the opening tag. Returns
/// `None` when no complete `<pre>...</pre>` pair is found.
// Only `detect_geo_info` under the `serde` feature calls this helper, so it is
// legitimately unused when that feature is disabled.
#[cfg_attr(not(feature = "serde"), allow(dead_code))]
fn extract_pre_text(html: &str) -> Option<&str> {
    let mut rest = html;

    while let Some(pos) = rest.find("<pre") {
        let after_name = rest.get(pos + "<pre".len()..)?;

        // Require `>` or whitespace right after the tag name so that unrelated
        // tags sharing the prefix (e.g. `<preview>`) are skipped.
        if after_name.starts_with(|c: char| c == '>' || c.is_ascii_whitespace()) {
            let (_attributes, body) = after_name.split_once('>')?;
            let (text, _tail) = body.split_once("</pre>")?;
            return Some(text.trim());
        }

        rest = after_name;
    }

    None
}

/// Auto-detect the geo-location.
///
/// Picks one of the lookup endpoints at random, navigates `new_page` to it
/// (the page is left on that URL), reads the rendered document and
/// deserializes the JSON found inside its `<pre>` element into [`GeoInfo`].
///
/// Returns `None` if the navigation fails, no `<pre>` element is present, or
/// the payload does not deserialize.
///
/// NOTE(review): the endpoints presumably do not share one response schema, so
/// some of them may never deserialize into [`GeoInfo`] — confirm against each
/// service's documented response.
#[cfg(feature = "serde")]
pub async fn detect_geo_info(new_page: &Page) -> Option<GeoInfo> {
    use rand::prelude::IndexedRandom;
    let apis = [
        "https://ipapi.co/json",
        "https://ipinfo.io/json",
        "https://ipwho.is/",
    ];

    let url = apis.choose(&mut rand::rng())?;

    new_page.goto(*url).await.ok()?;
    new_page.wait_for_navigation().await.ok()?;

    let html = new_page.content().await.ok()?;

    serde_json::from_str(extract_pre_text(&html)?).ok()
}
894
895#[cfg(not(feature = "serde"))]
896/// Auto-detect the geo-location.
897pub async fn detect_geo_info(new_page: &Page) -> Option<GeoInfo> {
898    None
899}
900
901/// configure the browser.
902pub async fn configure_browser(new_page: &Page, configuration: &Configuration) {
903    let mut timezone = configuration.timezone_id.is_some();
904    let mut locale = configuration.locale.is_some();
905
906    let mut timezone_value = configuration.timezone_id.clone();
907    let mut locale_value = configuration.locale.clone();
908
909    let mut emulate_geolocation = None;
910
911    // get the locale of the proxy.
912    if configuration.auto_geolocation && configuration.proxies.is_some() && !timezone && !locale {
913        if let Some(geo) = detect_geo_info(new_page).await {
914            if let Some(languages) = geo.languages {
915                if let Some(locale_v) = languages.split(',').next() {
916                    if !locale_v.is_empty() {
917                        locale_value = Some(Box::new(locale_v.into()));
918                    }
919                }
920            }
921
922            if let Some(timezone_v) = geo.timezone {
923                if !timezone_v.is_empty() {
924                    timezone_value = Some(Box::new(timezone_v));
925                }
926            }
927
928            timezone = timezone_value.is_some();
929            locale = locale_value.is_some();
930
931            let mut geo_location_override = SetGeolocationOverrideParams::default();
932
933            geo_location_override.latitude = geo.latitude;
934            geo_location_override.longitude = geo.longitude;
935            geo_location_override.accuracy = Some(0.7);
936
937            emulate_geolocation = Some(geo_location_override);
938        }
939    }
940
941    if timezone && locale {
942        let geo = async {
943            if let Some(geolocation) = emulate_geolocation {
944                let _ = new_page.emulate_geolocation(geolocation).await;
945            }
946        };
947        let timezone_id = async {
948            if let Some(timezone_id) = timezone_value.as_deref() {
949                if !timezone_id.is_empty() {
950                    let _ = new_page
951                    .emulate_timezone(
952                        chromiumoxide::cdp::browser_protocol::emulation::SetTimezoneOverrideParams::new(
953                            timezone_id,
954                        ),
955                    )
956                    .await;
957                }
958            }
959        };
960
961        let locale = async {
962            if let Some(locale) = locale_value.as_deref() {
963                if !locale.is_empty() {
964                    let _ = new_page
965                        .emulate_locale(
966                            chromiumoxide::cdp::browser_protocol::emulation::SetLocaleOverrideParams {
967                                locale: Some(locale.into()),
968                            },
969                        )
970                        .await;
971                }
972            }
973        };
974
975        tokio::join!(timezone_id, locale, geo);
976    } else if timezone {
977        if let Some(timezone_id) = timezone_value.as_deref() {
978            if !timezone_id.is_empty() {
979                let _ = new_page
980                    .emulate_timezone(
981                        chromiumoxide::cdp::browser_protocol::emulation::SetTimezoneOverrideParams::new(
982                            timezone_id,
983                        ),
984                    )
985                    .await;
986            }
987        }
988    } else if locale {
989        if let Some(locale) = locale_value.as_deref() {
990            if !locale.is_empty() {
991                let _ = new_page
992                    .emulate_locale(
993                        chromiumoxide::cdp::browser_protocol::emulation::SetLocaleOverrideParams {
994                            locale: Some(locale.into()),
995                        },
996                    )
997                    .await;
998            }
999        }
1000    }
1001}
1002
1003/// attempt to navigate to a page respecting the request timeout. This will attempt to get a response for up to 60 seconds. There is a bug in the browser hanging if the CDP connection or handler errors. [https://github.com/mattsse/chromiumoxide/issues/64]
1004#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
1005pub(crate) async fn attempt_navigation(
1006    url: &str,
1007    browser: &Browser,
1008    request_timeout: &Option<core::time::Duration>,
1009    browser_context_id: &Option<BrowserContextId>,
1010    viewport: &Option<crate::features::chrome_common::Viewport>,
1011) -> Result<Page, CdpError> {
1012    let mut cdp_params = CreateTargetParams::new(url);
1013
1014    cdp_params.background = Some(browser_context_id.is_some()); // not supported headless-shell
1015    cdp_params.browser_context_id.clone_from(browser_context_id);
1016    cdp_params.for_tab = Some(false);
1017
1018    if viewport.is_some() {
1019        browser
1020            .config()
1021            .and_then(|c| c.viewport.as_ref())
1022            .and_then(|b_vp| {
1023                viewport.as_ref().map(|vp| {
1024                    let new_viewport = b_vp.width == vp.width && b_vp.height == vp.height;
1025
1026                    if !new_viewport {
1027                        if vp.width >= 25 {
1028                            cdp_params.width = Some(vp.width.into());
1029                        }
1030                        if vp.height >= 25 {
1031                            cdp_params.height = Some(vp.height.into());
1032                        }
1033                        cdp_params.new_window = Some(true);
1034                    }
1035                })
1036            });
1037    }
1038
1039    let page_result = tokio::time::timeout(
1040        match request_timeout {
1041            Some(timeout) => *timeout,
1042            _ => tokio::time::Duration::from_secs(60),
1043        },
1044        browser.new_page(cdp_params),
1045    )
1046    .await;
1047
1048    match page_result {
1049        Ok(page) => page,
1050        Err(_) => Err(CdpError::Timeout),
1051    }
1052}
1053
1054/// close the browser and open handles
1055pub async fn close_browser(
1056    browser_handle: JoinHandle<()>,
1057    _browser: &Browser,
1058    _context_id: &mut Option<BrowserContextId>,
1059) {
1060    if !browser_handle.is_finished() {
1061        browser_handle.abort();
1062    }
1063}
1064
/// Setup interception for auth challenges. This does nothing without the 'chrome_intercept' flag.
///
/// When `chrome_intercept` is `true` and credentials are supplied, subscribes
/// to the page's `Fetch.authRequired` events and spawns a background task that
/// answers every challenge with those credentials. Failures to build or send
/// the reply are logged and the task keeps listening.
#[cfg(feature = "chrome")]
pub async fn setup_auth_challenge_response(
    page: &chromiumoxide::Page,
    chrome_intercept: bool,
    auth_challenge_response: &Option<crate::configuration::AuthChallengeResponse>,
) {
    use chromiumoxide::cdp::browser_protocol::fetch;

    let credentials = match (chrome_intercept, auth_challenge_response) {
        (true, Some(credentials)) => credentials,
        _ => return,
    };

    let mut auth_events = match page.event_listener::<fetch::EventAuthRequired>().await {
        Ok(stream) => stream,
        Err(_) => return,
    };

    let intercept_page = page.clone();
    let credentials = credentials.clone();

    // we may need return for polling
    crate::utils::spawn_task("auth_interception", async move {
        while let Some(event) = auth_events.next().await {
            let answer = fetch::AuthChallengeResponse::from(credentials.clone());

            let command = fetch::ContinueWithAuthParams::builder()
                .request_id(event.request_id.clone())
                .auth_challenge_response(answer)
                .build();

            match command {
                Ok(c) => {
                    if let Err(e) = intercept_page.send_command(c).await {
                        log("Failed to fullfill auth challege request: ", e.to_string());
                    }
                }
                Err(_) => {
                    log(
                        "Failed to get auth challege request handle ",
                        &event.request.url,
                    );
                }
            }
        }
    });
}
1107
/// Setup interception for chrome request. This does nothing without the 'chrome_intercept' flag.
///
/// In this implementation the only work done is installing the auth-challenge
/// listener through [`setup_auth_challenge_response`] when `chrome_intercept`
/// is `true`. `_ignore_visuals` and `_host_name` are accepted but unused, and
/// the returned handle is always `None`.
#[cfg(feature = "chrome")]
pub async fn setup_chrome_interception_base(
    page: &chromiumoxide::Page,
    chrome_intercept: bool,
    auth_challenge_response: &Option<crate::configuration::AuthChallengeResponse>,
    _ignore_visuals: bool,
    _host_name: &str,
) -> Option<tokio::task::JoinHandle<()>> {
    if chrome_intercept {
        setup_auth_challenge_response(page, chrome_intercept, auth_challenge_response).await;
    }
    // No interception task is spawned here, so there is no handle to return.
    None
}
1122
/// Establish all the page events.
///
/// Builds an emulation configuration from `config` (user agent, stealth tier,
/// fingerprint and dialog handling), asks `spider_fingerprint` for a merged
/// script, and then applies the page setup concurrently: script injection, ad
/// blocking, user-agent override, hardware-concurrency emulation, log
/// disabling, CSP bypass and [`configure_browser`].
///
/// The whole setup is bounded to 15 seconds; on expiry an error is logged and
/// the function returns. Errors from individual commands are ignored.
pub async fn setup_chrome_events(chrome_page: &chromiumoxide::Page, config: &Configuration) {
    // Treat an empty configured user agent the same as no user agent.
    let ua_opt = config.user_agent.as_deref().filter(|ua| !ua.is_empty());

    // Empty string stands for "no user agent" in the calls below.
    let ua_for_profiles: &str = ua_opt.map_or("", |v| v);

    let mut emulation_config =
        spider_fingerprint::EmulationConfiguration::setup_defaults(ua_for_profiles);

    let stealth_mode = config.stealth_mode;
    let use_stealth = stealth_mode.stealth();
    let block_ads = config.chrome_intercept.block_ads;

    // Dialogs are dismissed unless the configuration explicitly says otherwise.
    emulation_config.dismiss_dialogs = config.dismiss_dialogs.unwrap_or(true);
    emulation_config.fingerprint = config.fingerprint;
    emulation_config.tier = stealth_mode;
    // Enabled only when a non-empty user agent is configured.
    emulation_config.user_agent_data = Some(!ua_for_profiles.is_empty());

    let viewport = config.viewport.as_ref().map(|vp| (*vp).into());

    // The GPU profile is chosen for the OS derived from the user agent.
    let gpu_profile = spider_fingerprint::profiles::gpu::select_random_gpu_profile(
        spider_fingerprint::get_agent_os(ua_for_profiles),
    );

    let merged_script = spider_fingerprint::emulate_with_profile(
        ua_for_profiles,
        &emulation_config,
        &viewport.as_ref(),
        &config.evaluate_on_new_document,
        gpu_profile,
    );

    // Inject only when stealth or a user script asks for it and a script was produced.
    let should_inject_script =
        (use_stealth || config.evaluate_on_new_document.is_some()) && merged_script.is_some();

    // Fall back to 8 when the profile's value does not fit in a `u32`.
    let hc: u32 = gpu_profile.hardware_concurrency.try_into().unwrap_or(8);

    // Page-level emulation steps; each one is skipped when its condition is false.
    let apply_page_setup = {
        async move {
            let f_script = async {
                if should_inject_script {
                    let _ = chrome_page
                        .add_script_to_evaluate_on_new_document(merged_script)
                        .await;
                }
            };

            let f_adblock = async {
                if block_ads {
                    let _ = chrome_page.set_ad_blocking_enabled(true).await;
                }
            };

            let f_ua = async {
                if !ua_for_profiles.is_empty() {
                    let _ = chrome_page.set_user_agent(ua_for_profiles).await;
                }
            };

            let f_hc = async {
                if use_stealth {
                    let _ = chrome_page.emulate_hardware_concurrency(hc.into()).await;
                }
            };

            tokio::join!(f_script, f_adblock, f_ua, f_hc);
        }
    };

    let disable_log = async {
        if config.disable_log {
            let _ = chrome_page.disable_log().await;
        }
    };

    let bypass_csp = async {
        if config.bypass_csp {
            let _ = chrome_page.set_bypass_csp(true).await;
        }
    };

    // Run every setup step concurrently under a single 15 second budget.
    if tokio::time::timeout(tokio::time::Duration::from_secs(15), async {
        tokio::join!(
            apply_page_setup,
            disable_log,
            bypass_csp,
            configure_browser(chrome_page, config),
        )
    })
    .await
    .is_err()
    {
        log::error!("failed to setup event handlers within 15 seconds.");
    }
}
1218
/// Handles that make up one browser connection.
///
/// Tuple fields, in order: the shared [`chromiumoxide::Browser`], an optional
/// task handle (aborted by `BrowserController::dispose`), and the optional
/// browser context id.
pub(crate) type BrowserControl = (
    std::sync::Arc<chromiumoxide::Browser>,
    Option<tokio::task::JoinHandle<()>>,
    Option<chromiumoxide::cdp::browser_protocol::browser::BrowserContextId>,
);
1224
/// A `tokio::sync::OnceCell` holding an optional [`BrowserController`].
///
/// Only compiled with the `smart` feature and without `decentralized`.
#[cfg(all(feature = "smart", not(feature = "decentralized")))]
pub(crate) type OnceBrowser = tokio::sync::OnceCell<Option<BrowserController>>;
1228
/// Owns a browser connection and aborts its task handle when dropped.
///
/// Disposal happens at most once: `dispose` sets `closed` and later calls
/// (including the one made from `Drop`) are no-ops.
pub struct BrowserController {
    /// The browser handles: browser, optional task handle, optional context id.
    pub browser: BrowserControl,
    /// Whether `dispose` has already run.
    pub closed: bool,
    /// Signal set by the handler task when the browser process dies or the
    /// WebSocket disconnects. Spawned page-fetch tasks should check this
    /// before creating new tabs to avoid wasting work on a dead browser.
    pub browser_dead: std::sync::Arc<std::sync::atomic::AtomicBool>,
}
1240
1241impl BrowserController {
1242    /// A new browser controller.
1243    pub(crate) fn new(
1244        browser: BrowserControl,
1245        browser_dead: std::sync::Arc<std::sync::atomic::AtomicBool>,
1246    ) -> Self {
1247        BrowserController {
1248            browser,
1249            closed: false,
1250            browser_dead,
1251        }
1252    }
1253    /// Dispose the browser context and join handler.
1254    pub fn dispose(&mut self) {
1255        if !self.closed {
1256            self.closed = true;
1257            if let Some(handler) = self.browser.1.take() {
1258                handler.abort();
1259            }
1260        }
1261    }
1262}
1263
impl Drop for BrowserController {
    /// Dispose on drop so the task handle is aborted even when the owner never
    /// calls `dispose` explicitly; a prior explicit call makes this a no-op.
    fn drop(&mut self) {
        self.dispose();
    }
}
1269
/// Guard that closes a Chrome tab when dropped.
///
/// chromiumoxide's `Page` does **not** close the underlying Chrome tab on drop —
/// it only decrements an internal counter. When `tokio::select!` cancels the
/// losing future during a hedge race, the tab stays open and keeps consuming
/// browser resources. Over time the leaked tabs exhaust Chrome, causing
/// `browser.new_page()` to hang and deadlocking the crawl.
///
/// `TabCloseGuard` holds a clone of the `Page` handle. On drop it spawns a
/// fire-and-forget `page.close()` task so the tab is cleaned up even when the
/// owning future is cancelled. Call [`defuse`](Self::defuse) before an
/// explicit `.close().await` to avoid a double-close.
///
/// The inner `Option` is `None` once the guard has been defused or dropped.
#[cfg(all(feature = "chrome", not(feature = "decentralized")))]
pub(crate) struct TabCloseGuard(Option<chromiumoxide::Page>);
1284
#[cfg(all(feature = "chrome", not(feature = "decentralized")))]
impl TabCloseGuard {
    /// Arm a guard for `page`; the tab is closed when the guard is dropped.
    #[inline]
    pub fn new(page: chromiumoxide::Page) -> Self {
        TabCloseGuard(Some(page))
    }

    /// Disarm the guard because the caller closes the tab explicitly.
    #[inline]
    pub fn defuse(mut self) {
        // Release the held page now; when `self` is dropped at the end of this
        // call, `Drop` finds `None` and does nothing.
        drop(self.0.take());
    }
}
1300
#[cfg(all(feature = "chrome", not(feature = "decentralized")))]
impl Drop for TabCloseGuard {
    /// Close the guarded tab in the background unless the guard was defused.
    ///
    /// The close runs as a detached task on the current Tokio runtime. If no
    /// runtime is active on this thread the page handle is simply released:
    /// spawning would panic, and a panic inside `drop` can abort the process.
    fn drop(&mut self) {
        if let Some(page) = self.0.take() {
            if let Ok(runtime) = tokio::runtime::Handle::try_current() {
                runtime.spawn(async move {
                    // Timeout prevents zombie tasks when Chrome is unresponsive.
                    // 5 seconds is generous — tab close is normally sub-100ms.
                    let _ =
                        tokio::time::timeout(tokio::time::Duration::from_secs(5), page.close())
                            .await;
                });
            }
        }
    }
}