1use crate::black_list::contains;
2use crate::client::redirect::Policy;
3use crate::compact_str::CompactString;
4use crate::configuration::{
5 self, get_ua, AutomationScriptsMap, Configuration, ExecutionScriptsMap, RedirectPolicy,
6 SerializableHeaderMap,
7};
8use crate::{page::build, utils::PageResponse};
9
10use crate::features::chrome_common::RequestInterceptConfiguration;
11#[cfg(feature = "disk")]
12use crate::features::disk::DatabaseHandler;
13use crate::packages::robotparser::parser::RobotFileParser;
14use crate::page::{
15 AntiBotTech, Page, PageLinkBuildSettings, CHROME_UNKNOWN_STATUS_ERROR, UNKNOWN_STATUS_ERROR,
16};
17use crate::utils::abs::{convert_abs_url, parse_absolute_url};
18use crate::utils::interner::ListBucket;
19use crate::utils::{
20 crawl_duration_expired, emit_log, emit_log_shutdown, get_path_from_url, get_semaphore,
21 networking_capable, prepare_url, setup_website_selectors, spawn_set, AllowedDomainTypes,
22};
23use crate::{CaseInsensitiveString, Client, ClientBuilder, RelativeSelectors};
24#[cfg(feature = "cron")]
25use async_job::{async_trait, Job, Runner};
26use hashbrown::{HashMap, HashSet};
27use reqwest::header::REFERER;
28use reqwest::StatusCode;
29use std::fmt;
30use std::net::IpAddr;
31use std::sync::atomic::{AtomicBool, AtomicI8, AtomicUsize, Ordering};
32use std::sync::Arc;
33use std::time::{Duration, Instant};
34use tokio::{
35 sync::{broadcast, Semaphore},
36 task::JoinSet,
37 time::Interval,
38};
39use tokio_stream::StreamExt;
40use url::Url;
41
42#[cfg(feature = "cache_request")]
43use http_cache_reqwest::{Cache, CacheMode, HttpCache, HttpCacheOptions};
44
45#[cfg(feature = "cache_request")]
46pub use http_global_cache::CACACHE_MANAGER;
47
/// Upper bound applied to retry/backoff sleeps between failed requests.
const BACKOFF_MAX_DURATION: tokio::time::Duration = tokio::time::Duration::from_secs(60);
50
51pub fn calc_limits(multiplier: usize) -> usize {
53 let logical = num_cpus::get();
54 let physical = num_cpus::get_physical();
55
56 let sem_limit = if logical > physical {
57 (logical) / (physical)
58 } else {
59 logical
60 };
61
62 let (sem_limit, sem_max) = if logical == physical {
63 (sem_limit * physical, 30 * multiplier)
64 } else {
65 (sem_limit * 2, 20 * multiplier)
66 };
67
68 sem_limit.max(sem_max)
69}
70
/// Exact phrases that identify a benign "enable JavaScript" interstitial page.
static JS_SAFE_CHALLENGE_PATTERNS: &[&str] = &[
    r#"Enable JavaScript and cookies to continue"#,
    r#"To continue, please enable JavaScript in your browser settings"#,
    r#"Please enable JavaScript to view the page content"#,
];
77
78pub fn is_safe_javascript_challenge(page: &Page) -> bool {
80 let page = page.get_html_bytes_u8();
81
82 let page_size = page.len();
83
84 if page_size == 0 || page_size > 10_000 {
85 return false;
86 }
87
88 AC_JS_CHALLENGE.find(page).is_some()
89}
90
/// Bind the client to a specific network interface on platforms where the
/// HTTP stack exposes `ClientBuilder::interface`.
#[cfg(all(
    any(
        target_os = "android",
        target_os = "fuchsia",
        target_os = "illumos",
        target_os = "ios",
        target_os = "linux",
        target_os = "macos",
        target_os = "solaris",
        target_os = "tvos",
        target_os = "visionos",
        target_os = "watchos",
    ),
    any(not(feature = "wreq"), feature = "cache_request")
))]
pub fn set_interface(client: ClientBuilder, network_interface: &str) -> ClientBuilder {
    client.interface(network_interface)
}
110
/// No-op fallback when interface binding is unsupported on the target
/// platform/feature combination; returns the builder unchanged.
#[cfg(not(any(
    all(feature = "wreq", not(feature = "cache_request")),
    target_os = "android",
    target_os = "fuchsia",
    target_os = "illumos",
    target_os = "ios",
    target_os = "linux",
    target_os = "macos",
    target_os = "solaris",
    target_os = "tvos",
    target_os = "visionos",
    target_os = "watchos",
)))]
pub fn set_interface(client: ClientBuilder, _interface: &str) -> ClientBuilder {
    client
}
128
lazy_static! {
    /// Matcher for the known JavaScript-challenge phrases.
    static ref AC_JS_CHALLENGE: aho_corasick::AhoCorasick = aho_corasick::AhoCorasick::new(JS_SAFE_CHALLENGE_PATTERNS).expect("safe challenges");
    /// Default number of concurrency permits derived from CPU topology.
    pub static ref DEFAULT_PERMITS: usize = calc_limits(1);
    /// Shared crawl semaphore; `SEMAPHORE_MULTIPLIER` scales the permit count.
    pub(crate) static ref SEM_SHARED: Arc<Semaphore> = {
        let base_limit = match std::env::var("SEMAPHORE_MULTIPLIER") {
            Ok(multiplier) => match multiplier.parse::<isize>() {
                // wrapping_mul + max(1) guard against overflow and
                // zero/negative multipliers.
                Ok(parsed_value) => (*DEFAULT_PERMITS as isize)
                    .wrapping_mul(parsed_value)
                    .max(1) as usize,
                Err(_) => *DEFAULT_PERMITS,
            },
            _ => *DEFAULT_PERMITS,
        };
        Arc::new(Semaphore::const_new(base_limit))
    };
    /// Max visited links held in memory before spilling to disk
    /// (env override: `LINKS_VISITED_MEMORY_LIMIT`).
    pub(crate) static ref LINKS_VISITED_MEMORY_LIMIT: usize = {
        const DEFAULT_LIMIT: usize = 15_000;

        match std::env::var("LINKS_VISITED_MEMORY_LIMIT") {
            Ok(limit) => limit.parse::<usize>().unwrap_or(DEFAULT_LIMIT),
            _ => DEFAULT_LIMIT
        }
    };
    /// Wildcard key ("*") used for site-wide crawl budgets.
    static ref WILD_CARD_PATH: CaseInsensitiveString = CaseInsensitiveString::from("*");
}
157
#[cfg(not(feature = "decentralized"))]
lazy_static! {
    /// Crawl semaphore; `SEMAPHORE_MULTIPLIER` scales the base permit count.
    static ref SEM: Semaphore = {
        let base_limit = calc_limits(1);

        let base_limit = match std::env::var("SEMAPHORE_MULTIPLIER") {
            Ok(multiplier) => match multiplier.parse::<isize>() {
                // max(1) guards against zero/negative multipliers.
                Ok(parsed_value) => (base_limit as isize * parsed_value).max(1) as usize,
                Err(_) => base_limit,
            },
            _ => base_limit,
        };

        Semaphore::const_new(base_limit)
    };
}
175
#[cfg(feature = "decentralized")]
lazy_static! {
    /// Unique worker endpoints from `SPIDER_WORKER_SCRAPER` and
    /// `SPIDER_WORKER` (comma separated, with localhost defaults).
    static ref WORKERS: HashSet<String> = {
        let mut set: HashSet<_> = HashSet::new();

        for worker in std::env::var("SPIDER_WORKER_SCRAPER")
            .unwrap_or_else(|_| "http://127.0.0.1:3031".to_string())
            .split(",")
        {
            set.insert(worker.to_string());
        }

        for worker in std::env::var("SPIDER_WORKER")
            .unwrap_or_else(|_| "http://127.0.0.1:3030".to_string())
            .split(",")
        {
            set.insert(worker.to_string());
        }

        set
    };
    /// Semaphore sized to the worker pool.
    static ref SEM: Semaphore = {
        let sem_limit = calc_limits(3);
        Semaphore::const_new(sem_limit * WORKERS.len())
    };
}
203
/// The overall progress/outcome state of a crawl run.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CrawlStatus {
    /// Crawl has not started yet.
    #[default]
    Start,
    /// Crawl finished or is waiting for work.
    Idle,
    /// Crawl is in progress.
    Active,
    /// Crawl was blocked by the target.
    Blocked,
    /// Crawl was blocked by the built-in firewall/blocklist check.
    FirewallBlocked,
    /// Target responded with a server error.
    ServerError,
    /// Could not connect to the target.
    ConnectError,
    /// Target rate-limited the crawl.
    RateLimited,
    /// Target returned no content.
    Empty,
    /// Target url was invalid.
    Invalid,
    /// Crawl was shut down via the control handle.
    #[cfg(feature = "control")]
    Shutdown,
    /// Crawl was paused via the control handle.
    #[cfg(feature = "control")]
    Paused,
}
238
/// Decision for a single discovered link.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProcessLinkStatus {
    /// The link may be crawled.
    #[default]
    Allowed,
    /// The link is blocked (visited, deny-listed, robots, or depth).
    Blocked,
    /// The crawl budget for this link's path is exhausted.
    BudgetExceeded,
}
251
/// The kind of job to run on a cron schedule.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CronType {
    /// Crawl collecting links only.
    #[default]
    Crawl,
    /// Scrape collecting page content as well.
    Scrape,
}
262
/// Meta information detected about a website during the crawl.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum WebsiteMetaInfo {
    /// The site requires JavaScript to render content.
    RequiresJavascript,
    /// Apache served a 403 page.
    Apache403,
    /// OpenResty served a 403 page.
    OpenResty403,
    /// No special meta information detected.
    #[default]
    None,
}
277
/// Hook invoked for each discovered link; may rewrite the link and its
/// optional associated value before processing.
pub type OnLinkFindCallback = Arc<
    dyn Fn(CaseInsensitiveString, Option<String>) -> (CaseInsensitiveString, Option<String>)
        + Send
        + Sync,
>;
284
/// Closure form of the should-crawl predicate; blanket-implemented for all
/// `Fn(&Page) -> bool + Send + Sync + 'static` closures.
pub trait OnShouldCrawlClosure: Fn(&Page) -> bool + Send + Sync + 'static {}
impl<F: Fn(&Page) -> bool + Send + Sync + 'static> OnShouldCrawlClosure for F {}
288
/// Predicate deciding whether a fetched page should be processed further.
#[derive(Clone)]
pub enum OnShouldCrawlCallback {
    /// Plain function pointer.
    Fn(fn(&Page) -> bool),
    /// Captured closure behind an `Arc`.
    Closure(Arc<dyn OnShouldCrawlClosure>),
}
298impl OnShouldCrawlCallback {
299 fn call(&self, page: &Page) -> bool {
300 match self {
301 Self::Fn(func) => func(page),
302 Self::Closure(closure) => closure(page),
303 }
304 }
305}
306
/// Round-robin rotation over a pool of HTTP clients.
#[derive(Clone)]
pub struct ClientRotator {
    // The pool of clients rotated through.
    clients: Vec<Client>,
    // Monotonic counter used to select the next client.
    index: Arc<AtomicUsize>,
}
314
315impl ClientRotator {
316 pub fn new(clients: Vec<Client>) -> Self {
318 Self {
319 clients,
320 index: Arc::new(AtomicUsize::new(0)),
321 }
322 }
323
324 pub fn next(&self) -> &Client {
326 let idx = self.index.fetch_add(1, Ordering::Relaxed) % self.clients.len();
327 &self.clients[idx]
328 }
329
330 pub fn len(&self) -> usize {
332 self.clients.len()
333 }
334
335 pub fn is_empty(&self) -> bool {
337 self.clients.is_empty()
338 }
339
340 #[cfg(feature = "hedge")]
343 pub fn next_pair(&self) -> (&Client, Option<&Client>) {
344 let len = self.clients.len();
345 if len <= 1 {
346 return (&self.clients[0], None);
347 }
348 let idx = self.index.fetch_add(2, Ordering::Relaxed);
349 let primary_idx = idx % len;
350 let hedge_idx = (idx + 1) % len;
351 (&self.clients[primary_idx], Some(&self.clients[hedge_idx]))
352 }
353}
354
/// Represents a website to crawl, gathering all links and/or page content.
#[derive(Clone, Default)]
pub struct Website {
    /// Configuration properties for the crawl.
    pub configuration: Box<Configuration>,
    /// Callback fired on each link found; may rewrite the link.
    pub on_link_find_callback: Option<OnLinkFindCallback>,
    /// Predicate deciding whether a fetched page should be processed further.
    pub on_should_crawl_callback: Option<OnShouldCrawlCallback>,
    /// Identifier scoping this crawl (prefixes the target id).
    pub crawl_id: Box<String>,
    /// Extra free-form information attached to the crawl.
    #[cfg(feature = "extra_information")]
    pub extra_info: Option<Box<String>>,
    // Pre-supplied HTML for the seed page — presumably used instead of
    // fetching it; TODO confirm against the crawl entry points.
    seed_html: Option<String>,
    // All links visited, kept in memory.
    links_visited: Box<ListBucket>,
    // Page content signatures (hashes) seen so far.
    signatures: Box<HashSet<u64>>,
    // Links discovered outside the normal crawl flow.
    extra_links: Box<HashSet<CaseInsensitiveString>>,
    // Pages gathered when scraping (None when crawling link-only).
    pages: Option<Vec<Page>>,
    // robots.txt parser, populated when `respect_robots_txt` is set.
    robot_file_parser: Option<Box<RobotFileParser>>,
    // The target url.
    url: Box<CaseInsensitiveString>,
    // Absolute parsed form of `url`, when it parses.
    domain_parsed: Option<Box<Url>>,
    // Broadcast channel for subscribing to crawled pages.
    channel: Option<(broadcast::Sender<Page>, Arc<broadcast::Receiver<Page>>)>,
    // Guard coordinating channel lifetime/shutdown.
    channel_guard: Option<ChannelGuard>,
    // Broadcast channel for queueing extra urls into the crawl.
    channel_queue: Option<(broadcast::Sender<String>, Arc<broadcast::Receiver<String>>)>,
    // Current crawl status.
    status: CrawlStatus,
    // Status code returned by the initial page.
    initial_status_code: StatusCode,
    // Anti-bot technology detected on the initial page.
    initial_anti_bot_tech: AntiBotTech,
    // HTML length of the initial page.
    initial_html_length: usize,
    // Whether the initial page tripped the WAF heuristics.
    initial_page_waf_check: bool,
    // Whether the initial page indicated a retry should occur.
    initial_page_should_retry: bool,
    // Request a shutdown of the crawl at the next checkpoint.
    shutdown: bool,
    // The HTTP client used for requests.
    client: Option<Client>,
    // Optional rotation over multiple clients.
    client_rotator: Option<Arc<ClientRotator>>,
    // SQLite-backed storage for visited links/signatures.
    #[cfg(feature = "disk")]
    sqlite: Option<Box<DatabaseHandler>>,
    // Whether disk storage is enabled for this crawl.
    #[cfg(feature = "disk")]
    enable_sqlite: bool,
    // Whether the subscription sender has been configured.
    send_configured: bool,
    // Meta information derived from the crawl (e.g. requires JavaScript).
    website_meta_info: WebsiteMetaInfo,
    // Skip fetching the initial page.
    skip_initial: bool,
    /// Shared cookie jar for the crawl.
    #[cfg(feature = "cookies")]
    pub cookie_jar: Arc<crate::client::cookie::Jar>,
}
435
436impl fmt::Debug for Website {
437 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
438 let domain_str = self.domain_parsed.as_ref().map(|u| u.as_str().to_owned());
439 let pages_len = self.pages.as_ref().map(|p| p.len()).unwrap_or(0);
440
441 let mut ds = f.debug_struct("Website");
442
443 ds.field("url", &self.url.as_ref())
444 .field("crawl_id", &self.crawl_id)
445 .field("domain_parsed", &domain_str)
446 .field(
448 "on_link_find_callback",
449 &self.on_link_find_callback.is_some(),
450 )
451 .field(
452 "on_should_crawl_callback",
453 &self.on_should_crawl_callback.is_some(),
454 )
455 .field("status", &self.status)
457 .field("shutdown", &self.shutdown)
458 .field("extra_links_len", &self.extra_links.len())
459 .field("signatures_len", &self.signatures.len())
460 .field("pages_len", &pages_len)
461 .field("channel_present", &self.channel.is_some())
463 .field("channel_queue_present", &self.channel_queue.is_some())
464 .field("client_present", &self.client.is_some())
465 .field("initial_status_code", &self.initial_status_code)
467 .field("initial_html_length", &self.initial_html_length)
468 .field("initial_anti_bot_tech", &self.initial_anti_bot_tech)
469 .field("initial_page_waf_check", &self.initial_page_waf_check)
470 .field("initial_page_should_retry", &self.initial_page_should_retry)
471 .field("send_configured", &self.send_configured)
473 .field("website_meta_info", &self.website_meta_info)
474 .field("skip_initial", &self.skip_initial);
475
476 #[cfg(feature = "disk")]
477 {
478 ds.field("sqlite_present", &self.sqlite.is_some())
479 .field("enable_sqlite", &self.enable_sqlite);
480 }
481
482 ds.finish()
483 }
484}
485
486impl Website {
487 fn _new(url: &str, check_firewall: bool) -> Self {
489 let url = url.trim();
490 let url: Box<CaseInsensitiveString> = if networking_capable(url) {
491 CaseInsensitiveString::new(&url).into()
492 } else {
493 CaseInsensitiveString::new(&prepare_url(url)).into()
494 };
495
496 let domain_parsed: Option<Box<Url>> = parse_absolute_url(&url);
497 let mut status = CrawlStatus::Start;
498
499 if let Some(u) = &domain_parsed {
500 if check_firewall && crate::utils::abs::block_website(u) {
501 status = CrawlStatus::FirewallBlocked;
502 }
503 }
504
505 Self {
506 configuration: Configuration::new().into(),
507 status,
508 domain_parsed,
509 url,
510 #[cfg(feature = "disk")]
511 enable_sqlite: true,
512 ..Default::default()
513 }
514 }
515
    /// Initialize a Website with a start link to crawl.
    pub fn new(url: &str) -> Self {
        Website::_new(url, true)
    }

    /// Initialize a Website with a start link, optionally skipping the
    /// built-in firewall/blocklist check on the domain.
    pub fn new_with_firewall(url: &str, check_firewall: bool) -> Self {
        Website::_new(url, check_firewall)
    }
525
    /// Create a fresh database handler scoped to this website's target id.
    #[cfg(feature = "disk")]
    pub fn setup_database_handler(&self) -> Box<DatabaseHandler> {
        Box::new(DatabaseHandler::new(&Some(self.target_id())))
    }

    /// Attach an externally created (shared) database handler.
    #[cfg(feature = "disk")]
    pub fn setup_shared_db(&mut self, db: Box<DatabaseHandler>) {
        self.sqlite = Some(db)
    }

    /// Lazily create the sqlite handler if one is not already attached.
    #[cfg(feature = "disk")]
    pub fn setup_sqlite(&mut self) {
        if self.sqlite.is_none() {
            self.sqlite = Some(self.setup_database_handler())
        }
    }
545
546 pub fn set_url(&mut self, url: &str) -> &mut Self {
548 let url = if url.starts_with(' ') || url.ends_with(' ') {
549 url.trim()
550 } else {
551 url
552 };
553
554 let domain: Box<CaseInsensitiveString> = if networking_capable(url) {
555 CaseInsensitiveString::new(&url).into()
556 } else {
557 CaseInsensitiveString::new(&prepare_url(url)).into()
558 };
559
560 self.domain_parsed = parse_absolute_url(&domain);
561 self.url = domain;
562 self
563 }
564
    /// Set the url without re-parsing the absolute domain.
    pub fn set_url_only(&mut self, url: &str) -> &mut Self {
        self.url = CaseInsensitiveString::new(&url).into();
        self
    }

    /// Unique identifier for this crawl target (crawl id + url).
    pub fn target_id(&self) -> String {
        string_concat!(self.crawl_id, self.url.inner())
    }
575
576 pub fn single_page(&self) -> bool {
578 match &self.configuration.inner_budget {
579 Some(b) => match b.get(&*WILD_CARD_PATH) {
580 Some(b) => b.eq(&1),
581 _ => false,
582 },
583 _ => false,
584 }
585 }
586
587 #[cfg(feature = "disk")]
589 pub fn setup_disk(&mut self) {
590 if self.enable_sqlite && self.sqlite.is_none() {
591 self.setup_sqlite();
592 }
593 if self.configuration.shared {
595 if let Some(sqlite) = self.sqlite.as_mut() {
596 sqlite.seeded = true;
597 }
599 }
600 }
601
602 #[cfg(feature = "disk")]
603 pub fn set_disk_persistance(&mut self, persist: bool) -> &mut Self {
605 if self.enable_sqlite && self.sqlite.is_some() {
606 if let Some(sqlite) = self.sqlite.as_mut() {
607 sqlite.persist = persist;
608 }
609 }
610 self
611 }
612
    /// No-op when the `disk` feature is not enabled.
    #[cfg(not(feature = "disk"))]
    pub fn setup_disk(&mut self) {}

    /// The robots.txt parser, if one was configured.
    pub fn get_robots_parser(&self) -> &Option<Box<RobotFileParser>> {
        &self.robot_file_parser
    }
621
    /// Whether the site appears to require JavaScript to render content.
    pub fn get_requires_javascript(&self) -> bool {
        self.website_meta_info == WebsiteMetaInfo::RequiresJavascript
    }

    /// Meta information gathered about the website during the crawl.
    pub fn get_website_meta_info(&self) -> &WebsiteMetaInfo {
        &self.website_meta_info
    }
631
    /// Whether the url has NOT been recorded in the disk store
    /// (true = new/allowed).
    #[cfg(feature = "disk")]
    pub async fn is_allowed_disk(&self, url_to_check: &str) -> bool {
        match &self.sqlite {
            Some(sqlite) => {
                // Until the handler is seeded the disk store is not
                // authoritative; treat everything as new.
                if !sqlite.ready() {
                    true
                } else {
                    let db_pool = sqlite.get_db_pool().await;
                    // `allowed` actually holds the *exists* result; the
                    // allow decision is its inverse.
                    let allowed = sqlite.url_exists(db_pool, url_to_check).await;

                    !allowed
                }
            }
            _ => true,
        }
    }

    /// Always allowed when disk storage is disabled.
    #[cfg(not(feature = "disk"))]
    pub async fn is_allowed_disk(&self, _url_to_check: &str) -> bool {
        true
    }
655
    /// Whether the signature has NOT been recorded in the disk store
    /// (true = new/allowed).
    #[cfg(feature = "disk")]
    pub async fn is_allowed_signature_disk(&self, signature_to_check: u64) -> bool {
        match &self.sqlite {
            Some(sqlite) => {
                // Until the handler is seeded the disk store is not
                // authoritative; treat everything as new.
                if !sqlite.ready() {
                    true
                } else {
                    let db_pool = sqlite.get_db_pool().await;

                    !sqlite.signature_exists(db_pool, signature_to_check).await
                }
            }
            _ => true,
        }
    }

    /// Always allowed when disk storage is disabled.
    #[cfg(not(feature = "disk"))]
    pub async fn is_allowed_signature_disk(&self, _signature_to_check: u64) -> bool {
        true
    }
678
679 pub async fn is_signature_allowed(&self, signature: u64) -> bool {
681 !self.signatures.contains(&signature) || self.is_allowed_signature_disk(signature).await
682 }
683
    /// Clear all persisted urls/signatures from the disk store.
    #[cfg(feature = "disk")]
    pub async fn clear_disk(&self) {
        if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                // Best-effort: failures clearing the table are ignored.
                let _ = DatabaseHandler::clear_table(sqlite.get_db_pool().await).await;
            }
        }
    }

    /// No-op when the `disk` feature is not enabled.
    #[cfg(not(feature = "disk"))]
    pub async fn clear_disk(&self) {}

    /// Whether a shared disk handler is in use for this crawl.
    #[cfg(feature = "disk")]
    pub(crate) fn shared_disk_enabled(&self) -> bool {
        self.configuration.shared && self.sqlite.is_some()
    }
703
    /// Persist a visited url to the disk store.
    #[cfg(feature = "disk")]
    pub async fn insert_url_disk(&self, new_url: &str) {
        if let Some(sqlite) = &self.sqlite {
            sqlite.insert_url(sqlite.get_db_pool().await, new_url).await
        }
    }

    /// Persist a page signature to the disk store.
    #[cfg(feature = "disk")]
    pub async fn insert_signature_disk(&self, signature: u64) {
        if let Some(sqlite) = &self.sqlite {
            sqlite
                .insert_signature(sqlite.get_db_pool().await, signature)
                .await
        }
    }
721
    /// Track a visited link, spilling to disk under memory pressure or when
    /// the in-memory limit is exceeded.
    #[cfg(feature = "disk")]
    pub async fn insert_link(&mut self, new_url: CaseInsensitiveString) {
        // Global memory pressure level — 1 and 2 appear to mean
        // elevated/critical; TODO confirm against detect_system.
        let mem_load = crate::utils::detect_system::get_global_memory_state().await;
        let beyond_memory_limits = self.links_visited.len() >= *LINKS_VISITED_MEMORY_LIMIT;
        let seed_check = mem_load == 2 || mem_load == 1 || beyond_memory_limits;

        // First spill: seed the disk store with the current in-memory set.
        if seed_check {
            let mut seeded = false;
            if let Some(sqlite) = &self.sqlite {
                if !sqlite.ready() {
                    let _ = self.seed().await;
                    seeded = true;
                }
            }
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.set_seeded(seeded);
            }
        }

        if mem_load == 2 || beyond_memory_limits || self.shared_disk_enabled() {
            // Critical pressure, over the limit, or shared store: go to disk.
            self.insert_url_disk(&new_url).await
        } else if mem_load == 1 {
            // Elevated pressure: keep only a small working set in memory.
            if self.links_visited.len() <= 100 {
                self.links_visited.insert(new_url);
            } else {
                self.insert_url_disk(&new_url).await
            }
        } else {
            self.links_visited.insert(new_url);
        }
    }

    /// Track a visited link in memory.
    #[cfg(not(feature = "disk"))]
    pub async fn insert_link(&mut self, link: CaseInsensitiveString) {
        self.links_visited.insert(link);
    }
760
    /// Track a page signature, spilling to disk under memory pressure or
    /// when the in-memory limit is exceeded (mirrors `insert_link`).
    #[cfg(feature = "disk")]
    pub async fn insert_signature(&mut self, new_signature: u64) {
        // Global memory pressure level — 1 and 2 appear to mean
        // elevated/critical; TODO confirm against detect_system.
        let mem_load = crate::utils::detect_system::get_global_memory_state().await;
        let beyond_memory_limits = self.signatures.len() >= *LINKS_VISITED_MEMORY_LIMIT;
        let seed_check = mem_load == 2 || mem_load == 1 || beyond_memory_limits;

        // First spill: seed the disk store with the current in-memory set.
        if seed_check {
            let mut seeded = false;
            if let Some(sqlite) = &self.sqlite {
                if !sqlite.ready() {
                    let _ = self.seed().await;
                    seeded = true;
                }
            }
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.set_seeded(seeded);
            }
        }

        if mem_load == 2 || beyond_memory_limits || self.shared_disk_enabled() {
            // Critical pressure, over the limit, or shared store: go to disk.
            self.insert_signature_disk(new_signature).await
        } else if mem_load == 1 {
            // Elevated pressure: keep only a small working set in memory.
            if self.signatures.len() <= 100 {
                self.signatures.insert(new_signature);
            } else {
                self.insert_signature_disk(new_signature).await
            }
        } else {
            self.signatures.insert(new_signature);
        }
    }

    /// Track a page signature in memory.
    #[cfg(not(feature = "disk"))]
    pub async fn insert_signature(&mut self, new_signature: u64) {
        self.signatures.insert(new_signature);
    }
799
    /// Seed the disk store from the in-memory visited set.
    #[cfg(feature = "disk")]
    pub async fn seed(&mut self) -> Result<(), sqlx::Error> {
        let links = self.get_links();

        if let Some(sqlite) = &self.sqlite {
            if let Ok(links) = sqlite.seed(sqlite.get_db_pool().await, links).await {
                // Replace the in-memory set with whatever the handler
                // returned — presumably the subset to keep hot; confirm in
                // DatabaseHandler::seed.
                self.links_visited.clear();

                for link in links {
                    self.links_visited.insert(link);
                }

                if let Some(sqlite) = self.sqlite.as_mut() {
                    sqlite.seeded = true;
                }
            }
        }

        Ok(())
    }
821
    /// Cooperative pause/shutdown gate used between crawl steps.
    ///
    /// Returns `true` to keep crawling, or `false` after awaiting the
    /// provided `shutdown` future.
    async fn handle_process<T>(
        &self,
        handle: &Option<Arc<AtomicI8>>,
        interval: &mut Interval,
        shutdown: T,
    ) -> bool
    where
        T: std::future::Future<Output = ()>,
    {
        if self.shutdown {
            (shutdown).await;
            false
        } else {
            match handle.as_ref() {
                Some(handle) => {
                    // State 1: paused — poll the interval until it changes.
                    while handle.load(Ordering::Relaxed) == 1 {
                        interval.tick().await;
                    }
                    // State 2: shutdown requested.
                    if handle.load(Ordering::Relaxed) == 2 {
                        (shutdown).await;
                        false
                    } else {
                        true
                    }
                }
                _ => true,
            }
        }
    }
852
853 #[inline]
862 #[cfg(not(feature = "regex"))]
863 pub fn is_allowed(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
864 let status = self.is_allowed_budgetless(link);
865
866 if status.eq(&ProcessLinkStatus::Allowed) && self.is_over_budget(link) {
867 return ProcessLinkStatus::BudgetExceeded;
868 }
869
870 status
871 }
872
873 #[inline]
882 #[cfg(feature = "regex")]
883 pub fn is_allowed(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
884 let status = self.is_allowed_budgetless(link);
885
886 if status.eq(&ProcessLinkStatus::Allowed) && self.is_over_budget(link) {
887 return ProcessLinkStatus::BudgetExceeded;
888 }
889 status
890 }
891
    /// Whether the link may be crawled, ignoring crawl budgets
    /// (visited-set, allow/deny lists, robots, and depth checks only).
    #[inline]
    #[cfg(not(feature = "regex"))]
    pub fn is_allowed_budgetless(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
        if self.links_visited.contains(link) {
            ProcessLinkStatus::Blocked
        } else {
            let status = self.is_allowed_default(link.inner());

            // Depth violations are reported as Blocked, not BudgetExceeded.
            if status.eq(&ProcessLinkStatus::Allowed) && self.is_over_depth(link) {
                return ProcessLinkStatus::Blocked;
            }

            status
        }
    }

    /// Whether the link may be crawled, ignoring crawl budgets (regex build).
    #[inline]
    #[cfg(feature = "regex")]
    pub fn is_allowed_budgetless(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
        if self.links_visited.contains(link) {
            ProcessLinkStatus::Blocked
        } else {
            let status = self.is_allowed_default(link);
            // Depth violations are reported as Blocked, not BudgetExceeded.
            if status.eq(&ProcessLinkStatus::Allowed) && self.is_over_depth(link) {
                return ProcessLinkStatus::Blocked;
            }
            status
        }
    }
935
    /// Whether the link passes the whitelist, blacklist, and robots.txt
    /// checks (regex build).
    #[inline]
    #[cfg(feature = "regex")]
    pub fn is_allowed_default(&self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
        let blacklist = self.configuration.get_blacklist_compiled();
        let whitelist = self.configuration.get_whitelist_compiled();

        // A non-empty whitelist blocks anything it does not match.
        let blocked_whitelist = !whitelist.is_empty() && !contains(whitelist, link.inner());
        let blocked_blacklist = !blacklist.is_empty() && contains(blacklist, link.inner());

        if blocked_whitelist || blocked_blacklist || !self.is_allowed_robots(link.as_ref()) {
            ProcessLinkStatus::Blocked
        } else {
            ProcessLinkStatus::Allowed
        }
    }

    /// Whether the link passes the whitelist, blacklist, and robots.txt
    /// checks.
    #[inline]
    #[cfg(not(feature = "regex"))]
    pub fn is_allowed_default(&self, link: &CompactString) -> ProcessLinkStatus {
        let whitelist = self.configuration.get_whitelist_compiled();
        let blacklist = self.configuration.get_blacklist_compiled();

        // A non-empty whitelist blocks anything it does not match.
        let blocked_whitelist = !whitelist.is_empty() && !contains(whitelist, link);
        let blocked_blacklist = !blacklist.is_empty() && contains(blacklist, link);

        if blocked_whitelist || blocked_blacklist || !self.is_allowed_robots(link) {
            ProcessLinkStatus::Blocked
        } else {
            ProcessLinkStatus::Allowed
        }
    }
977
    /// Whether robots.txt permits fetching the link. Always true when
    /// `respect_robots_txt` is off or no parser has been configured.
    pub fn is_allowed_robots(&self, link: &str) -> bool {
        if self.configuration.respect_robots_txt {
            if let Some(r) = &self.robot_file_parser {
                return r.can_fetch(
                    // Fall back to the wildcard agent when none is set.
                    match &self.configuration.user_agent {
                        Some(ua) => ua,
                        _ => "*",
                    },
                    link,
                );
            }
        }

        true
    }
996
997 pub(crate) fn is_over_inner_depth_budget(&mut self, link: &CaseInsensitiveString) -> bool {
999 let mut over = false;
1000
1001 if let Some(segments) = get_path_from_url(link)
1002 .strip_prefix('/')
1003 .map(|remainder| remainder.split('/'))
1004 {
1005 let mut depth: usize = 0;
1006
1007 for _ in segments {
1008 depth = depth.saturating_add(1);
1009 if depth > self.configuration.depth_distance {
1010 over = true;
1011 break;
1012 }
1013 }
1014 }
1015
1016 over
1017 }
1018
1019 #[cfg(feature = "sitemap")]
1021 pub(crate) fn is_over_wild_budget(
1022 &self,
1023 budget: &Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
1024 ) -> bool {
1025 let exceeded_wild_budget = if self.configuration.wild_card_budgeting {
1026 match budget {
1027 Some(budget) => match budget.get(&*WILD_CARD_PATH) {
1028 Some(budget) => budget.abs_diff(0) == 1,
1029 _ => false,
1030 },
1031 _ => false,
1032 }
1033 } else {
1034 false
1035 };
1036 exceeded_wild_budget
1037 }
1038
    /// Whether the link exceeds its crawl budget. Decrements the matching
    /// budget counters as a side effect, so each call "spends" budget.
    pub(crate) fn is_over_inner_budget(&mut self, link: &CaseInsensitiveString) -> bool {
        match self.configuration.inner_budget.as_mut() {
            Some(budget) => {
                // Wildcard ("*") budget applies to every page site-wide.
                let exceeded_wild_budget = if self.configuration.wild_card_budgeting {
                    match budget.get_mut(&*WILD_CARD_PATH) {
                        Some(budget) => {
                            // A remaining count of exactly 1 marks exhaustion;
                            // otherwise spend one unit.
                            if budget.abs_diff(0) == 1 {
                                true
                            } else {
                                *budget -= 1;
                                false
                            }
                        }
                        _ => false,
                    }
                } else {
                    false
                };

                // Skip the per-path walk when only the wildcard entry exists.
                let skip_paths = self.configuration.wild_card_budgeting && budget.len() == 1;
                let has_depth_control = self.configuration.depth_distance > 0;

                if !skip_paths && !exceeded_wild_budget {
                    let path_segments = get_path_from_url(link)
                        .strip_prefix('/')
                        .map(|remainder| remainder.split('/'));

                    match path_segments {
                        Some(segments) => {
                            // Accumulate segments so deeper paths match their
                            // corresponding budget entries as we descend.
                            let mut joint_segment = CaseInsensitiveString::default();
                            let mut over = false;
                            let mut depth: usize = 0;

                            for seg in segments {
                                if has_depth_control {
                                    depth = depth.saturating_add(1);
                                    if depth > self.configuration.depth_distance {
                                        over = true;
                                        break;
                                    }
                                }

                                joint_segment.push_str(seg);

                                if budget.contains_key(&joint_segment) {
                                    if let Some(budget) = budget.get_mut(&joint_segment) {
                                        // Exhausted path budget blocks the link;
                                        // otherwise spend one unit and continue.
                                        if budget.abs_diff(0) == 0 || *budget == 0 {
                                            over = true;
                                            break;
                                        } else {
                                            *budget -= 1;
                                            continue;
                                        }
                                    }
                                }
                            }

                            over
                        }
                        _ => false,
                    }
                } else {
                    exceeded_wild_budget
                }
            }
            _ => false,
        }
    }
1111
    /// Whether the link exceeds the configured depth limit (0 disables).
    pub(crate) fn is_over_depth(&mut self, link: &CaseInsensitiveString) -> bool {
        self.configuration.depth_distance > 0 && self.is_over_inner_depth_budget(link)
    }

    /// Whether the link exceeds its crawl budget (spends budget on check).
    pub(crate) fn is_over_budget(&mut self, link: &CaseInsensitiveString) -> bool {
        self.is_over_inner_budget(link)
    }

    /// Give back one unit of wildcard budget (saturating).
    #[cfg(all(feature = "agent", feature = "serde"))]
    pub(crate) fn restore_wildcard_budget(&mut self) {
        if self.configuration.wild_card_budgeting {
            if let Some(budget) = self.configuration.inner_budget.as_mut() {
                if let Some(counter) = budget.get_mut(&*WILD_CARD_PATH) {
                    *counter = counter.saturating_add(1);
                }
            }
        }
    }
1133
    /// Number of visited links held in memory.
    pub fn size(&self) -> usize {
        self.links_visited.len()
    }

    /// Total visited links (memory only in this build).
    #[cfg(not(feature = "disk"))]
    pub async fn get_size(&self) -> usize {
        self.links_visited.len()
    }
1144
    /// Total visited links across the disk store and memory.
    #[cfg(feature = "disk")]
    pub async fn get_size(&self) -> usize {
        let disk_count = if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                let disk_count = DatabaseHandler::count_records(sqlite.get_db_pool().await).await;

                disk_count.unwrap_or_default() as usize
            } else {
                0
            }
        } else {
            0
        };

        let mut mem_count = self.links_visited.len();

        // NOTE(review): subtracting the limit assumes the first
        // LINKS_VISITED_MEMORY_LIMIT in-memory entries were seeded to disk
        // and would otherwise be double-counted — confirm this matches the
        // spill/seed behavior in insert_link.
        if mem_count >= *LINKS_VISITED_MEMORY_LIMIT {
            mem_count -= *LINKS_VISITED_MEMORY_LIMIT;
        }

        disk_count + mem_count
    }
1168
    /// Drain the extra links discovered outside the normal crawl flow.
    pub fn drain_extra_links(&mut self) -> hashbrown::hash_set::Drain<'_, CaseInsensitiveString> {
        self.extra_links.drain()
    }
1173
    /// Set the status code returned by the initial page.
    pub fn set_initial_status_code(&mut self, initial_status_code: StatusCode) {
        self.initial_status_code = initial_status_code;
    }

    /// Status code returned by the initial page.
    pub fn get_initial_status_code(&self) -> &StatusCode {
        &self.initial_status_code
    }

    /// Set the HTML length of the initial page.
    pub fn set_initial_html_length(&mut self, initial_html_length: usize) {
        self.initial_html_length = initial_html_length;
    }

    /// HTML length of the initial page.
    pub fn get_initial_html_length(&self) -> usize {
        self.initial_html_length
    }

    /// Set the anti-bot technology detected on the initial page.
    pub fn set_initial_anti_bot_tech(&mut self, initial_anti_bot_tech: AntiBotTech) {
        self.initial_anti_bot_tech = initial_anti_bot_tech;
    }

    /// Anti-bot technology detected on the initial page.
    pub fn get_initial_anti_bot_tech(&self) -> &AntiBotTech {
        &self.initial_anti_bot_tech
    }

    /// Set whether the initial page tripped the WAF heuristics.
    pub fn set_initial_page_waf_check(&mut self, initial_page_waf_check: bool) {
        self.initial_page_waf_check = initial_page_waf_check;
    }

    /// Whether the initial page tripped the WAF heuristics.
    pub fn get_initial_page_waf_check(&self) -> bool {
        self.initial_page_waf_check
    }

    /// Set whether the initial page indicated a retry should occur.
    pub fn set_initial_page_should_retry(&mut self, initial_page_should_retry: bool) {
        self.initial_page_should_retry = initial_page_should_retry;
    }

    /// Whether the initial page indicated a retry should occur.
    pub fn get_initial_page_should_retry(&self) -> bool {
        self.initial_page_should_retry
    }
1223
    /// Drain the visited-link symbols (interner-backed builds).
    #[cfg(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    ))]
    pub fn drain_links(
        &mut self,
    ) -> hashbrown::hash_set::Drain<'_, string_interner::symbol::SymbolUsize> {
        self.links_visited.drain()
    }

    /// Drain the visited links.
    #[cfg(not(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    )))]
    pub fn drain_links(&mut self) -> hashbrown::hash_set::Drain<'_, CaseInsensitiveString> {
        self.links_visited.drain()
    }

    /// Drain the page signatures (interner-backed builds).
    #[cfg(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    ))]
    pub fn drain_signatures(&mut self) -> hashbrown::hash_set::Drain<'_, u64> {
        self.signatures.drain()
    }

    /// Drain the page signatures.
    #[cfg(not(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    )))]
    pub fn drain_signatures(&mut self) -> hashbrown::hash_set::Drain<'_, u64> {
        self.signatures.drain()
    }
1265
    /// Extend the extra links set and return a reference to it.
    pub fn set_extra_links(
        &mut self,
        extra_links: HashSet<CaseInsensitiveString>,
    ) -> &HashSet<CaseInsensitiveString> {
        self.extra_links.extend(extra_links);
        &self.extra_links
    }

    /// Extra links discovered outside the normal crawl flow.
    pub fn get_extra_links(&self) -> &HashSet<CaseInsensitiveString> {
        &self.extra_links
    }
1279
    /// Clear in-memory state and the disk store.
    pub async fn clear_all(&mut self) {
        self.clear();
        self.clear_disk().await;
    }

    /// Clear the in-memory links, signatures, pages, and extra links.
    pub fn clear(&mut self) {
        self.links_visited.clear();
        self.signatures.clear();
        self.pages.take();
        self.extra_links.clear();
    }
1293
    /// The configured HTTP client, if one was built.
    pub fn get_client(&self) -> &Option<Client> {
        &self.client
    }

    /// Pages gathered when scraping (None when crawling link-only).
    pub fn get_pages(&self) -> Option<&Vec<Page>> {
        self.pages.as_ref()
    }
1303
    /// Links persisted to disk (empty when the `disk` feature is off).
    #[cfg(not(feature = "disk"))]
    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
        Default::default()
    }

    /// Links persisted to disk; empty when no pool is initialized or the
    /// query fails.
    #[cfg(feature = "disk")]
    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
        if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                if let Ok(links) =
                    DatabaseHandler::get_all_resources(sqlite.get_db_pool().await).await
                {
                    links
                } else {
                    Default::default()
                }
            } else {
                Default::default()
            }
        } else {
            Default::default()
        }
    }
1329
1330 #[cfg(feature = "disk")]
1332 pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
1333 let mut l = self.get_links_disk().await;
1334 let m = self.links_visited.get_links();
1335
1336 l.extend(m);
1337
1338 l
1339 }
1340
    #[cfg(not(feature = "disk"))]
    /// All visited links. Without the `disk` feature this is exactly the
    /// in-memory visited set.
    pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
        self.get_links()
    }
1346
    /// Get the in-memory visited links as an owned set.
    pub fn get_links(&self) -> HashSet<CaseInsensitiveString> {
        self.links_visited.get_links()
    }
1351
    /// Get the parsed crawl origin URL, if the target URL parsed successfully.
    pub fn get_url_parsed(&self) -> &Option<Box<Url>> {
        &self.domain_parsed
    }
1356
    /// Get the target URL as originally supplied.
    pub fn get_url(&self) -> &CaseInsensitiveString {
        &self.url
    }
1361
    /// Get the configured inter-request delay (stored in milliseconds).
    pub fn get_delay(&self) -> Duration {
        Duration::from_millis(self.configuration.delay)
    }
1366
    /// Get the current crawl status.
    pub fn get_status(&self) -> &CrawlStatus {
        &self.status
    }
1371
    /// Set the crawl status, returning a reference to the new value.
    pub fn set_status(&mut self, status: CrawlStatus) -> &CrawlStatus {
        self.status = status;
        &self.status
    }
1377
    /// Reset the crawl status back to `Start`.
    pub fn reset_status(&mut self) -> &CrawlStatus {
        self.status = CrawlStatus::Start;
        &self.status
    }
1383
    /// Keep visited/extra links between runs by marking the crawl `Active`,
    /// which makes `setup` skip the `clear_all` step.
    pub fn persist_links(&mut self) -> &mut Self {
        self.status = CrawlStatus::Active;
        self
    }
1390
1391 pub fn get_absolute_path(&self, domain: Option<&str>) -> Option<Url> {
1393 if domain.is_some() {
1394 url::Url::parse(domain.unwrap_or_default())
1395 .ok()
1396 .map(|mut url| {
1397 if let Ok(mut path) = url.path_segments_mut() {
1398 path.clear();
1399 }
1400 url
1401 })
1402 } else if let Some(mut d) = self.domain_parsed.as_deref().cloned() {
1403 if let Ok(mut path) = d.path_segments_mut() {
1404 path.clear();
1405 }
1406 Some(d)
1407 } else {
1408 None
1409 }
1410 }
1411
    /// Signal the crawl to shut down.
    pub fn stop(&mut self) {
        self.shutdown = true;
    }
1416
    /// Clear the shutdown signal so crawling can proceed.
    pub fn start(&mut self) {
        self.shutdown = false;
    }
1421
    /// Fetch and apply the site's robots.txt rules when
    /// `respect_robots_txt` is enabled, adopting the crawl-delay directive
    /// (capped at 60s) for the configured user agent.
    pub async fn configure_robots_parser(&mut self, client: &Client) {
        if self.configuration.respect_robots_txt {
            let robot_file_parser = self
                .robot_file_parser
                .get_or_insert_with(RobotFileParser::new);

            // NOTE(review): `mtime() <= 4000` presumably acts as a
            // "not yet fetched" sentinel so robots.txt is only read once
            // per parser — confirm against RobotFileParser.
            if robot_file_parser.mtime() <= 4000 {
                let host_str = match &self.domain_parsed {
                    Some(domain) => domain.as_str(),
                    _ => self.url.inner(),
                };

                if !host_str.is_empty() {
                    // The parser is always handed a trailing-slash root.
                    if host_str.ends_with('/') {
                        robot_file_parser.read(client, host_str).await;
                    } else {
                        robot_file_parser
                            .read(client, &string_concat!(host_str, "/"))
                            .await;
                    }
                }
                // Honor crawl-delay, capped at 60_000 ms.
                if let Some(delay) =
                    robot_file_parser.get_crawl_delay(&self.configuration.user_agent)
                {
                    self.configuration.delay = delay.as_millis().min(60000) as u64;
                }
            }
        }
    }
1452
    /// Build a strict redirect policy that only follows redirects staying
    /// within the crawl target (same host, or same subdomain/TLD scope when
    /// those options are enabled), while allowing a small number of initial
    /// redirects so the entry URL can settle.
    pub fn setup_strict_policy(&self) -> Policy {
        use crate::client::redirect::Attempt;
        use crate::page::domain_name;
        use std::sync::atomic::AtomicU8;

        let default_policy = Policy::default();

        match self.domain_parsed.as_deref().cloned() {
            Some(host_s) => {
                // Allow one extra initial redirect when robots.txt is
                // fetched first.
                let initial_redirect_limit = if self.configuration.respect_robots_txt {
                    2
                } else {
                    1
                };
                let subdomains = self.configuration.subdomains;
                let tld = self.configuration.tld;
                // Only compute the registrable domain when TLD matching is on.
                let host_domain_name = if tld {
                    domain_name(&host_s).to_string()
                } else {
                    Default::default()
                };
                let redirect_limit = *self.configuration.redirect_limit;

                let custom_policy = {
                    // Counts off-target redirects taken at the start of the
                    // crawl; shared across attempts via the closure.
                    let initial_redirect = Arc::new(AtomicU8::new(0));

                    move |attempt: Attempt| {
                        // In-scope targets follow the default policy.
                        if tld && domain_name(attempt.url()) == host_domain_name
                            || subdomains
                                && attempt
                                    .url()
                                    .host_str()
                                    .unwrap_or_default()
                                    .ends_with(host_s.host_str().unwrap_or_default())
                            || attempt.url().host() == host_s.host()
                        {
                            default_policy.redirect(attempt)
                        } else if attempt.previous().len() > redirect_limit {
                            attempt.error("too many redirects")
                        } else if attempt.status().is_redirection()
                            && (0..initial_redirect_limit)
                                .contains(&initial_redirect.load(Ordering::Relaxed))
                        {
                            // Off-target, but still within the initial budget.
                            initial_redirect.fetch_add(1, Ordering::Relaxed);
                            default_policy.redirect(attempt)
                        } else {
                            attempt.stop()
                        }
                    }
                };
                Policy::custom(custom_policy)
            }
            // No parsed origin to scope against: fall back to the default.
            _ => default_policy,
        }
    }
1509
    /// Map the configured [`RedirectPolicy`] to a concrete client policy.
    pub fn setup_redirect_policy(&self) -> Policy {
        match self.configuration.redirect_policy {
            RedirectPolicy::Loose => Policy::limited(*self.configuration.redirect_limit),
            RedirectPolicy::None => Policy::none(),
            RedirectPolicy::Strict => self.setup_strict_policy(),
        }
    }
1518
    /// Expand the configured headers with generated defaults when
    /// `modify_headers` is on, extracting any `Referer` header into
    /// `configuration.referer` and storing the merged map back into the
    /// configuration.
    pub fn configure_headers(&mut self) {
        let mut headers: reqwest::header::HeaderMap = reqwest::header::HeaderMap::new();

        // Fall back to a generated UA when none is configured.
        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        if self.configuration.modify_headers {
            crate::utils::header_utils::extend_headers(
                &mut headers,
                user_agent,
                &self.configuration.headers,
                &None,
                &self.configuration.viewport,
                &self.domain_parsed,
            );

            if !headers.is_empty() {
                // Referer is tracked separately on the configuration, so
                // pull it out of the header map (only if not already set).
                if let Some(referer) = headers.remove(REFERER) {
                    if let Ok(v) = referer.to_str() {
                        if self.configuration.referer.is_none() && !v.is_empty() {
                            self.configuration.referer = Some(v.into())
                        }
                    }
                }
                self.configuration
                    .headers
                    .replace(Box::new(SerializableHeaderMap::from(headers)));
            }
        }
    }
1554
    #[cfg(all(
        any(not(feature = "wreq"), feature = "cache_request"),
        not(feature = "decentralized")
    ))]
    /// Build the base reqwest client builder: redirect policy, lenient
    /// HTTP/0.9–1.x parsing, timeouts (doubled when proxying), network
    /// interface / local address binding, keepalive for proxies, the
    /// user agent (only when headers don't already carry one), optional
    /// HTTP/2 prior knowledge, and the configured default headers.
    pub fn configure_base_client(&self) -> ClientBuilder {
        let policy = self.setup_redirect_policy();

        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        // Only apply the UA when the configured headers don't already set
        // one (checked under both canonical and literal names).
        let missing_agent = match &self.configuration.headers {
            Some(headers) => {
                !headers.contains_key(crate::client::header::USER_AGENT)
                    && !headers.contains_key("User-Agent")
            }
            _ => true,
        };

        // Proxied connections get doubled timeouts.
        let timeout_mult = if self.configuration.proxies.is_some() {
            2
        } else {
            1
        };

        let client = reqwest::Client::builder()
            .redirect(policy)
            .http09_responses()
            .http1_ignore_invalid_headers_in_responses(true)
            // Auto-referer only when no explicit referer is configured.
            .referer(self.configuration.referer.is_none())
            .connect_timeout(
                self.configuration
                    .default_http_connect_timeout
                    .unwrap_or(Duration::from_secs(24 * timeout_mult)),
            )
            .read_timeout(
                self.configuration
                    .default_http_read_timeout
                    .unwrap_or(Duration::from_secs(42 * timeout_mult)),
            )
            .http1_title_case_headers()
            .http1_allow_obsolete_multiline_headers_in_responses(true)
            .http1_allow_spaces_after_header_name_in_responses(true)
            .danger_accept_invalid_certs(self.configuration.accept_invalid_certs);

        let client = if let Some(network_interface) = &self.configuration.network_interface {
            set_interface(client, network_interface)
        } else {
            client
        };

        let client = if let Some(local_address) = &self.configuration.local_address {
            client.local_address(*local_address)
        } else {
            client
        };

        // Keep proxy tunnels alive between requests.
        let client = if self.configuration.proxies.is_none() {
            client
        } else {
            client.tcp_keepalive(Duration::from_secs(30))
        };

        let client = if missing_agent {
            client.user_agent(user_agent)
        } else {
            client
        };

        let client = if self.configuration.http2_prior_knowledge {
            client.http2_prior_knowledge()
        } else {
            client
        };

        crate::utils::header_utils::setup_default_headers(client, &self.configuration)
    }
1639
    #[cfg(all(
        feature = "wreq",
        not(feature = "decentralized"),
        not(feature = "cache_request")
    ))]
    /// Build the base client builder for the `wreq` backend: redirect
    /// policy, timeouts (doubled when proxying), local address binding,
    /// proxy keepalive, user agent when headers don't already carry one,
    /// optional browser emulation, and configured default headers.
    pub fn configure_base_client(&self) -> ClientBuilder {
        let policy = self.setup_redirect_policy();

        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        // Only apply the UA when headers don't already set one.
        let missing_agent = match &self.configuration.headers {
            Some(headers) => {
                !headers.contains_key(crate::client::header::USER_AGENT)
                    && !headers.contains_key("User-Agent")
            }
            _ => true,
        };

        // Proxied connections get doubled timeouts.
        let timeout_mult = if self.configuration.proxies.is_some() {
            2
        } else {
            1
        };

        let client = Client::builder()
            .redirect(policy)
            .referer(self.configuration.referer.is_none())
            .connect_timeout(
                self.configuration
                    .default_http_connect_timeout
                    .unwrap_or(Duration::from_secs(24 * timeout_mult)),
            )
            .read_timeout(
                self.configuration
                    .default_http_read_timeout
                    .unwrap_or(Duration::from_secs(42 * timeout_mult)),
            );

        let client = if let Some(local_address) = &self.configuration.local_address {
            client.local_address(*local_address)
        } else {
            client
        };

        let client = if self.configuration.proxies.is_none() {
            client
        } else {
            client.tcp_keepalive(Duration::from_secs(30))
        };

        let client = if missing_agent {
            client.user_agent(user_agent)
        } else {
            client
        };

        // wreq-specific: emulate a full browser fingerprint when configured.
        let client = if let Some(emulation) = self.configuration.emulation {
            client.emulation(emulation)
        } else {
            client
        };

        crate::utils::header_utils::setup_default_headers(client, &self.configuration)
    }
1708
1709 #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
1711 pub fn configure_http_client_builder(&self) -> ClientBuilder {
1712 let client = self.configure_base_client();
1713
1714 let mut client = match &self.configuration.request_timeout {
1715 Some(t) => client.timeout(**t),
1716 _ => client,
1717 };
1718
1719 let client = match &self.configuration.proxies {
1720 Some(proxies) => {
1721 let linux = cfg!(target_os = "linux");
1722 let ignore_plain_socks = proxies.len() >= 2 && linux;
1723 let replace_plain_socks = proxies.len() == 1 && linux;
1724
1725 for proxie in proxies.iter() {
1726 if proxie.ignore == crate::configuration::ProxyIgnore::Http {
1727 continue;
1728 }
1729
1730 let proxie = &proxie.addr;
1731 let socks = proxie.starts_with("socks://");
1732
1733 if ignore_plain_socks && socks {
1735 continue;
1736 }
1737
1738 if replace_plain_socks && socks {
1740 if let Ok(proxy) =
1741 crate::client::Proxy::all(proxie.replacen("socks://", "http://", 1))
1742 {
1743 client = client.proxy(proxy);
1744 }
1745 } else if let Ok(proxy) = crate::client::Proxy::all(proxie) {
1746 client = client.proxy(proxy);
1747 }
1748 }
1749
1750 client
1751 }
1752 _ => client,
1753 };
1754
1755 #[cfg(feature = "spider_cloud")]
1757 let client = if let Some(ref sc) = self.configuration.spider_cloud {
1758 if sc.uses_proxy() {
1759 match (
1760 crate::client::Proxy::all(&sc.proxy_url),
1761 reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
1762 ) {
1763 (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
1764 _ => client,
1765 }
1766 } else {
1767 client
1768 }
1769 } else {
1770 client
1771 };
1772
1773 let client = if crate::utils::connect::background_connect_threading() {
1774 client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
1775 } else {
1776 client
1777 };
1778
1779 let client = match self.configuration.concurrency_limit {
1780 Some(limit) => {
1781 client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
1782 }
1783 _ => client,
1784 };
1785
1786 self.configure_http_client_cookies(client)
1787 }
1788
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    /// Finish the HTTP client builder with request caching: timeout,
    /// proxies (with platform-specific socks handling), optional
    /// spider-cloud auth, cookies, connect threading, concurrency limit,
    /// then wrap in middleware that serves from the global cache when
    /// `configuration.cache` is enabled.
    pub fn configure_http_client_builder(&self) -> reqwest_middleware::ClientBuilder {
        use crate::utils::create_cache_key;
        let client = self.configure_base_client();

        let mut client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let client = match &self.configuration.proxies {
            Some(proxies) => {
                let linux = cfg!(target_os = "linux");
                // On linux: with multiple proxies, skip plain socks entries;
                // with a single socks proxy, rewrite it to http instead.
                let ignore_plain_socks = proxies.len() >= 2 && linux;
                let replace_plain_socks = proxies.len() == 1 && linux;

                for proxie in proxies.iter() {
                    if proxie.ignore == crate::configuration::ProxyIgnore::Http {
                        continue;
                    }
                    let proxie = &proxie.addr;

                    let socks = proxie.starts_with("socks://");

                    if ignore_plain_socks && socks {
                        continue;
                    }

                    if replace_plain_socks && socks {
                        if let Ok(proxy) =
                            crate::client::Proxy::all(proxie.replacen("socks://", "http://", 1))
                        {
                            client = client.proxy(proxy);
                        }
                    } else if let Ok(proxy) = crate::client::Proxy::all(proxie) {
                        client = client.proxy(proxy);
                    }
                }

                client
            }
            _ => client,
        };

        #[cfg(feature = "spider_cloud")]
        // Spider-cloud proxy with bearer auth, applied only when both the
        // proxy URL and the auth header are valid.
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = self.configure_http_client_cookies(client);

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        // NOTE(review): unwrap_unchecked assumes the reqwest builder cannot
        // fail with the options applied above — confirm build invariants.
        let client =
            reqwest_middleware::ClientBuilder::new(unsafe { client.build().unwrap_unchecked() });

        if self.configuration.cache {
            let mut cache_options = HttpCacheOptions::default();

            // Cache key includes the method and, when present, the
            // authorization token so responses are not shared across users.
            cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
                let mut auth_token = None;
                if let Some(auth) = req.headers.get("authorization") {
                    if let Ok(token) = auth.to_str() {
                        if !token.is_empty() {
                            auth_token = Some(token);
                        }
                    }
                }
                create_cache_key(req, Some(req.method.as_str()), auth_token)
            }));
            client.with(Cache(HttpCache {
                mode: CacheMode::Default,
                manager: CACACHE_MANAGER.clone(),
                options: cache_options,
            }))
        } else {
            client
        }
    }
1895
    #[cfg(all(not(feature = "decentralized"), feature = "cookies"))]
    /// Attach the shared cookie jar to the client, seeding it with the
    /// configured cookie string for the crawl origin when present.
    pub fn configure_http_client_cookies(
        &self,
        client: crate::client::ClientBuilder,
    ) -> crate::client::ClientBuilder {
        let client = client.cookie_provider(self.cookie_jar.clone());

        if !self.configuration.cookie_str.is_empty() {
            if let Some(url) = self.domain_parsed.as_ref() {
                self.cookie_jar
                    .add_cookie_str(&self.configuration.cookie_str, url);
            }
        }

        client
    }
1913
    #[cfg(all(not(feature = "decentralized"), not(feature = "cookies")))]
    /// Cookie support stub: passes the builder through unchanged when the
    /// `cookies` feature is disabled.
    pub fn configure_http_client_cookies(
        &self,
        client: crate::client::ClientBuilder,
    ) -> crate::client::ClientBuilder {
        client
    }
1922
    /// Replace the HTTP client with a caller-provided one.
    pub fn set_http_client(&mut self, client: Client) -> &Option<Client> {
        self.client = Some(client);
        &self.client
    }
1928
    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
    /// Build a client dedicated to one proxy entry (used for rotation).
    /// Returns `None` when the entry is marked to skip for HTTP or its
    /// address fails to parse as a proxy.
    fn build_single_proxy_client(
        &self,
        proxy: &crate::configuration::RequestProxy,
    ) -> Option<Client> {
        if proxy.ignore == crate::configuration::ProxyIgnore::Http {
            return None;
        }

        let client = self.configure_base_client();

        let client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let addr = &proxy.addr;
        let linux = cfg!(target_os = "linux");
        let socks = addr.starts_with("socks://");

        // On linux, rewrite socks addresses to http (mirrors the handling
        // in configure_http_client_builder).
        let client = if socks && linux {
            match crate::client::Proxy::all(addr.replacen("socks://", "http://", 1)) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        } else {
            match crate::client::Proxy::all(addr) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        };

        #[cfg(feature = "spider_cloud")]
        // Spider-cloud proxy with bearer auth when configured and valid.
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        let client = self.configure_http_client_cookies(client);
        // NOTE(review): unwrap_unchecked assumes the builder cannot fail
        // with the options applied above — confirm build invariants.
        unsafe { Some(client.build().unwrap_unchecked()) }
    }
1995
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    /// Build a cache-enabled client dedicated to one proxy entry (used for
    /// rotation). Returns `None` when the entry is marked to skip for HTTP
    /// or its address fails to parse as a proxy.
    fn build_single_proxy_client(
        &self,
        proxy: &crate::configuration::RequestProxy,
    ) -> Option<Client> {
        use crate::utils::create_cache_key;

        if proxy.ignore == crate::configuration::ProxyIgnore::Http {
            return None;
        }

        let client = self.configure_base_client();

        let client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let addr = &proxy.addr;
        let linux = cfg!(target_os = "linux");
        let socks = addr.starts_with("socks://");

        // On linux, rewrite socks addresses to http (mirrors the handling
        // in configure_http_client_builder).
        let client = if socks && linux {
            match crate::client::Proxy::all(addr.replacen("socks://", "http://", 1)) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        } else {
            match crate::client::Proxy::all(addr) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        };

        #[cfg(feature = "spider_cloud")]
        // Spider-cloud proxy with bearer auth when configured and valid.
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = self.configure_http_client_cookies(client);

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        // NOTE(review): unwrap_unchecked assumes the builder cannot fail
        // with the options applied above — confirm build invariants.
        let client =
            reqwest_middleware::ClientBuilder::new(unsafe { client.build().unwrap_unchecked() });

        if self.configuration.cache {
            let mut cache_options = HttpCacheOptions::default();

            // Cache key includes the method and, when present, the
            // authorization token so responses are not shared across users.
            cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
                let mut auth_token = None;
                if let Some(auth) = req.headers.get("authorization") {
                    if let Ok(token) = auth.to_str() {
                        if !token.is_empty() {
                            auth_token = Some(token);
                        }
                    }
                }
                create_cache_key(req, Some(req.method.as_str()), auth_token)
            }));

            Some(
                client
                    .with(Cache(HttpCache {
                        mode: CacheMode::Default,
                        manager: CACACHE_MANAGER.clone(),
                        options: cache_options,
                    }))
                    .build(),
            )
        } else {
            Some(client.build())
        }
    }
2094
    #[cfg(not(feature = "decentralized"))]
    /// Build one client per configured proxy for round-robin rotation.
    /// Returns `None` unless at least two usable proxy clients were built
    /// (rotation is pointless with fewer).
    fn build_rotated_clients(&self) -> Option<Arc<ClientRotator>> {
        let proxies = self.configuration.proxies.as_ref()?;
        if proxies.len() < 2 {
            return None;
        }
        let clients: Vec<Client> = proxies
            .iter()
            .filter_map(|proxy| self.build_single_proxy_client(proxy))
            .collect();
        if clients.len() < 2 {
            return None;
        }
        Some(Arc::new(ClientRotator::new(clients)))
    }
2111
    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
    /// Build the HTTP client from the configured builder.
    pub fn configure_http_client(&self) -> Client {
        let client = self.configure_http_client_builder();
        // NOTE(review): unwrap_unchecked assumes the reqwest builder cannot
        // fail with the options used here — confirm build invariants.
        unsafe { client.build().unwrap_unchecked() }
    }
2119
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    /// Build the cache-enabled HTTP client from the middleware builder.
    pub fn configure_http_client(&self) -> Client {
        let client = self.configure_http_client_builder();
        client.build()
    }
2126
    #[cfg(all(feature = "decentralized", not(feature = "cache_request")))]
    /// Build the HTTP client for decentralized crawling: all requests are
    /// routed through the configured `WORKERS` proxies, with the target
    /// host placed in the `Host` header and crawl scope hints encoded in a
    /// numeric `Referer` header.
    pub fn configure_http_client(&self) -> Client {
        use reqwest::header::{HeaderMap, HeaderValue};

        let mut headers = HeaderMap::new();

        let policy = self.setup_redirect_policy();

        let mut client = Client::builder()
            .user_agent(match &self.configuration.user_agent {
                Some(ua) => ua.as_str(),
                _ => &get_ua(self.configuration.only_chrome_agent()),
            })
            .redirect(policy)
            .tcp_keepalive(Duration::from_millis(500));

        // Scope hint for workers: 2 = tld, 1 = subdomains, 0 = neither.
        // NOTE(review): the first two arms both yield 2, so the
        // `tld && subdomains` arm is redundant with the `tld` arm.
        let referer = if self.configuration.tld && self.configuration.subdomains {
            2
        } else if self.configuration.tld {
            2
        } else if self.configuration.subdomains {
            1
        } else {
            0
        };

        if referer > 0 {
            // Numeric referer carries the scope hint to the worker.
            headers.insert(reqwest::header::REFERER, HeaderValue::from(referer));
        }

        if let Some(h) = &self.configuration.headers {
            headers.extend(h.inner().clone());
        }

        // Workers receive the crawl origin (sans trailing slash) as Host.
        if let Some(domain_url) = self.get_absolute_path(None) {
            let domain_url = domain_url.as_str();
            let domain_host = if domain_url.ends_with("/") {
                &domain_url[0..domain_url.len() - 1]
            } else {
                domain_url
            };
            if let Ok(value) = HeaderValue::from_str(domain_host) {
                headers.insert(reqwest::header::HOST, value);
            }
        }

        for worker in WORKERS.iter() {
            if let Ok(worker) = crate::client::Proxy::all(worker) {
                client = client.proxy(worker);
            }
        }

        if !self.configuration.modify_headers && self.configuration.modify_http_client_headers {
            if let Some(ua) = &self.configuration.user_agent {
                crate::utils::header_utils::extend_headers(
                    &mut headers,
                    ua,
                    &self.configuration.headers,
                    &None,
                    &self.configuration.viewport,
                    &self.domain_parsed,
                );
            }
        }

        // NOTE(review): unwrap_unchecked assumes the builder cannot fail
        // with the options applied above — confirm build invariants.
        unsafe {
            match &self.configuration.request_timeout {
                Some(t) => client.timeout(**t),
                _ => client,
            }
            .default_headers(headers)
            .build()
            .unwrap_unchecked()
        }
    }
2205
    #[cfg(all(feature = "decentralized", feature = "cache_request"))]
    /// Build the cache-enabled HTTP client for decentralized crawling:
    /// requests route through the `WORKERS` proxies, scope hints are
    /// encoded in a numeric `Referer` header, and responses are served
    /// from the global cache.
    pub fn configure_http_client(&mut self) -> Client {
        use crate::utils::create_cache_key;
        use reqwest::header::{HeaderMap, HeaderValue};
        use reqwest_middleware::ClientBuilder;

        let mut headers = HeaderMap::new();

        let policy = self.setup_redirect_policy();

        let mut client = reqwest::Client::builder()
            .user_agent(match &self.configuration.user_agent {
                Some(ua) => ua.as_str(),
                _ => get_ua(self.configuration.only_chrome_agent()),
            })
            .redirect(policy)
            .tcp_keepalive(Duration::from_millis(500));

        // Scope hint for workers: 2 = tld, 1 = subdomains, 0 = neither.
        // NOTE(review): the first two arms both yield 2, so the
        // `tld && subdomains` arm is redundant with the `tld` arm.
        let referer = if self.configuration.tld && self.configuration.subdomains {
            2
        } else if self.configuration.tld {
            2
        } else if self.configuration.subdomains {
            1
        } else {
            0
        };

        if referer > 0 {
            // Numeric referer carries the scope hint to the worker.
            headers.insert(reqwest::header::REFERER, HeaderValue::from(referer));
        }

        if let Some(h) = &self.configuration.headers {
            headers.extend(h.inner().clone());
        }

        // Workers receive the crawl origin (sans trailing slash) as Host.
        if let Some(domain_url) = self.get_absolute_path(None) {
            let domain_url = domain_url.as_str();
            let domain_host = if domain_url.ends_with("/") {
                &domain_url[0..domain_url.len() - 1]
            } else {
                domain_url
            };
            if let Ok(value) = HeaderValue::from_str(domain_host) {
                headers.insert(reqwest::header::HOST, value);
            }
        }

        for worker in WORKERS.iter() {
            if let Ok(worker) = crate::client::Proxy::all(worker) {
                client = client.proxy(worker);
            }
        }

        let mut cache_options = HttpCacheOptions::default();

        // Cache key includes the method and, when present, the
        // authorization token so responses are not shared across users.
        cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
            let mut auth_token = None;
            if let Some(auth) = req.headers.get("authorization") {
                if let Ok(token) = auth.to_str() {
                    if !token.is_empty() {
                        auth_token = Some(token);
                    }
                }
            }
            create_cache_key(req, Some(req.method.as_str()), auth_token)
        }));

        if !self.configuration.modify_headers && self.configuration.modify_http_client_headers {
            if let Some(ua) = &self.configuration.user_agent {
                crate::utils::header_utils::extend_headers(
                    &mut headers,
                    ua,
                    &self.configuration.headers,
                    &None,
                    &self.configuration.viewport,
                    &self.domain_parsed,
                );
            }
        }

        // NOTE(review): unwrap_unchecked assumes the builder cannot fail
        // with the options applied above — confirm build invariants.
        let client = ClientBuilder::new(unsafe {
            match &self.configuration.request_timeout {
                Some(t) => client.timeout(**t),
                _ => client,
            }
            .default_headers(headers)
            .build()
            .unwrap_unchecked()
        })
        .with(Cache(HttpCache {
            mode: CacheMode::Default,
            manager: CACACHE_MANAGER.clone(),
            options: cache_options,
        }));

        client.build()
    }
2306
    #[cfg(feature = "control")]
    /// Spawn the control-channel listener that maps global pause / resume /
    /// shutdown commands targeted at this crawl into an `AtomicI8` flag:
    /// 0 = run, 1 = paused, 2 = shutdown. Returns `None` when the control
    /// thread is disabled by configuration.
    pub fn configure_handler(&self) -> Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)> {
        use crate::utils::{Handler, CONTROLLER};

        if self.configuration.no_control_thread {
            None
        } else {
            let c: Arc<AtomicI8> = Arc::new(AtomicI8::new(0));
            let handle = c.clone();
            let target_id = self.target_id();

            let join_handle = crate::utils::spawn_task("control_handler", async move {
                // Watch the global controller channel for commands.
                let mut l = CONTROLLER.read().await.1.to_owned();

                while l.changed().await.is_ok() {
                    let n = &*l.borrow();
                    let (target, rest) = n;

                    // Only react to commands addressed to this crawl.
                    if target_id.eq_ignore_ascii_case(target) {
                        if rest == &Handler::Resume {
                            c.store(0, Ordering::Relaxed);
                        }
                        if rest == &Handler::Pause {
                            c.store(1, Ordering::Relaxed);
                        }
                        if rest == &Handler::Shutdown {
                            c.store(2, Ordering::Relaxed);
                        }
                    }
                }
            });

            Some((handle, join_handle))
        }
    }
2343
    #[cfg(not(feature = "control"))]
    /// Control-thread stub: no handler without the `control` feature.
    pub fn configure_handler(&self) -> Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)> {
        None
    }
2349
    #[cfg(all(feature = "chrome", feature = "chrome_intercept"))]
    /// Install Chrome request interception on the page per the configured
    /// intercept settings, returning the interception task handle when one
    /// was started.
    pub async fn setup_chrome_interception(
        &self,
        page: &chromiumoxide::Page,
    ) -> Option<tokio::task::JoinHandle<()>> {
        crate::features::chrome::setup_chrome_interception_base(
            page,
            self.configuration.chrome_intercept.enabled,
            &self.configuration.auth_challenge_response,
            self.configuration.chrome_intercept.block_visuals,
            self.url.inner(),
        )
        .await
    }
2365
    #[cfg(all(feature = "chrome", not(feature = "chrome_intercept")))]
    /// Interception stub when `chrome_intercept` is disabled: no task.
    pub async fn setup_chrome_interception(
        &self,
        _chrome_page: &chromiumoxide::Page,
    ) -> Option<tokio::task::JoinHandle<()>> {
        None
    }
2374
    /// Build the link-matching selectors for the target URL, honoring the
    /// configured subdomain and TLD scoping.
    pub fn setup_selectors(&self) -> RelativeSelectors {
        setup_website_selectors(
            self.get_url().inner(),
            AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
        )
    }
2382
    /// Common crawl setup: apply limits, disk store, and headers; start the
    /// background connect runtime; build (or reuse) the HTTP client and the
    /// proxy rotation pool; and spawn the control handler when enabled.
    pub fn setup_base(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
        self.determine_limits();
        self.setup_disk();
        self.configure_headers();

        crate::utils::connect::init_background_runtime();

        // Reuse a caller-supplied client when present.
        let client = match self.client.take() {
            Some(client) => client,
            _ => self.configure_http_client(),
        };

        #[cfg(not(feature = "decentralized"))]
        {
            self.client_rotator = self.build_rotated_clients();
        }

        (client, self.configure_handler())
    }
2403
    /// Full crawl setup: base setup, then either clear prior state (fresh
    /// crawl) or keep it when the status was left `Active` (see
    /// `persist_links`), and finally apply robots.txt rules.
    pub async fn setup(
        &mut self,
    ) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
        let setup = self.setup_base();
        if self.status != CrawlStatus::Active {
            self.clear_all().await;
        } else {
            // Resuming: skip the initial fetch when extra links were queued.
            self.skip_initial = !self.extra_links.is_empty();
        }
        self.configure_robots_parser(&setup.0).await;
        setup
    }
2417
    /// Build the per-crawl pacing primitives: a 10ms tick interval and the
    /// configured throttle delay, both pinned for use in the crawl loop.
    pub fn setup_crawl(
        &self,
    ) -> (
        std::pin::Pin<Box<tokio::time::Interval>>,
        std::pin::Pin<Box<Duration>>,
    ) {
        let interval = Box::pin(tokio::time::interval(Duration::from_millis(10)));
        let throttle = Box::pin(self.get_delay());

        (interval, throttle)
    }
2430
2431 #[cfg(feature = "glob")]
2433 pub fn get_expanded_links(&self, domain_name: &str) -> Vec<CaseInsensitiveString> {
2434 let mut expanded = crate::features::glob::expand_url(domain_name);
2435
2436 if expanded.is_empty() {
2437 if let Some(u) = self.get_absolute_path(Some(domain_name)) {
2438 expanded.push(u.as_str().into());
2439 }
2440 };
2441
2442 expanded
2443 }
2444
    /// Classify the initial crawl response into a `CrawlStatus` (and, for
    /// 403s, a `WebsiteMetaInfo` describing the blocker) so callers can
    /// tell blocked / rate-limited / errored crawls apart from empty ones.
    pub fn set_crawl_initial_status(
        &mut self,
        page: &crate::page::Page,
        links: &HashSet<CaseInsensitiveString>,
    ) {
        use crate::utils::{detect_open_resty_forbidden, APACHE_FORBIDDEN};

        // A 403 that yielded no links is treated as a block; inspect the
        // body to identify the blocking layer.
        if page.status_code == reqwest::StatusCode::FORBIDDEN && links.is_empty() {
            if is_safe_javascript_challenge(page) {
                self.website_meta_info = WebsiteMetaInfo::RequiresJavascript;
            } else if page.get_html_bytes_u8() == *APACHE_FORBIDDEN {
                self.website_meta_info = WebsiteMetaInfo::Apache403;
            } else if detect_open_resty_forbidden(page.get_html_bytes_u8()) {
                self.website_meta_info = WebsiteMetaInfo::OpenResty403;
            }
            self.status = CrawlStatus::Blocked;
        } else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
            self.status = CrawlStatus::RateLimited;
        } else if page.status_code.is_server_error() {
            self.status = CrawlStatus::ServerError;
        } else if page.is_empty() {
            // Distinguish transport failures (sentinel status codes) from a
            // genuinely empty successful response.
            if page.status_code == *UNKNOWN_STATUS_ERROR
                || page.status_code == *CHROME_UNKNOWN_STATUS_ERROR
            {
                self.status = CrawlStatus::ConnectError;
            } else {
                self.status = CrawlStatus::Empty;
            }
        }
    }
2476
2477 #[cfg(feature = "cmd")]
2479 pub async fn _crawl_establish_cmd(
2480 &mut self,
2481 cmd: std::path::PathBuf,
2482 cmd_args: Vec<String>,
2483 base: &mut RelativeSelectors,
2484 _ssg_build: bool,
2485 ) -> HashSet<CaseInsensitiveString> {
2486 if self.skip_initial {
2487 return Default::default();
2488 }
2489
2490 if !self
2491 .is_allowed_default(self.get_base_link())
2492 .eq(&ProcessLinkStatus::Allowed)
2493 {
2494 return HashSet::new();
2495 }
2496
2497 let url = self.url.inner();
2498
2499 let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
2500 let mut links_ssg = HashSet::new();
2501 let mut links_pages = if self.configuration.return_page_links {
2502 Some(HashSet::new())
2503 } else {
2504 None
2505 };
2506
2507 let mut page_links_settings =
2508 PageLinkBuildSettings::new(true, self.configuration.full_resources);
2509 page_links_settings.subdomains = self.configuration.subdomains;
2510 page_links_settings.tld = self.configuration.tld;
2511 page_links_settings.normalize = self.configuration.normalize;
2512
2513 let mut domain_parsed = self.domain_parsed.take();
2514
2515 let mut retry_count = self.configuration.retry;
2516 let mut last_err: Option<std::io::Error> = None;
2517
2518 let build_error_page = |status: StatusCode, _err: std::io::Error| {
2519 let mut p = Page::default();
2520 p.url = url.to_string();
2521 p.status_code = status;
2522 #[cfg(not(feature = "page_error_status_details"))]
2523 {
2524 p.error_status = Some(_err.to_string());
2525 }
2526 p
2527 };
2528
2529 let mut page: Page = loop {
2530 let bytes = match Self::run_via_cmd(&cmd, &cmd_args, url).await {
2531 Ok(b) => {
2532 if b.is_empty() {
2533 last_err = Some(std::io::Error::new(
2534 std::io::ErrorKind::UnexpectedEof,
2535 "cmd returned empty stdout",
2536 ));
2537 None
2538 } else {
2539 Some(b)
2540 }
2541 }
2542 Err(e) => {
2543 last_err = Some(e);
2544 None
2545 }
2546 };
2547
2548 if let Some(bytes) = bytes.as_deref() {
2549 let mut domain_parsed_out = None;
2550
2551 let page = Page::new_page_streaming_from_bytes(
2552 url,
2553 bytes,
2554 base,
2555 &self.configuration.external_domains_caseless,
2556 &page_links_settings,
2557 &mut links,
2558 Some(&mut links_ssg),
2559 &domain_parsed,
2560 &mut domain_parsed_out,
2561 &mut links_pages,
2562 )
2563 .await;
2564
2565 if self.domain_parsed.is_none() {
2566 if let Some(mut dp) = domain_parsed.take() {
2567 convert_abs_url(&mut dp);
2568 self.domain_parsed.replace(dp);
2569 } else if let Some(mut dp) = domain_parsed_out.take() {
2570 convert_abs_url(&mut dp);
2571 self.domain_parsed.replace(dp);
2572 }
2573 } else if self.domain_parsed.is_none() {
2574 self.domain_parsed = domain_parsed_out;
2575 }
2576
2577 if page.should_retry && retry_count > 0 {
2578 retry_count -= 1;
2579 if let Some(timeout) = page.get_timeout() {
2580 tokio::time::sleep(timeout).await;
2581 } else {
2582 tokio::time::sleep(std::time::Duration::from_millis(250)).await;
2583 }
2584 continue;
2585 }
2586
2587 break page;
2588 }
2589
2590 if retry_count == 0 {
2591 let err = last_err
2592 .take()
2593 .unwrap_or_else(|| std::io::Error::other("cmd fetch failed (unknown error)"));
2594 break build_error_page(StatusCode::BAD_GATEWAY, err);
2595 }
2596
2597 retry_count -= 1;
2598 tokio::time::sleep(std::time::Duration::from_millis(250)).await;
2599 };
2600
2601 if page.get_html_bytes_u8().starts_with(b"<?xml") {
2602 page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
2603 .await;
2604 }
2605
2606 emit_log(url);
2607
2608 if let Some(signature) = page.signature {
2609 if !self.is_signature_allowed(signature).await {
2610 return Default::default();
2611 }
2612 self.insert_signature(signature).await;
2613 }
2614
2615 let url_ci = match &self.on_link_find_callback {
2616 Some(cb) => cb(*self.url.clone(), None).0,
2617 _ => *self.url.clone(),
2618 };
2619 self.insert_link(url_ci).await;
2620
2621 if self.configuration.return_page_links {
2622 page.page_links = links_pages
2623 .filter(|pages: &HashSet<CaseInsensitiveString>| !pages.is_empty())
2624 .map(Box::new);
2625 }
2626
2627 links.extend(links_ssg);
2628
2629 self.initial_status_code = page.status_code;
2630 self.initial_html_length = page.get_html_bytes_u8().len();
2631 self.initial_anti_bot_tech = page.anti_bot_tech;
2632 self.initial_page_should_retry = page.should_retry;
2633 self.initial_page_waf_check = page.waf_check;
2634
2635 self.set_crawl_initial_status(&page, &links);
2636
2637 if let Some(ref cb) = self.on_should_crawl_callback {
2638 if !cb.call(&page) {
2639 page.blocked_crawl = true;
2640 channel_send_page(&self.channel, page, &self.channel_guard);
2641 return Default::default();
2642 }
2643 }
2644
2645 channel_send_page(&self.channel, page, &self.channel_guard);
2646
2647 links
2648 }
2649
    /// Establish the crawl over plain HTTP: fetch the start page, extract its
    /// links, and publish the page to subscribers.
    ///
    /// Seeded HTML (when configured) takes precedence over a network fetch.
    /// Retryable responses are re-fetched up to `configuration.retry` times,
    /// with gateway timeouts additionally bounded by `BACKOFF_MAX_DURATION`.
    /// Returns the links to continue crawling with, or an empty set when the
    /// base link is not allowed or `skip_initial` is set.
    #[cfg(not(feature = "glob"))]
    pub async fn _crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }

        if self
            .is_allowed_default(self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            let url = self.url.inner();

            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
            // Extra links collected by the SSG extraction path.
            let mut links_ssg = HashSet::new();
            // Per-page link capture, only allocated when the caller wants it back.
            let mut links_pages = if self.configuration.return_page_links {
                Some(HashSet::new())
            } else {
                None
            };
            let mut page_links_settings =
                PageLinkBuildSettings::new(true, self.configuration.full_resources);

            page_links_settings.subdomains = self.configuration.subdomains;
            page_links_settings.tld = self.configuration.tld;
            page_links_settings.normalize = self.configuration.normalize;

            // Take the parsed domain so it can be borrowed immutably while
            // `self.domain_parsed` is also passed as an output slot below;
            // restored after the fetch when nothing replaced it.
            let mut domain_parsed = self.domain_parsed.take();

            let mut page = if let Some(mut seeded_page) = self.build_seed_page() {
                // Seeded-HTML path: extract links directly from the given body.
                #[cfg(not(feature = "decentralized"))]
                {
                    let html_bytes = seeded_page.get_html_bytes_u8();
                    if !html_bytes.is_empty() && !auto_encoder::is_binary_file(html_bytes) {
                        let html = seeded_page.get_html();
                        let extracted_links: HashSet<CaseInsensitiveString> = seeded_page
                            .links_stream_base_ssg(base, &html, client, &self.domain_parsed)
                            .await;
                        links.extend(extracted_links);
                    }
                }
                seeded_page
            } else {
                Page::new_page_streaming(
                    url,
                    client,
                    false,
                    base,
                    &self.configuration.external_domains_caseless,
                    &page_links_settings,
                    &mut links,
                    Some(&mut links_ssg),
                    &domain_parsed,
                    &mut self.domain_parsed,
                    &mut links_pages,
                )
                .await
            };

            // Sitemap/XML documents need the dedicated XML link extractor.
            if page.get_html_bytes_u8().starts_with(b"<?xml") {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            // Restore the base domain taken above if the fetch did not set one.
            if self.domain_parsed.is_none() {
                if let Some(mut domain_parsed) = domain_parsed.take() {
                    convert_abs_url(&mut domain_parsed);
                    self.domain_parsed.replace(domain_parsed);
                }
            }

            let mut retry_count = self.configuration.retry;
            let domains_caseless = &self.configuration.external_domains_caseless;

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                // Honor the page-provided retry delay when present.
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }

                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Use a clone of the parsed domain so the bounded retry
                    // future does not need a mutable borrow of `self`.
                    let mut domain_parsed_clone = self.domain_parsed.clone();

                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        page.clone_from(
                            &Page::new_page_streaming(
                                url,
                                client,
                                false,
                                base,
                                domains_caseless,
                                &page_links_settings,
                                &mut links,
                                Some(&mut links_ssg),
                                &domain_parsed,
                                &mut domain_parsed_clone,
                                &mut links_pages,
                            )
                            .await,
                        );
                    })
                    .await
                    {
                        log::info!("backoff gateway timeout exceeded {elasped}");
                    }

                    // Fold the possibly-updated domain back into `self`.
                    self.domain_parsed = domain_parsed_clone;
                } else {
                    page.clone_from(
                        &Page::new_page_streaming(
                            url,
                            client,
                            false,
                            base,
                            &self.configuration.external_domains_caseless,
                            &page_links_settings,
                            &mut links,
                            Some(&mut links_ssg),
                            &domain_parsed,
                            &mut self.domain_parsed,
                            &mut links_pages,
                        )
                        .await,
                    );
                }
            }

            emit_log(url);

            // Skip duplicate content by page signature when one was computed.
            if let Some(signature) = page.signature {
                if !self.is_signature_allowed(signature).await {
                    return Default::default();
                }
                self.insert_signature(signature).await;
            }

            let url = match &self.on_link_find_callback {
                Some(cb) => cb(*self.url.clone(), None).0,
                _ => *self.url.clone(),
            };

            self.insert_link(url).await;

            if self.configuration.return_page_links {
                page.page_links = links_pages
                    .filter(|pages: &HashSet<CaseInsensitiveString>| !pages.is_empty())
                    .map(Box::new);
            }

            links.extend(links_ssg);

            // Record initial-crawl metrics used by status reporting.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            // Give the user callback a chance to veto further crawling.
            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
2830
2831 #[cfg(feature = "cmd")]
2833 pub async fn run_via_cmd(
2834 cmd: &std::path::Path,
2835 fixed_args: &[String],
2836 url: &str,
2837 ) -> std::io::Result<Vec<u8>> {
2838 use tokio::process::Command;
2839 let mut args: Vec<String> = Vec::with_capacity(fixed_args.len() + 1);
2840 let mut used_placeholder = false;
2841
2842 for a in fixed_args {
2843 if a.contains("{url}") {
2844 used_placeholder = true;
2845 args.push(a.replace("{url}", url));
2846 } else {
2847 args.push(a.clone());
2848 }
2849 }
2850
2851 if !used_placeholder {
2852 args.push(url.to_string());
2853 }
2854
2855 let out = Command::new(cmd)
2856 .args(&args)
2857 .kill_on_drop(true)
2858 .output()
2859 .await?;
2860
2861 if !out.status.success() {
2862 let code = out.status.code().unwrap_or(-1);
2863 let stderr = String::from_utf8_lossy(&out.stderr);
2864
2865 return Err(std::io::Error::other(format!(
2866 "cmd exit={code} stderr={stderr}"
2867 )));
2868 }
2869
2870 Ok(out.stdout)
2871 }
2872
    /// Run a full concurrent crawl using an external command as the fetcher.
    ///
    /// In single-page mode only the target URL is fetched and emitted. Otherwise
    /// the start page is established via `_crawl_establish_cmd` and discovered
    /// links are fetched concurrently on a `JoinSet`, gated by the configured
    /// semaphore and throttle, honoring budgets, crawl timeouts, shutdown
    /// handles, and the subscription queue.
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    #[cfg(feature = "cmd")]
    pub async fn crawl_concurrent_cmd(
        &mut self,
        cmd: std::path::PathBuf,
        cmd_args: Vec<String>,
        handle: &Option<Arc<AtomicI8>>,
    ) {
        self.start();
        self.status = CrawlStatus::Active;

        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        // Single-page mode: fetch only the target and emit it, no crawl loop.
        if self.single_page() {
            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
            let mut links_pages: Option<HashSet<CaseInsensitiveString>> =
                if self.configuration.return_page_links {
                    Some(HashSet::new())
                } else {
                    None
                };

            let mut relative_selectors = selector;
            let mut domain_parsed = None;

            let target = self
                .domain_parsed
                .as_ref()
                .map(|u| u.as_str())
                .unwrap_or(self.get_url());

            // On command failure emit a 502 placeholder page and stop.
            let bytes = match Self::run_via_cmd(&cmd, &cmd_args, target).await {
                Ok(b) => b,
                Err(_err) => {
                    let mut page = Page::default();
                    page.url = target.to_string();
                    page.status_code = StatusCode::BAD_GATEWAY;
                    #[cfg(not(feature = "page_error_status_details"))]
                    {
                        page.error_status = Some(_err.to_string());
                    }
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return;
                }
            };

            let page = Page::new_page_streaming_from_bytes(
                target,
                &bytes,
                &mut relative_selectors,
                &self.configuration.external_domains_caseless,
                &PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                &mut links,
                None,
                &self.domain_parsed,
                &mut domain_parsed,
                &mut links_pages,
            )
            .await;

            channel_send_page(&self.channel, page, &self.channel_guard);
            return;
        }

        let on_should_crawl_callback = self.on_should_crawl_callback.clone();
        let return_page_links = self.configuration.return_page_links;
        let full_resources = self.configuration.full_resources;
        // Optional external queue feeding extra links into the crawl.
        let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

        let (mut interval, throttle) = self.setup_crawl();
        let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

        // Establish the start page and seed the link frontier.
        links.extend(
            self._crawl_establish_cmd(cmd.clone(), cmd_args.clone(), &mut selector, false)
                .await,
        );

        self.configuration.configure_allowlist();
        let semaphore = self.setup_semaphore();

        // State shared with spawned fetch tasks, accessed positionally:
        // .0 cmd, .1 cmd_args, .2 selectors, .3 channel, .4 external domains,
        // .5 channel guard, .6 retry count, .7 return_page_links,
        // .8 link-build settings, .9 parsed domain, .10 on_link_find_callback.
        let shared = Arc::new((
            cmd,
            cmd_args,
            selector,
            self.channel.clone(),
            self.configuration.external_domains_caseless.clone(),
            self.channel_guard.clone(),
            self.configuration.retry,
            return_page_links,
            PageLinkBuildSettings::new_full(
                false,
                full_resources,
                self.configuration.subdomains,
                self.configuration.tld,
                self.configuration.normalize,
            ),
            self.domain_parsed.clone(),
            self.on_link_find_callback.clone(),
        ));

        // Each task yields the links it found plus the page signature (if any).
        let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

        let mut exceeded_budget = false;
        // A zero throttle means run fully concurrent with no inter-task delay.
        let concurrency = throttle.is_zero();

        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

        if !concurrency && !links.is_empty() {
            tokio::time::sleep(*throttle).await;
        }

        // Start the wall-clock only when a crawl timeout is configured.
        let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
            Some(Instant::now())
        } else {
            None
        };

        'outer: loop {
            #[cfg(all(feature = "agent", feature = "serde"))]
            self.apply_url_prefilter(&mut links).await;

            // Drain the current frontier into a stream for this round.
            let mut stream =
                tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

            loop {
                if !concurrency {
                    tokio::time::sleep(*throttle).await;
                }

                let semaphore = get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                tokio::select! {
                    biased;

                    // Take the next link only while permits remain and the
                    // crawl timeout has not expired.
                    Some(link) = stream.next(),
                    if semaphore.available_permits() > 0
                        && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) =>
                    {
                        // Shutdown path: drain in-flight tasks and stash the
                        // rest of the stream for a later resume.
                        if !self.handle_process(handle, &mut interval, async {
                            emit_log_shutdown(link.inner());
                            let permits = set.len();
                            set.shutdown().await;
                            semaphore.add_permits(permits);
                        }).await {
                            while let Some(links) = stream.next().await {
                                self.extra_links.insert(links);
                            }
                            break 'outer;
                        }

                        let allowed = self.is_allowed(&link);
                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                            exceeded_budget = true;
                            break;
                        }
                        if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                            continue;
                        }

                        emit_log(link.inner());
                        self.insert_link(link.clone()).await;

                        if let Ok(permit) = semaphore.clone().acquire_owned().await {
                            let shared = shared.clone();
                            let on_should_crawl_callback = on_should_crawl_callback.clone();
                            spawn_set("page_fetch_cmd", &mut set, async move {
                                // Allow the link-find callback to rewrite the link.
                                let link_result = match &shared.10 {
                                    Some(cb) => cb(link, None),
                                    _ => (link, None),
                                };

                                let mut out_links: HashSet<CaseInsensitiveString> = HashSet::new();
                                let mut links_pages = if shared.7 { Some(HashSet::new()) } else { None };

                                let mut relative_selectors = shared.2.clone();
                                let mut r_settings = shared.8;
                                r_settings.ssg_build = true;

                                let target_url = link_result.0.as_ref();

                                let mut retry_count = shared.6;
                                let mut last_err: Option<std::io::Error> = None;

                                // Retry the command until output arrives or the
                                // budget is exhausted; empty stdout counts as a
                                // failure.
                                let bytes = loop {
                                    match Self::run_via_cmd(&shared.0, &shared.1, target_url).await {
                                        Ok(b) if !b.is_empty() => break Some(b),
                                        Ok(_) => {
                                            last_err = Some(std::io::Error::new(
                                                std::io::ErrorKind::UnexpectedEof,
                                                "cmd returned empty stdout",
                                            ));
                                        }
                                        Err(e) => {
                                            last_err = Some(e);
                                        }
                                    }

                                    if retry_count == 0 { break None; }
                                    retry_count -= 1;

                                    tokio::time::sleep(std::time::Duration::from_millis(250)).await;
                                };

                                let mut domain_parsed = None;

                                // Parse the output, or synthesize a 502 page
                                // carrying the last error.
                                let mut page = if let Some(bytes) = bytes {
                                    Page::new_page_streaming_from_bytes(
                                        target_url,
                                        &bytes,
                                        &mut relative_selectors,
                                        &shared.4,
                                        &r_settings,
                                        &mut out_links,
                                        None,
                                        &shared.9,
                                        &mut domain_parsed,
                                        &mut links_pages,
                                    ).await
                                } else {
                                    let mut p = Page::default();
                                    p.url = target_url.to_string();
                                    p.status_code = StatusCode::BAD_GATEWAY;
                                    if let Some(_e) = last_err {
                                        #[cfg(not(feature = "page_error_status_details"))]
                                        {
                                            p.error_status = Some(_e.to_string());
                                        }
                                    }
                                    p
                                };

                                if shared.7 {
                                    page.page_links = links_pages
                                        .filter(|pages| !pages.is_empty())
                                        .map(Box::new);
                                }

                                // Veto hook: emit the blocked page and return
                                // no links so the crawl does not expand here.
                                if let Some(ref cb) = on_should_crawl_callback {
                                    if !cb.call(&page) {
                                        page.blocked_crawl = true;
                                        channel_send_page(&shared.3, page, &shared.5);
                                        drop(permit);
                                        return Default::default();
                                    }
                                }

                                let signature = page.signature;
                                channel_send_page(&shared.3, page, &shared.5);
                                drop(permit);

                                (out_links, signature)
                            });
                        }

                        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                    },

                    // Fold finished tasks back into the frontier, skipping
                    // pages whose signature marks them as duplicates.
                    Some(result) = set.join_next(), if !set.is_empty() => {
                        if let Ok(res) = result {
                            match res.1 {
                                Some(signature) => {
                                    if self.is_signature_allowed(signature).await {
                                        self.insert_signature(signature).await;
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                                _ => {
                                    self.links_visited.extend_links(&mut links, res.0);
                                }
                            }
                        } else {
                            break;
                        }
                    }

                    else => break,
                }

                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                // Round is done when the frontier and task set drain, or the
                // budget was exceeded — stash leftovers for a later resume.
                if (links.is_empty() && set.is_empty()) || exceeded_budget {
                    if exceeded_budget {
                        while let Some(links) = stream.next().await {
                            self.extra_links.insert(links);
                        }
                        while let Some(links) = set.join_next().await {
                            if let Ok(links) = links {
                                self.extra_links.extend(links.0);
                            }
                        }
                    }
                    break 'outer;
                }
            }

            // Wait for subscribers and pull any queued links before deciding
            // whether the crawl is truly finished.
            self.subscription_guard().await;
            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if links.is_empty() && set.is_empty() {
                break;
            }
        }

        // Preserve any unprocessed links for subsequent crawl calls.
        if !links.is_empty() {
            self.extra_links.extend(links);
        }
    }
3195
3196 #[allow(dead_code)]
3198 fn build_seed_page(&self) -> Option<Page> {
3199 if let Some(seeded_html) = self.get_seeded_html() {
3200 if crate::utils::is_cacheable_body_empty(seeded_html.as_bytes()) {
3201 return None;
3202 }
3203 let mut page_response = PageResponse::default();
3204 page_response.content = Some(Box::new(seeded_html.as_bytes().to_vec()));
3205 Some(build(self.url.inner(), page_response))
3206 } else {
3207 None
3208 }
3209 }
3210
    /// Establish the crawl with a chrome page: fetch the start page, handle
    /// proxy errors, retries, and redirects, and publish it to subscribers.
    ///
    /// Seeded HTML (when configured) is rendered via `Page::new_seeded`,
    /// otherwise the page is fetched with `Page::new`. A final redirect
    /// destination re-targets the website and rebuilds the selectors. Returns
    /// the links found, or an empty set when the base link is not allowed or
    /// `skip_initial` is set.
    #[cfg(all(
        not(feature = "decentralized"),
        feature = "chrome",
        not(feature = "glob")
    ))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
        chrome_page: &chromiumoxide::Page,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }

        if self
            .is_allowed_default(self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            // Configure chrome events and request interception concurrently.
            let (_, intercept_handle) = tokio::join!(
                crate::features::chrome::setup_chrome_events(chrome_page, &self.configuration),
                self.setup_chrome_interception(chrome_page)
            );

            let mut page = if let Some(seeded_html) = self.get_seeded_html() {
                Page::new_seeded(
                    self.url.inner(),
                    client,
                    chrome_page,
                    &self.configuration.wait_for,
                    &self.configuration.screenshot,
                    false,
                    &self.configuration.openai_config,
                    &self.configuration.execution_scripts,
                    &self.configuration.automation_scripts,
                    &self.configuration.viewport,
                    &self.configuration.request_timeout,
                    &self.configuration.track_events,
                    self.configuration.referer.clone(),
                    self.configuration.max_page_bytes,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                    Some(seeded_html.clone()),
                    Some(&self.cookie_jar),
                    &self.configuration.remote_multimodal,
                )
                .await
            } else {
                Page::new(
                    self.url.inner(),
                    client,
                    chrome_page,
                    &self.configuration.wait_for,
                    &self.configuration.screenshot,
                    false,
                    &self.configuration.openai_config,
                    &self.configuration.execution_scripts,
                    &self.configuration.automation_scripts,
                    &self.configuration.viewport,
                    &self.configuration.request_timeout,
                    &self.configuration.track_events,
                    self.configuration.referer.clone(),
                    self.configuration.max_page_bytes,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                    &self.configuration.remote_multimodal,
                )
                .await
            };

            let mut retry_count = self.configuration.retry;

            // A chrome-error destination with an empty "successful" body while
            // proxies are configured indicates a bad proxy; flag for retry.
            if let Some(final_redirect_destination) = &page.final_redirect_destination {
                if final_redirect_destination == "chrome-error://chromewebdata/"
                    && page.status_code.is_success()
                    && page.is_empty()
                    && self.configuration.proxies.is_some()
                {
                    page.error_status = Some("Invalid proxy configuration.".into());
                    page.should_retry = true;
                    page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                }
            }

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                // Honor the page-provided retry delay when present.
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }
                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Bound the gateway-timeout retry so it cannot hang.
                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        let next_page = Page::new(
                            self.url.inner(),
                            client,
                            chrome_page,
                            &self.configuration.wait_for,
                            &self.configuration.screenshot,
                            false,
                            &self.configuration.openai_config,
                            &self.configuration.execution_scripts,
                            &self.configuration.automation_scripts,
                            &self.configuration.viewport,
                            &self.configuration.request_timeout,
                            &self.configuration.track_events,
                            self.configuration.referer.clone(),
                            self.configuration.max_page_bytes,
                            self.configuration.get_cache_options(),
                            &self.configuration.cache_policy,
                            &self.configuration.remote_multimodal,
                        )
                        .await;
                        page.clone_from(&next_page);
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elasped}");
                    }
                } else {
                    let next_page = Page::new(
                        self.url.inner(),
                        client,
                        chrome_page,
                        &self.configuration.wait_for,
                        &self.configuration.screenshot,
                        false,
                        &self.configuration.openai_config,
                        &self.configuration.execution_scripts,
                        &self.configuration.automation_scripts,
                        &self.configuration.viewport,
                        &self.configuration.request_timeout,
                        &self.configuration.track_events,
                        self.configuration.referer.clone(),
                        self.configuration.max_page_bytes,
                        self.configuration.get_cache_options(),
                        &self.configuration.cache_policy,
                        &self.configuration.remote_multimodal,
                    )
                    .await;
                    page.clone_from(&next_page);
                }

                // Re-check the bad-proxy condition after each retry.
                if let Some(final_redirect_destination) = &page.final_redirect_destination {
                    if final_redirect_destination == "chrome-error://chromewebdata/"
                        && page.status_code.is_success()
                        && page.is_empty()
                        && self.configuration.proxies.is_some()
                    {
                        page.error_status = Some("Invalid proxy configuration.".into());
                        page.should_retry = true;
                        page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                    }
                }
            }

            // Join the interception handler, aborting if it exceeds 10s.
            if let Some(h) = intercept_handle {
                let abort_handle = h.abort_handle();
                if let Err(elasped) =
                    tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                {
                    log::warn!("Handler timeout exceeded {elasped}");
                    abort_handle.abort();
                }
            }

            // Re-target the website when the initial request was redirected.
            if let Some(domain) = &page.final_redirect_destination {
                let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
                let prior_domain = self.domain_parsed.take();
                self.domain_parsed = parse_absolute_url(&domain);
                self.url = domain;

                let s = self.setup_selectors();
                base.0 = s.0;
                base.1 = s.1;

                // Keep the prior host so links on the old domain still resolve.
                if let Some(pdname) = prior_domain {
                    if let Some(dname) = pdname.host_str() {
                        base.2 = dname.into();
                    }
                }
            }

            emit_log(self.url.inner());

            // NOTE(review): unlike `_crawl_establish`, this path records the
            // signature without an `is_signature_allowed` duplicate check —
            // confirm whether that is intentional.
            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            let url = match &self.on_link_find_callback {
                Some(cb) => cb(*self.url.clone(), None).0,
                _ => *self.url.clone(),
            };

            self.insert_link(url).await;

            if self.configuration.return_page_links && page.page_links.is_none() {
                page.page_links = Some(Box::default());
            }

            let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");

            // HTML pages go through the SSG-aware extractor; XML documents use
            // the dedicated XML link stream.
            let mut links = if !page.is_empty() && !xml_file {
                page.links_ssg(base, client, &self.domain_parsed).await
            } else {
                Default::default()
            };

            if xml_file {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            // Record initial-crawl metrics used by status reporting.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            // Give the user callback a chance to veto further crawling.
            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
3449
3450 #[cfg(all(not(feature = "decentralized"), feature = "chrome",))]
3452 pub async fn crawl_establish_chrome_one(
3453 &self,
3454 client: &Client,
3455 base: &mut RelativeSelectors,
3456 url: &Option<&str>,
3457 chrome_page: &chromiumoxide::Page,
3458 ) -> HashSet<CaseInsensitiveString> {
3459 if self
3460 .is_allowed_default(self.get_base_link())
3461 .eq(&ProcessLinkStatus::Allowed)
3462 {
3463 let (_, intercept_handle) = tokio::join!(
3464 crate::features::chrome::setup_chrome_events(chrome_page, &self.configuration),
3465 self.setup_chrome_interception(chrome_page)
3466 );
3467
3468 let mut page = Page::new(
3469 url.unwrap_or(self.url.inner()),
3470 client,
3471 chrome_page,
3472 &self.configuration.wait_for,
3473 &self.configuration.screenshot,
3474 false, &self.configuration.openai_config,
3476 &self.configuration.execution_scripts,
3477 &self.configuration.automation_scripts,
3478 &self.configuration.viewport,
3479 &self.configuration.request_timeout,
3480 &self.configuration.track_events,
3481 self.configuration.referer.clone(),
3482 self.configuration.max_page_bytes,
3483 self.configuration.get_cache_options(),
3484 &self.configuration.cache_policy,
3485 &self.configuration.remote_multimodal,
3486 )
3487 .await;
3488
3489 let mut retry_count = self.configuration.retry;
3490
3491 if let Some(final_redirect_destination) = &page.final_redirect_destination {
3492 if final_redirect_destination == "chrome-error://chromewebdata/"
3493 && page.status_code.is_success()
3494 && page.is_empty()
3495 && self.configuration.proxies.is_some()
3496 {
3497 page.error_status = Some("Invalid proxy configuration.".into());
3498 page.should_retry = true;
3499 page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
3500 }
3501 }
3502
3503 while page.should_retry && retry_count > 0 {
3504 retry_count -= 1;
3505 if let Some(timeout) = page.get_timeout() {
3506 tokio::time::sleep(timeout).await;
3507 }
3508 if page.status_code == StatusCode::GATEWAY_TIMEOUT {
3509 if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
3510 let next_page = Page::new(
3511 self.url.inner(),
3512 client,
3513 chrome_page,
3514 &self.configuration.wait_for,
3515 &self.configuration.screenshot,
3516 false, &self.configuration.openai_config,
3518 &self.configuration.execution_scripts,
3519 &self.configuration.automation_scripts,
3520 &self.configuration.viewport,
3521 &self.configuration.request_timeout,
3522 &self.configuration.track_events,
3523 self.configuration.referer.clone(),
3524 self.configuration.max_page_bytes,
3525 self.configuration.get_cache_options(),
3526 &self.configuration.cache_policy,
3527 &self.configuration.remote_multimodal,
3528 )
3529 .await;
3530 page.clone_from(&next_page);
3531 })
3532 .await
3533 {
3534 log::warn!("backoff timeout {elasped}");
3535 }
3536 } else {
3537 let next_page = Page::new(
3538 self.url.inner(),
3539 client,
3540 chrome_page,
3541 &self.configuration.wait_for,
3542 &self.configuration.screenshot,
3543 false, &self.configuration.openai_config,
3545 &self.configuration.execution_scripts,
3546 &self.configuration.automation_scripts,
3547 &self.configuration.viewport,
3548 &self.configuration.request_timeout,
3549 &self.configuration.track_events,
3550 self.configuration.referer.clone(),
3551 self.configuration.max_page_bytes,
3552 self.configuration.get_cache_options(),
3553 &self.configuration.cache_policy,
3554 &self.configuration.remote_multimodal,
3555 )
3556 .await;
3557 page.clone_from(&next_page);
3558 }
3559
3560 if let Some(final_redirect_destination) = &page.final_redirect_destination {
3562 if final_redirect_destination == "chrome-error://chromewebdata/"
3563 && page.status_code.is_success()
3564 && page.is_empty()
3565 && self.configuration.proxies.is_some()
3566 {
3567 page.error_status = Some("Invalid proxy configuration.".into());
3568 page.should_retry = true;
3569 page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
3570 }
3571 }
3572 }
3573
3574 if let Some(h) = intercept_handle {
3575 let abort_handle = h.abort_handle();
3576 if let Err(elasped) =
3577 tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
3578 {
3579 log::warn!("Handler timeout exceeded {elasped}");
3580 abort_handle.abort();
3581 }
3582 }
3583
3584 if let Some(domain) = &page.final_redirect_destination {
3585 let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
3586 let s = self.setup_selectors();
3587
3588 base.0 = s.0;
3589 base.1 = s.1;
3590
3591 if let Some(pdname) = parse_absolute_url(&domain) {
3592 if let Some(dname) = pdname.host_str() {
3593 base.2 = dname.into();
3594 }
3595 }
3596 }
3597
3598 emit_log(self.url.inner());
3599
3600 if self.configuration.return_page_links && page.page_links.is_none() {
3601 page.page_links = Some(Box::default());
3602 }
3603
3604 let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");
3605
3606 let mut links = if !page.is_empty() && !xml_file {
3607 page.links_ssg(base, client, &self.domain_parsed).await
3608 } else {
3609 Default::default()
3610 };
3611
3612 if xml_file {
3613 page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
3614 .await;
3615 }
3616
3617 if let Some(ref cb) = self.on_should_crawl_callback {
3618 if !cb.call(&page) {
3619 page.blocked_crawl = true;
3620 channel_send_page(&self.channel, page, &self.channel_guard);
3621 return Default::default();
3622 }
3623 }
3624
3625 channel_send_page(&self.channel, page, &self.channel_guard);
3626
3627 links
3628 } else {
3629 HashSet::new()
3630 }
3631 }
3632
3633 #[cfg(all(
3635 feature = "webdriver",
3636 not(feature = "decentralized"),
3637 not(feature = "chrome")
3638 ))]
3639 pub async fn crawl_establish_webdriver_one(
3640 &self,
3641 client: &Client,
3642 base: &mut RelativeSelectors,
3643 url: &Option<&str>,
3644 driver: &std::sync::Arc<thirtyfour::WebDriver>,
3645 ) -> HashSet<CaseInsensitiveString> {
3646 if self
3647 .is_allowed_default(self.get_base_link())
3648 .eq(&ProcessLinkStatus::Allowed)
3649 {
3650 let timeout = self
3651 .configuration
3652 .webdriver_config
3653 .as_ref()
3654 .and_then(|c| c.timeout);
3655
3656 crate::features::webdriver::setup_driver_events(driver, &self.configuration).await;
3658
3659 let mut page =
3660 Page::new_page_webdriver(url.unwrap_or(self.url.inner()), driver, timeout).await;
3661
3662 let mut retry_count = self.configuration.retry;
3663
3664 while page.should_retry && retry_count > 0 {
3665 retry_count -= 1;
3666 if let Some(timeout_duration) = page.get_timeout() {
3667 tokio::time::sleep(timeout_duration).await;
3668 }
3669 if page.status_code == StatusCode::GATEWAY_TIMEOUT {
3670 if let Err(elapsed) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
3671 let next_page =
3672 Page::new_page_webdriver(self.url.inner(), driver, timeout).await;
3673 page.clone_from(&next_page);
3674 })
3675 .await
3676 {
3677 log::warn!("backoff timeout {elapsed}");
3678 }
3679 } else {
3680 let next_page =
3681 Page::new_page_webdriver(self.url.inner(), driver, timeout).await;
3682 page.clone_from(&next_page);
3683 }
3684 }
3685
3686 if let Some(domain) = &page.final_redirect_destination {
3687 let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
3688 let s = self.setup_selectors();
3689
3690 base.0 = s.0;
3691 base.1 = s.1;
3692
3693 if let Some(pdname) = parse_absolute_url(&domain) {
3694 if let Some(dname) = pdname.host_str() {
3695 base.2 = dname.into();
3696 }
3697 }
3698 }
3699
3700 emit_log(self.url.inner());
3701
3702 if self.configuration.return_page_links && page.page_links.is_none() {
3703 page.page_links = Some(Box::default());
3704 }
3705
3706 let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");
3707
3708 let mut links = if !page.is_empty() && !xml_file {
3709 page.links_ssg(base, client, &self.domain_parsed).await
3710 } else {
3711 Default::default()
3712 };
3713
3714 if xml_file {
3715 page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
3716 .await;
3717 }
3718
3719 if let Some(ref cb) = self.on_should_crawl_callback {
3720 if !cb.call(&page) {
3721 page.blocked_crawl = true;
3722 channel_send_page(&self.channel, page, &self.channel_guard);
3723 return Default::default();
3724 }
3725 }
3726
3727 channel_send_page(&self.channel, page, &self.channel_guard);
3728
3729 links
3730 } else {
3731 HashSet::new()
3732 }
3733 }
3734
3735 #[cfg(all(not(feature = "glob"), feature = "decentralized"))]
3737 pub async fn crawl_establish(
3738 &mut self,
3739 client: &Client,
3740 _: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
3741 http_worker: bool,
3742 ) -> HashSet<CaseInsensitiveString> {
3743 let links: HashSet<CaseInsensitiveString> = if self
3745 .is_allowed_default(&self.get_base_link())
3746 .eq(&ProcessLinkStatus::Allowed)
3747 {
3748 let link = self.url.inner();
3749
3750 let mut page = Page::new_page_with_cache(
3751 &if http_worker && link.starts_with("https") {
3752 link.replacen("https", "http", 1)
3753 } else {
3754 link.to_string()
3755 },
3756 &client,
3757 self.configuration.get_cache_options(),
3758 &self.configuration.cache_policy,
3759 )
3760 .await;
3761
3762 if let Some(sid) = page.signature {
3763 self.insert_signature(sid).await;
3764 }
3765
3766 self.insert_link(match &self.on_link_find_callback {
3767 Some(cb) => cb(*self.url.to_owned(), None).0,
3768 _ => *self.url.to_owned(),
3769 })
3770 .await;
3771
3772 self.initial_status_code = page.status_code;
3773 self.initial_html_length = page.get_html_bytes_u8().len();
3774 self.initial_anti_bot_tech = page.anti_bot_tech;
3775 self.initial_page_should_retry = page.should_retry;
3776 self.initial_page_waf_check = page.waf_check;
3777
3778 if self.configuration.return_page_links {
3780 page.page_links = Some(page.links.clone().into());
3781 }
3782
3783 let links = HashSet::from(page.links.clone());
3784
3785 self.set_crawl_initial_status(&page, &links);
3786
3787 channel_send_page(&self.channel, page, &self.channel_guard);
3788
3789 links
3790 } else {
3791 HashSet::new()
3792 };
3793
3794 links
3795 }
3796
3797 #[cfg(all(feature = "glob", feature = "decentralized"))]
3799 pub async fn crawl_establish(
3800 &mut self,
3801 client: &Client,
3802 _: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
3803 http_worker: bool,
3804 ) -> HashSet<CaseInsensitiveString> {
3805 let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
3806 let expanded = self.get_expanded_links(self.url.inner().as_str());
3807 self.configuration.configure_allowlist();
3808
3809 for link in expanded {
3810 let allowed = self.is_allowed(&link);
3811
3812 if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
3813 break;
3814 }
3815 if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
3816 continue;
3817 }
3818
3819 let mut page = Page::new_page_with_cache(
3820 &if http_worker && link.as_ref().starts_with("https") {
3821 link.inner().replacen("https", "http", 1).to_string()
3822 } else {
3823 link.inner().to_string()
3824 },
3825 client,
3826 self.configuration.get_cache_options(),
3827 &self.configuration.cache_policy,
3828 )
3829 .await;
3830
3831 let u = page.get_url();
3832 let u = if u.is_empty() { link } else { u.into() };
3833
3834 let link_result = match &self.on_link_find_callback {
3835 Some(cb) => cb(u, None),
3836 _ => (u, None),
3837 };
3838
3839 if let Some(sid) = page.signature {
3840 self.insert_signature(sid).await;
3841 }
3842
3843 self.insert_link(link_result.0).await;
3844
3845 if self.configuration.return_page_links {
3846 page.page_links = Some(Default::default());
3847 }
3848
3849 channel_send_page(&self.channel, page.clone(), &self.channel_guard);
3850
3851 let page_links = page.links;
3852
3853 links.extend(page_links);
3854 }
3855
3856 links
3857 }
3858
    /// Glob + chrome establishment: expand the base URL into its glob variants
    /// and render each one through the shared Chrome page, collecting every
    /// discovered link.
    #[cfg(all(feature = "glob", feature = "chrome", not(feature = "decentralized")))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
        page: &chromiumoxide::Page,
    ) -> HashSet<CaseInsensitiveString> {
        // Respect an explicit request to skip the initial establishment pass.
        if self.skip_initial {
            return Default::default();
        }
        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
        let expanded = self.get_expanded_links(&self.url.inner().as_str());
        self.configuration.configure_allowlist();

        for link in expanded {
            let allowed = self.is_allowed(&link);

            // Budget exhaustion stops the whole expansion, not just this link.
            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                break;
            }
            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                continue;
            }

            // Render the candidate URL in the shared Chrome tab.
            let mut page = Page::new(
                &link.inner().as_str(),
                &client,
                &page,
                &self.configuration.wait_for,
                &self.configuration.screenshot,
                false, &self.configuration.openai_config,
                &self.configuration.execution_scripts,
                &self.configuration.automation_scripts,
                &self.configuration.viewport,
                &self.configuration.request_timeout,
                &self.configuration.track_events,
                self.configuration.referer.clone(),
                self.configuration.max_page_bytes,
                self.configuration.get_cache_options(),
                &self.configuration.cache_policy,
                &self.configuration.remote_multimodal,
            )
            .await;

            // Prefer the final URL reported by the page; fall back to the
            // candidate link when none is available.
            let u = page.get_url();
            let u = if u.is_empty() { link } else { u.into() };

            let link_result = match &self.on_link_find_callback {
                Some(cb) => cb(u, None),
                _ => (u, None),
            };

            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            self.insert_link(link_result.0).await;

            if self.configuration.return_page_links {
                // Populate page_links before extraction so the clone sent to
                // subscribers carries the discovered links.
                page.page_links = Some(Default::default());
                let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);

                channel_send_page(&self.channel, page.clone(), &self.channel_guard);

                links.extend(next_links);
            } else {
                // Without page-link tracking the page can be forwarded before
                // link extraction runs; the branch ordering is intentional.
                channel_send_page(&self.channel, page.clone(), &self.channel_guard);
                let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);

                links.extend(next_links);
            }
        }

        links
    }
3937
    /// Glob-expansion establishment over plain HTTP: stream each expanded URL,
    /// retrying gateway timeouts with a bounded backoff, and accumulate every
    /// discovered link.
    #[cfg(feature = "glob")]
    async fn _crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }
        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
        let domain_name = self.url.inner();
        let expanded = self.get_expanded_links(domain_name.as_str());

        self.configuration.configure_allowlist();

        for url in expanded {
            // With the regex feature the full case-insensitive URL is matched;
            // otherwise only the inner compact string is checked.
            #[cfg(feature = "regex")]
            let url_ref: &CaseInsensitiveString = &url;
            #[cfg(not(feature = "regex"))]
            let url_ref: &CompactString = url.inner();
            if self
                .is_allowed_default(url_ref)
                .eq(&ProcessLinkStatus::Allowed)
            {
                let mut links_ssg = HashSet::new();
                // Only track per-page links when the caller asked for them.
                let mut links_pages = if self.configuration.return_page_links {
                    Some(HashSet::new())
                } else {
                    None
                };
                let mut page_links_settings =
                    PageLinkBuildSettings::new(true, self.configuration.full_resources);

                page_links_settings.subdomains = self.configuration.subdomains;
                page_links_settings.tld = self.configuration.tld;
                page_links_settings.normalize = self.configuration.normalize;

                // Temporarily take the parsed domain so the streaming fetch can
                // update it in place (e.g. after a redirect).
                let mut domain_parsed = self.domain_parsed.take();

                let mut page = Page::new_page_streaming(
                    &url,
                    client,
                    false,
                    base,
                    &self.configuration.external_domains_caseless,
                    &page_links_settings,
                    &mut links,
                    Some(&mut links_ssg),
                    &domain_parsed, &mut self.domain_parsed,
                    &mut links_pages,
                )
                .await;

                // Restore the parsed domain if the fetch did not set a new one.
                if self.domain_parsed.is_none() {
                    if let Some(mut domain_parsed) = domain_parsed.take() {
                        convert_abs_url(&mut domain_parsed);
                        self.domain_parsed.replace(domain_parsed);
                    }
                }

                let mut retry_count = self.configuration.retry;
                let domains_caseless = &self.configuration.external_domains_caseless;

                while page.should_retry && retry_count > 0 {
                    retry_count -= 1;
                    if let Some(timeout) = page.get_timeout() {
                        tokio::time::sleep(timeout).await;
                    }

                    if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                        // Work on a clone of the parsed domain so a timed-out
                        // refetch cannot leave it half-updated.
                        let mut domain_parsed_clone = self.domain_parsed.clone();

                        // Bound the backoff retry so a stalled upstream cannot
                        // hang the establishment pass.
                        if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                            page.clone_from(
                                &Page::new_page_streaming(
                                    &url,
                                    client,
                                    false,
                                    base,
                                    domains_caseless,
                                    &page_links_settings,
                                    &mut links,
                                    Some(&mut links_ssg),
                                    &domain_parsed,
                                    &mut domain_parsed_clone,
                                    &mut links_pages,
                                )
                                .await,
                            );
                        })
                        .await
                        {
                            log::info!("backoff gateway timeout exceeded {elasped}");
                        }

                        self.domain_parsed = domain_parsed_clone;
                    } else {
                        page.clone_from(
                            &Page::new_page_streaming(
                                &url,
                                client,
                                false,
                                base,
                                &self.configuration.external_domains_caseless,
                                &page_links_settings,
                                &mut links,
                                Some(&mut links_ssg),
                                &domain_parsed,
                                &mut self.domain_parsed,
                                &mut links_pages,
                            )
                            .await,
                        );
                    }
                }

                emit_log(&url);

                // Duplicate-content detection: bail out entirely when the page
                // signature was already seen and is not allowed again.
                if let Some(signature) = page.signature {
                    if !self.is_signature_allowed(signature).await {
                        return Default::default();
                    }
                    self.insert_signature(signature).await;
                }

                self.insert_link(
                    self.on_link_find_callback
                        .as_ref()
                        .map(|cb| cb(*self.url.clone(), None).0)
                        .unwrap_or_else(|| *self.url.clone()),
                )
                .await;

                if self.configuration.return_page_links {
                    page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
                }

                links.extend(links_ssg);

                // Record the initial response characteristics for status checks.
                self.initial_status_code = page.status_code;
                self.initial_html_length = page.get_html_bytes_u8().len();
                self.initial_anti_bot_tech = page.anti_bot_tech;
                self.initial_page_should_retry = page.should_retry;
                self.initial_page_waf_check = page.waf_check;

                self.set_crawl_initial_status(&page, &links);

                if let Some(ref cb) = self.on_should_crawl_callback {
                    if !cb.call(&page) {
                        page.blocked_crawl = true;
                        channel_send_page(&self.channel, page, &self.channel_guard);
                        return Default::default();
                    }
                }

                channel_send_page(&self.channel, page, &self.channel_guard);
            }
        }

        links
    }
4102
    /// Smart-mode establishment: fetch the base page over HTTP first and
    /// escalate to a Chrome render when retries or client errors suggest the
    /// raw response is insufficient.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    pub async fn crawl_establish_smart(
        &mut self,
        client: &Client,
        mut base: &mut RelativeSelectors,
        browser: &crate::features::chrome::OnceBrowser,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }

        let links: HashSet<CaseInsensitiveString> = if self
            .is_allowed_default(&self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            let url = self.url.inner();

            // A pre-seeded page (e.g. provided HTML) bypasses the network fetch.
            let mut page = if let Some(seeded_page) = self.build_seed_page() {
                seeded_page
            } else {
                Page::new_page_with_cache(
                    &url,
                    &client,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                )
                .await
            };

            let mut retry_count = self.configuration.retry;

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }
                let client_error = page.status_code.is_client_error();

                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Bounded backoff: alternate between a Chrome render and a
                    // plain refetch across retries (power-of-two heuristic).
                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        if retry_count.is_power_of_two() {
                            Website::render_chrome_page(
                                &self.configuration,
                                client,
                                &mut page,
                                url,
                                &self.domain_parsed,
                                browser,
                            )
                            .await;
                        } else {
                            let next_page = Page::new_page_with_cache(
                                url,
                                &client,
                                self.configuration.get_cache_options(),
                                &self.configuration.cache_policy,
                            )
                            .await;
                            page.clone_from(&next_page);
                        };
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elasped}");
                    }
                } else {
                    // Client errors escalate straight to Chrome rendering.
                    if retry_count.is_power_of_two() || client_error {
                        Website::render_chrome_page(
                            &self.configuration,
                            client,
                            &mut page,
                            url,
                            &self.domain_parsed,
                            browser,
                        )
                        .await
                    } else {
                        page.clone_from(
                            &Page::new_page_with_cache(
                                url,
                                &client,
                                self.configuration.get_cache_options(),
                                &self.configuration.cache_policy,
                            )
                            .await,
                        );
                    }
                }
            }

            // Smart link extraction may itself drive the browser; it reports
            // any extra bytes transferred during rendering.
            let (page_links, bytes_transferred): (HashSet<CaseInsensitiveString>, Option<f64>) =
                page.smart_links(
                    &base,
                    &self.configuration,
                    &self.domain_parsed,
                    &browser,
                    Some(&self.cookie_jar),
                )
                .await;

            // Rebind selectors and the stored URL when the page redirected.
            if let Some(domain) = &page.final_redirect_destination {
                let prior_domain = self.domain_parsed.take();
                crate::utils::modify_selectors(
                    &prior_domain,
                    domain,
                    &mut self.domain_parsed,
                    &mut self.url,
                    &mut base,
                    AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
                );
            }

            emit_log(&self.url.inner());

            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            self.insert_link(
                self.on_link_find_callback
                    .as_ref()
                    .map(|cb| cb(*self.url.clone(), None).0)
                    .unwrap_or_else(|| *self.url.clone()),
            )
            .await;

            let links = if !page_links.is_empty() {
                page_links
            } else {
                Default::default()
            };

            page.bytes_transferred = bytes_transferred;

            // Record the initial response characteristics for status checks.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            if self.configuration.return_page_links {
                page.page_links = if links.is_empty() {
                    None
                } else {
                    Some(Box::new(links.clone()))
                };
            }

            if let Some(cb) = &mut self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        };

        links
    }
4271
    /// Render `url` in a Chrome tab and replace `page` with the rendered
    /// result. Lazily initializes the shared browser on first use; failure to
    /// obtain a browser or tab leaves `page` untouched.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    pub async fn render_chrome_page(
        config: &Configuration,
        client: &Client,
        page: &mut Page,
        url: &str,
        base: &Option<Box<Url>>,
        browser: &crate::features::chrome::OnceBrowser,
    ) {
        if let Some(browser_controller) = browser
            .get_or_init(|| crate::website::Website::setup_browser_base(&config, &base, None))
            .await
        {
            if let Ok(chrome_page) = crate::features::chrome::attempt_navigation(
                "about:blank",
                &browser_controller.browser.0,
                &config.request_timeout,
                &browser_controller.browser.2,
                &config.viewport,
            )
            .await
            {
                // Wire up page events and request interception concurrently.
                let (_, intercept_handle) = tokio::join!(
                    crate::features::chrome::setup_chrome_events(&chrome_page, &config),
                    crate::features::chrome::setup_chrome_interception_base(
                        &chrome_page,
                        config.chrome_intercept.enabled,
                        &config.auth_challenge_response,
                        config.chrome_intercept.block_visuals,
                        &url,
                    )
                );

                let next_page = Page::new(
                    &url,
                    &client,
                    &chrome_page,
                    &config.wait_for,
                    &config.screenshot,
                    false, &config.openai_config,
                    &config.execution_scripts,
                    &config.automation_scripts,
                    &config.viewport,
                    &config.request_timeout,
                    &config.track_events,
                    config.referer.clone(),
                    config.max_page_bytes,
                    config.get_cache_options(),
                    &config.cache_policy,
                    &config.remote_multimodal,
                )
                .await;

                page.clone_from(&next_page);

                // Give the interception task a bounded window to finish, then
                // abort it so it cannot leak.
                if let Some(h) = intercept_handle {
                    let abort_handle = h.abort_handle();
                    if let Err(elasped) =
                        tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                    {
                        log::warn!("Handler timeout exceeded {elasped}");
                        abort_handle.abort();
                    }
                }
            }
        }
    }
4341
4342 pub fn set_crawl_status(&mut self) {
4344 if self.status == CrawlStatus::Start || self.status == CrawlStatus::Active {
4345 self.status = if self.domain_parsed.is_none() {
4346 CrawlStatus::Invalid
4347 } else {
4348 CrawlStatus::Idle
4349 };
4350 }
4351 }
4352
4353 pub fn setup_semaphore(&self) -> Arc<Semaphore> {
4355 if self.configuration.shared_queue {
4356 SEM_SHARED.clone()
4357 } else {
4358 Arc::new(Semaphore::const_new(
4359 self.configuration
4360 .concurrency_limit
4361 .unwrap_or(*DEFAULT_PERMITS),
4362 ))
4363 }
4364 }
4365
4366 #[cfg(any(
4369 feature = "cache",
4370 feature = "cache_mem",
4371 feature = "chrome_remote_cache"
4372 ))]
4373 async fn try_cache_shortcircuit(&mut self) -> bool {
4374 use crate::utils::{cache_skip_browser, get_cached_url};
4375
4376 self.configuration.configure_budget();
4378
4379 if !self.single_page() {
4380 return false;
4381 }
4382
4383 let cache_options = self.configuration.get_cache_options();
4384 if !cache_skip_browser(&cache_options) {
4385 return false;
4386 }
4387
4388 let target_url = self.url.inner().to_string();
4389
4390 if let Some(html) = get_cached_url(
4391 &target_url,
4392 cache_options.as_ref(),
4393 &self.configuration.cache_policy,
4394 )
4395 .await
4396 {
4397 self.status = CrawlStatus::Active;
4398 let page_response = crate::utils::build_cached_html_page_response(&target_url, &html);
4399 let page = build(&target_url, page_response);
4400 self.initial_status_code = page.status_code;
4401 self.initial_html_length = page.get_html_bytes_u8().len();
4402 self.links_visited
4403 .insert(CaseInsensitiveString::from(target_url.as_str()));
4404 channel_send_page(&self.channel, page, &self.channel_guard);
4405 self.subscription_guard().await;
4406 return true;
4407 }
4408
4409 false
4410 }
4411
    /// No-op cache shortcut when no cache feature is compiled in; always
    /// reports that the crawl must proceed over the network.
    #[cfg(not(any(
        feature = "cache",
        feature = "cache_mem",
        feature = "chrome_remote_cache"
    )))]
    async fn try_cache_shortcircuit(&mut self) -> bool {
        false
    }
4421
    /// Attempt to serve an entire crawl from the HTTP cache before touching
    /// the network. Returns `true` when every reachable page was cached (the
    /// crawl is complete); cache misses are queued on `extra_links` and
    /// `false` is returned so the live crawl can pick them up.
    #[cfg(any(
        feature = "cache",
        feature = "cache_mem",
        feature = "chrome_remote_cache"
    ))]
    async fn crawl_cache_phase(&mut self, _client: &Client) -> bool {
        use crate::utils::{build_cached_html_page_response, cache_skip_browser, get_cached_url};

        let cache_options = self.configuration.get_cache_options();
        if !cache_skip_browser(&cache_options) {
            return false;
        }

        self.configuration.configure_budget();
        self.configuration.configure_allowlist();

        let target_url = self.url.inner().to_string();

        // The root page must be cached, otherwise fall back to a live crawl.
        let html = match get_cached_url(
            &target_url,
            cache_options.as_ref(),
            &self.configuration.cache_policy,
        )
        .await
        {
            Some(h) => h,
            None => return false, };

        self.status = CrawlStatus::Active;
        let selectors = self.setup_selectors();
        let full_resources = self.configuration.full_resources;
        let return_page_links = self.configuration.return_page_links;
        let normalize = self.configuration.normalize;

        let page_response = build_cached_html_page_response(&target_url, &html);
        let mut page = build(&target_url, page_response);

        if !self.configuration.external_domains_caseless.is_empty() {
            page.set_external(self.configuration.external_domains_caseless.clone());
        }
        page.set_url_parsed_direct();
        if return_page_links {
            page.page_links = Some(Default::default());
        }

        // Extract links from the cached HTML; the page base is taken for the
        // call and cleared afterwards.
        let page_base = page.base.take().map(Box::new);
        let mut links: HashSet<CaseInsensitiveString> = if full_resources {
            page.links_full(&selectors, &page_base).await
        } else {
            page.links(&selectors, &page_base).await
        };
        page.base = None;

        // Normalization hashes the HTML for duplicate-content detection.
        if normalize {
            page.signature
                .replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
        }

        self.initial_status_code = page.status_code;
        self.initial_html_length = page.get_html_bytes_u8().len();

        let url = match &self.on_link_find_callback {
            Some(cb) => cb(*self.url.clone(), None).0,
            _ => *self.url.clone(),
        };
        self.insert_link(url).await;
        self.links_visited
            .insert(CaseInsensitiveString::from(target_url.as_str()));

        emit_log(&target_url);

        // A disallowed duplicate signature still forwards the page but ends
        // the cache phase as complete.
        if normalize {
            if let Some(sig) = page.signature {
                if !self.is_signature_allowed(sig).await {
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    self.subscription_guard().await;
                    return true;
                }
                self.insert_signature(sig).await;
            }
        }

        self.set_crawl_initial_status(&page, &links);

        if let Some(ref cb) = self.on_should_crawl_callback {
            if !cb.call(&page) {
                page.blocked_crawl = true;
                channel_send_page(&self.channel, page, &self.channel_guard);
                self.subscription_guard().await;
                return true; }
        }

        channel_send_page(&self.channel, page, &self.channel_guard);

        if self.single_page() {
            self.subscription_guard().await;
            return true;
        }

        // Breadth-first walk of the cached link graph. Any link without a
        // cached copy is recorded as a miss.
        let mut cache_misses: HashSet<CaseInsensitiveString> = HashSet::new();

        'cache_loop: loop {
            let current_links: Vec<CaseInsensitiveString> = links.drain().collect();
            if current_links.is_empty() {
                break;
            }

            for link in current_links {
                let allowed = self.is_allowed(&link);
                if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                    break 'cache_loop;
                }
                if allowed.eq(&ProcessLinkStatus::Blocked) {
                    continue;
                }

                let link_url = link.inner().to_string();

                match get_cached_url(
                    &link_url,
                    cache_options.as_ref(),
                    &self.configuration.cache_policy,
                )
                .await
                {
                    Some(cached_html) => {
                        emit_log(&link_url);
                        self.insert_link(link.clone()).await;

                        let page_response =
                            build_cached_html_page_response(&link_url, &cached_html);
                        let mut page = build(&link_url, page_response);

                        if !self.configuration.external_domains_caseless.is_empty() {
                            page.set_external(self.configuration.external_domains_caseless.clone());
                        }
                        page.set_url_parsed_direct();
                        if return_page_links {
                            page.page_links = Some(Default::default());
                        }

                        let page_base = page.base.take().map(Box::new);
                        let new_links = if full_resources {
                            page.links_full(&selectors, &page_base).await
                        } else {
                            page.links(&selectors, &page_base).await
                        };
                        page.base = None;

                        // Skip pages whose content hash was already seen.
                        if normalize {
                            page.signature
                                .replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                            if let Some(sig) = page.signature {
                                if !self.is_signature_allowed(sig).await {
                                    continue;
                                }
                                self.insert_signature(sig).await;
                            }
                        }

                        if let Some(ref cb) = self.on_should_crawl_callback {
                            if !cb.call(&page) {
                                page.blocked_crawl = true;
                                channel_send_page(&self.channel, page, &self.channel_guard);
                                continue;
                            }
                        }

                        channel_send_page(&self.channel, page, &self.channel_guard);
                        links.extend(new_links);
                    }
                    None => {
                        cache_misses.insert(link);
                    }
                }
            }
        }

        // Misses are re-queued so the live crawl can fetch them.
        if !cache_misses.is_empty() {
            self.extra_links.extend(cache_misses);
            return false; }

        self.subscription_guard().await;
        true }
4621
    /// Cache phase is a no-op when no cache feature is compiled in; always
    /// proceed with a live crawl.
    #[cfg(not(any(
        feature = "cache",
        feature = "cache_mem",
        feature = "chrome_remote_cache"
    )))]
    async fn crawl_cache_phase(&mut self, _client: &Client) -> bool {
        false
    }
4631
4632 pub async fn crawl(&mut self) {
4634 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4635 self.start();
4636 if self.try_cache_shortcircuit().await {
4637 self.set_crawl_status();
4638 return;
4639 }
4640 let (client, handle) = self.setup().await;
4641 let (handle, join_handle) = match handle {
4642 Some(h) => (Some(h.0), Some(h.1)),
4643 _ => (None, None),
4644 };
4645 self.crawl_concurrent(&client, &handle).await;
4646 self.sitemap_crawl_chain(&client, &handle, false).await;
4647 self.set_crawl_status();
4648 if let Some(h) = join_handle {
4649 h.abort()
4650 }
4651 self.client.replace(client);
4652 }
4653 }
4654
4655 pub async fn crawl_sitemap(&mut self) {
4657 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4658 self.start();
4659 if self.try_cache_shortcircuit().await {
4660 self.set_crawl_status();
4661 return;
4662 }
4663 let (client, handle) = self.setup().await;
4664 let (handle, join_handle) = match handle {
4665 Some(h) => (Some(h.0), Some(h.1)),
4666 _ => (None, None),
4667 };
4668 self.sitemap_crawl(&client, &handle, false).await;
4669 self.set_crawl_status();
4670 if let Some(h) = join_handle {
4671 h.abort()
4672 }
4673 self.client.replace(client);
4674 }
4675 }
4676
4677 #[cfg(all(
4679 feature = "sitemap",
4680 feature = "chrome",
4681 not(feature = "decentralized")
4682 ))]
4683 pub async fn crawl_sitemap_chrome(&mut self) {
4684 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4685 self.start();
4686 let (client, handle) = self.setup().await;
4687 let (handle, join_handle) = match handle {
4688 Some(h) => (Some(h.0), Some(h.1)),
4689 _ => (None, None),
4690 };
4691 self.sitemap_crawl_chrome(&client, &handle, false).await;
4692 self.set_crawl_status();
4693 if let Some(h) = join_handle {
4694 h.abort()
4695 }
4696 self.client.replace(client);
4697 }
4698 }
4699
    /// Prime the website for external send-style crawling: mark it active,
    /// run the async setup (client plus robots handling), and lock in the
    /// allow list so subsequent sends need no reconfiguration.
    pub async fn configure_setup(&mut self) {
        self.status = CrawlStatus::Active;
        self.start();
        self.setup().await;
        self.configuration.configure_allowlist();
        self.send_configured = true;
    }
4708
    /// Prime the website for send-style crawling without consulting
    /// robots.txt; only the synchronous base setup runs.
    pub fn configure_setup_norobots(&mut self) {
        self.status = CrawlStatus::Active;
        self.start();
        self.setup_base();
        self.configuration.configure_allowlist();
        self.send_configured = true;
    }
4718
4719 #[cfg(not(feature = "decentralized"))]
4720 pub async fn crawl_raw_send(&self, url: Option<&str>) {
4725 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4726 let (client, handle) = (
4727 match &self.client {
4728 Some(c) => c.to_owned(),
4729 _ => self.configure_http_client(),
4730 },
4731 self.configure_handler(),
4732 );
4733 let (handle, join_handle) = match handle {
4734 Some(h) => (Some(h.0), Some(h.1)),
4735 _ => (None, None),
4736 };
4737 self.crawl_concurrent_raw_send(&client, &handle, &url).await;
4738 if let Some(h) = join_handle {
4739 h.abort()
4740 }
4741 }
4742 }
4743
4744 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4745 pub async fn crawl_chrome_send(&self, url: Option<&str>) {
4749 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4750 let (client, handle) = (
4751 match &self.client {
4752 Some(c) => c.to_owned(),
4753 _ => self.configure_http_client(),
4754 },
4755 self.configure_handler(),
4756 );
4757 let (handle, join_handle) = match handle {
4758 Some(h) => (Some(h.0), Some(h.1)),
4759 _ => (None, None),
4760 };
4761 self.crawl_concurrent_send(&client, &handle, &url).await;
4762 if let Some(h) = join_handle {
4763 h.abort()
4764 }
4765 }
4766 }
4767
    /// Chrome send is unavailable in decentralized mode; this stub keeps the
    /// API surface consistent across feature combinations.
    #[cfg(all(feature = "chrome", feature = "decentralized"))]
    pub async fn crawl_chrome_send(&self, _url: Option<&str>) {}
4771
4772 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4773 pub async fn fetch_chrome(&self, url: Option<&str>) {
4775 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4776 let (client, handle) = (
4777 match &self.client {
4778 Some(c) => c.to_owned(),
4779 _ => self.configure_http_client(),
4780 },
4781 self.configure_handler(),
4782 );
4783 let (_handle, join_handle) = match handle {
4784 Some(h) => (Some(h.0), Some(h.1)),
4785 _ => (None, None),
4786 };
4787 self._fetch_chrome(&client, &url).await;
4788 if let Some(h) = join_handle {
4789 h.abort()
4790 }
4791 }
4792 }
4793
4794 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4795 pub async fn fetch_chrome_persisted(
4797 &self,
4798 url: Option<&str>,
4799 browser: &crate::features::chrome::BrowserController,
4800 ) {
4801 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4802 let (client, handle) = (
4803 match &self.client {
4804 Some(c) => c.to_owned(),
4805 _ => self.configure_http_client(),
4806 },
4807 self.configure_handler(),
4808 );
4809 let (_handle, join_handle) = match handle {
4810 Some(h) => (Some(h.0), Some(h.1)),
4811 _ => (None, None),
4812 };
4813 self._fetch_chrome_persisted(&client, &url, browser).await;
4814 if let Some(h) = join_handle {
4815 h.abort()
4816 }
4817 }
4818 }
4819
    /// Smart crawl in decentralized mode defers to the standard crawl; any
    /// smart escalation happens on the worker side.
    #[cfg(all(feature = "decentralized", feature = "smart"))]
    pub async fn crawl_smart(&mut self) {
        self.crawl().await;
    }
4825
    /// Without the smart feature, decentralized smart crawling is simply the
    /// standard crawl.
    #[cfg(all(feature = "decentralized", not(feature = "smart")))]
    pub async fn crawl_smart(&mut self) {
        self.crawl().await;
    }
4831
4832 #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
4833 pub async fn crawl_smart(&mut self) {
4835 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4836 self.start();
4837 if self.try_cache_shortcircuit().await {
4838 self.set_crawl_status();
4839 return;
4840 }
4841 let (client, handle) = self.setup().await;
4842 let (handle, join_handle) = match handle {
4843 Some(h) => (Some(h.0), Some(h.1)),
4844 _ => (None, None),
4845 };
4846 self.crawl_concurrent_smart(&client, &handle).await;
4847 self.set_crawl_status();
4848 if let Some(h) = join_handle {
4849 h.abort()
4850 }
4851 self.client.replace(client);
4852 }
4853 }
4854
    /// Without the smart feature compiled in, fall back to the plain crawl.
    #[cfg(all(not(feature = "decentralized"), not(feature = "smart")))]
    pub async fn crawl_smart(&mut self) {
        self.crawl().await
    }
4860
4861 pub async fn crawl_raw(&mut self) {
4863 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4864 self.start();
4865 if self.try_cache_shortcircuit().await {
4866 self.set_crawl_status();
4867 return;
4868 }
4869 let (client, handle) = self.setup().await;
4870 let (handle, join_handle) = match handle {
4871 Some(h) => (Some(h.0), Some(h.1)),
4872 _ => (None, None),
4873 };
4874 self.crawl_concurrent_raw(&client, &handle).await;
4875 self.sitemap_crawl_chain(&client, &handle, false).await;
4876 self.set_crawl_status();
4877 if let Some(h) = join_handle {
4878 h.abort()
4879 }
4880 self.client.replace(client);
4881 }
4882 }
4883
4884 pub async fn scrape(&mut self) {
4886 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4887 let mut w = self.clone();
4888 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4889
4890 if self.pages.is_none() {
4891 self.pages = Some(Vec::new());
4892 }
4893
4894 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4896
4897 let crawl = async move {
4898 w.crawl().await;
4899 w.unsubscribe();
4900 let _ = done_tx.send(());
4902 };
4903
4904 let sub = async {
4905 loop {
4906 tokio::select! {
4907 biased;
4908 _ = &mut done_rx => {
4910 break;
4911 }
4912 result = rx2.recv() => {
4913 if let Ok(page) = result {
4914 if let Some(sid) = page.signature {
4915 self.insert_signature(sid).await;
4916 }
4917 self.insert_link(page.get_url().into()).await;
4918 if let Some(p) = self.pages.as_mut() {
4919 p.push(page);
4920 }
4921 } else {
4922 break;
4923 }
4924 }
4925 }
4926 }
4927 };
4928
4929 tokio::join!(sub, crawl);
4930 self.unsubscribe();
4932 }
4933 }
4934
4935 pub async fn scrape_raw(&mut self) {
4937 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4938 let mut w = self.clone();
4939 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4940
4941 if self.pages.is_none() {
4942 self.pages = Some(Vec::new());
4943 }
4944
4945 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4946
4947 let crawl = async move {
4948 w.crawl_raw().await;
4949 w.unsubscribe();
4950 let _ = done_tx.send(());
4951 };
4952
4953 let sub = async {
4954 loop {
4955 tokio::select! {
4956 biased;
4957 _ = &mut done_rx => break,
4958 result = rx2.recv() => {
4959 if let Ok(page) = result {
4960 if let Some(sid) = page.signature {
4961 self.insert_signature(sid).await;
4962 }
4963 self.insert_link(page.get_url().into()).await;
4964 if let Some(p) = self.pages.as_mut() {
4965 p.push(page);
4966 }
4967 } else {
4968 break;
4969 }
4970 }
4971 }
4972 }
4973 };
4974
4975 tokio::join!(sub, crawl);
4976 self.unsubscribe();
4977 }
4978 }
4979
4980 pub async fn scrape_smart(&mut self) {
4982 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4983 let mut w = self.clone();
4984 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4985
4986 if self.pages.is_none() {
4987 self.pages = Some(Vec::new());
4988 }
4989
4990 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4991
4992 let crawl = async move {
4993 w.crawl_smart().await;
4994 w.unsubscribe();
4995 let _ = done_tx.send(());
4996 };
4997
4998 let sub = async {
4999 loop {
5000 tokio::select! {
5001 biased;
5002 _ = &mut done_rx => break,
5003 result = rx2.recv() => {
5004 if let Ok(page) = result {
5005 if let Some(sid) = page.signature {
5006 self.insert_signature(sid).await;
5007 }
5008 self.insert_link(page.get_url().into()).await;
5009 if let Some(p) = self.pages.as_mut() {
5010 p.push(page);
5011 }
5012 } else {
5013 break;
5014 }
5015 }
5016 }
5017 }
5018 };
5019
5020 tokio::join!(sub, crawl);
5021 self.unsubscribe();
5022 }
5023 }
5024
5025 pub async fn scrape_sitemap(&mut self) {
5027 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
5028 let mut w = self.clone();
5029 let mut rx2 = w.subscribe(0).expect("receiver enabled");
5030
5031 if self.pages.is_none() {
5032 self.pages = Some(Vec::new());
5033 }
5034
5035 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
5036
5037 let crawl = async move {
5038 w.crawl_sitemap().await;
5039 w.unsubscribe();
5040 let _ = done_tx.send(());
5041 };
5042
5043 let sub = async {
5044 loop {
5045 tokio::select! {
5046 biased;
5047 _ = &mut done_rx => break,
5048 result = rx2.recv() => {
5049 if let Ok(page) = result {
5050 if let Some(sid) = page.signature {
5051 self.insert_signature(sid).await;
5052 }
5053 self.insert_link(page.get_url().into()).await;
5054 if let Some(p) = self.pages.as_mut() {
5055 p.push(page);
5056 }
5057 } else {
5058 break;
5059 }
5060 }
5061 }
5062 }
5063 };
5064
5065 tokio::join!(sub, crawl);
5066 self.unsubscribe();
5067 }
5068 }
5069
5070 async fn dequeue(
5072 &mut self,
5073 q: &mut Option<tokio::sync::broadcast::Receiver<String>>,
5074 links: &mut HashSet<CaseInsensitiveString>,
5075 exceeded_budget: &mut bool,
5076 ) {
5077 #[cfg(all(feature = "agent", feature = "serde"))]
5079 if let Some(ref cfgs) = self.configuration.remote_multimodal {
5080 let credits = cfgs
5081 .relevance_credits
5082 .swap(0, std::sync::atomic::Ordering::Relaxed);
5083 for _ in 0..credits {
5084 self.restore_wildcard_budget();
5085 }
5086 }
5087
5088 if let Some(q) = q {
5089 while let Ok(link) = q.try_recv() {
5090 let s = link.into();
5091 let allowed = self.is_allowed_budgetless(&s);
5092
5093 if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
5094 *exceeded_budget = true;
5095 break;
5096 }
5097
5098 if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&s).await {
5099 continue;
5100 }
5101
5102 self.links_visited.extend_with_new_links(links, s);
5103 }
5104 }
5105 }
5106
5107 #[cfg(all(feature = "agent", feature = "serde"))]
5109 async fn apply_url_prefilter(&self, links: &mut HashSet<CaseInsensitiveString>) {
5110 if let Some(ref cfgs) = self.configuration.remote_multimodal {
5111 if cfgs.cfg.url_prefilter && cfgs.cfg.relevance_gate && !links.is_empty() {
5112 *links = crate::features::automation::prefilter_urls(cfgs, links).await;
5113 }
5114 }
5115 }
5116
    /// Concurrent raw (HTTP-only) crawl loop.
    ///
    /// Establishes the first page, then repeatedly drains discovered links into
    /// a [`JoinSet`] of fetch tasks bounded by the shared semaphore, honoring
    /// throttling, budgets, the optional crawl timeout, and the shared queue.
    /// Links left over when the loop exits early are stashed in `extra_links`.
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent_raw(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        self.start();

        // The cache phase may fully satisfy the crawl without network work.
        if self.crawl_cache_phase(client).await {
            return;
        }
        // Extra links carried over from a prior run mean the initial page was
        // already handled.
        if !self.extra_links.is_empty() {
            self.skip_initial = true;
        }

        self.status = CrawlStatus::Active;
        let client_rotator = self.client_rotator.clone();
        #[cfg(feature = "hedge")]
        let hedge_config = self.configuration.hedge.clone();
        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();
        if self.single_page() {
            self._crawl_establish(client, &mut selector, false).await;
        } else {
            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
            let full_resources = self.configuration.full_resources;
            let return_page_links = self.configuration.return_page_links;
            let only_html = self.configuration.only_html && !full_resources;
            #[cfg(any(
                feature = "cache",
                feature = "cache_mem",
                feature = "chrome_remote_cache"
            ))]
            let cache_options_raw = self.configuration.get_cache_options();
            #[cfg(any(
                feature = "cache",
                feature = "cache_mem",
                feature = "chrome_remote_cache"
            ))]
            let cache_policy_raw = self.configuration.cache_policy.clone();
            #[cfg(any(
                feature = "cache",
                feature = "cache_mem",
                feature = "chrome_remote_cache"
            ))]
            let normalize_raw = self.configuration.normalize;
            // Subscribe to the shared link queue, if one is configured.
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

            let (mut interval, throttle) = self.setup_crawl();

            let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

            links.extend(self._crawl_establish(client, &mut selector, false).await);

            self.configuration.configure_allowlist();

            let semaphore = self.setup_semaphore();

            // Shared state moved into every fetch task:
            // 0 client, 1 selectors, 2 channel, 3 external domains,
            // 4 channel guard, 5 retry count, 6 full_resources,
            // 7 link-build settings, 8 parsed domain, 9 on_link_find callback,
            // 10 remote multimodal config.
            let shared = Arc::new((
                client.to_owned(),
                selector,
                self.channel.clone(),
                self.configuration.external_domains_caseless.clone(),
                self.channel_guard.clone(),
                self.configuration.retry,
                self.configuration.full_resources,
                PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                self.domain_parsed.clone(),
                self.on_link_find_callback.clone(),
                self.configuration.remote_multimodal.clone(),
            ));

            // Each task returns the links it found plus an optional signature.
            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

            let mut exceeded_budget = false;
            let concurrency = throttle.is_zero();

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if !concurrency && !links.is_empty() {
                tokio::time::sleep(*throttle).await;
            }

            // Only track elapsed time when a crawl timeout is configured.
            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                Some(Instant::now())
            } else {
                None
            };

            'outer: loop {
                #[cfg(all(feature = "agent", feature = "serde"))]
                self.apply_url_prefilter(&mut links).await;

                let mut stream =
                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

                loop {
                    if !concurrency {
                        tokio::time::sleep(*throttle).await;
                    }

                    let semaphore =
                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                    tokio::select! {
                        biased;
                        // Pull the next link only while permits remain and the
                        // crawl timeout has not expired.
                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            // handle_process returns false on shutdown; the
                            // closure tears down in-flight tasks first.
                            if !self.handle_process(handle, &mut interval, async {
                                emit_log_shutdown(link.inner());
                                let permits = set.len();
                                set.shutdown().await;
                                semaphore.add_permits(permits);
                            }).await {
                                // Preserve unvisited links for a later resume.
                                while let Some(links) = stream.next().await {
                                    self.extra_links.insert(links);
                                }
                                break 'outer;
                            }
                            let allowed = self.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                exceeded_budget = true;
                                break;
                            }

                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                continue;
                            }

                            emit_log(link.inner());

                            self.insert_link(link.clone()).await;

                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                let shared = shared.clone();
                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                let rotator = client_rotator.clone();
                                #[cfg(feature = "hedge")]
                                let hedge_cfg = hedge_config.clone();
                                #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                let cache_opts = cache_options_raw.clone();
                                #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                let cache_pol = cache_policy_raw.clone();
                                #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                let normalize = normalize_raw;
                                spawn_set("page_fetch", &mut set, async move {
                                    // Allow the link-find callback to rewrite the link.
                                    let link_result = match &shared.9 {
                                        Some(cb) => cb(link, None),
                                        _ => (link, None),
                                    };

                                    let target_url = link_result.0.as_ref();

                                    // Cache fast-path: serve and parse a cached
                                    // HTML body without any network request.
                                    #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                    {
                                        use crate::utils::{cache_skip_browser, get_cached_url, build_cached_html_page_response};
                                        if cache_skip_browser(&cache_opts) {
                                            if let Some(html) = get_cached_url(target_url, cache_opts.as_ref(), &cache_pol).await {
                                                let page_response = build_cached_html_page_response(target_url, &html);
                                                let mut page = build(target_url, page_response);

                                                if !shared.3.is_empty() {
                                                    page.set_external(shared.3.clone());
                                                }
                                                page.set_url_parsed_direct();
                                                if return_page_links {
                                                    page.page_links = Some(Default::default());
                                                }
                                                let page_base = page.base.take().map(Box::new);
                                                let links = if full_resources {
                                                    page.links_full(&shared.1, &page_base).await
                                                } else {
                                                    page.links(&shared.1, &page_base).await
                                                };
                                                page.base = None;
                                                if normalize {
                                                    page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                                }
                                                if let Some(ref cb) = on_should_crawl_callback {
                                                    if !cb.call(&page) {
                                                        page.blocked_crawl = true;
                                                        channel_send_page(&shared.2, page, &shared.4);
                                                        drop(permit);
                                                        return Default::default();
                                                    }
                                                }
                                                let signature = page.signature;
                                                channel_send_page(&shared.2, page, &shared.4);
                                                drop(permit);
                                                return (links, signature);
                                            }
                                        }
                                    }

                                    let external_domains_caseless = &shared.3;

                                    // Hedged fetch: race a delayed duplicate
                                    // request on a second client when enabled
                                    // and more than one client is available.
                                    #[cfg(feature = "hedge")]
                                    let (mut page, mut links, mut links_pages) = {
                                        let should_hedge = if let Some(ref hcfg) = hedge_cfg {
                                            hcfg.enabled && rotator.as_ref().map_or(false, |r| r.len() > 1)
                                        } else {
                                            false
                                        };

                                        if should_hedge {
                                            let hcfg = hedge_cfg.as_ref().unwrap();
                                            let rot = rotator.as_ref().unwrap();
                                            let (primary_client, hedge_client_opt) = rot.next_pair();

                                            if let Some(hedge_client) = hedge_client_opt {
                                                let delay = hcfg.delay;

                                                let primary_fut = async {
                                                    let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                    let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                    let mut selectors = shared.1.clone();
                                                    let mut r_settings = shared.7;
                                                    r_settings.ssg_build = true;
                                                    let mut domain_parsed = None;
                                                    let page = Page::new_page_streaming(
                                                        target_url, primary_client, only_html,
                                                        &mut selectors, external_domains_caseless,
                                                        &r_settings, &mut links, None, &shared.8,
                                                        &mut domain_parsed, &mut links_pages).await;
                                                    (page, links, links_pages)
                                                };

                                                tokio::pin!(primary_fut);

                                                tokio::select! {
                                                    biased;
                                                    result = &mut primary_fut => result,
                                                    // Primary is slow: fire the hedge and
                                                    // take whichever request finishes first.
                                                    _ = tokio::time::sleep(delay) => {
                                                        log::info!("[hedge] fired after {}ms url={}", delay.as_millis(), target_url);

                                                        let hedge_fut = async {
                                                            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                            let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                            let mut selectors = shared.1.clone();
                                                            let mut r_settings = shared.7;
                                                            r_settings.ssg_build = true;
                                                            let mut domain_parsed = None;
                                                            let page = Page::new_page_streaming(
                                                                target_url, hedge_client, only_html,
                                                                &mut selectors, external_domains_caseless,
                                                                &r_settings, &mut links, None, &shared.8,
                                                                &mut domain_parsed, &mut links_pages).await;
                                                            (page, links, links_pages)
                                                        };

                                                        tokio::pin!(hedge_fut);

                                                        tokio::select! {
                                                            biased;
                                                            result = &mut primary_fut => {
                                                                log::info!("[hedge] winner: primary url={}", target_url);
                                                                result
                                                            }
                                                            result = &mut hedge_fut => {
                                                                log::info!("[hedge] winner: hedge url={}", target_url);
                                                                result
                                                            }
                                                        }
                                                    }
                                                }
                                            } else {
                                                let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                let mut selectors = shared.1.clone();
                                                let mut r_settings = shared.7;
                                                r_settings.ssg_build = true;
                                                let mut domain_parsed = None;
                                                let page = Page::new_page_streaming(
                                                    target_url, primary_client, only_html,
                                                    &mut selectors, external_domains_caseless,
                                                    &r_settings, &mut links, None, &shared.8,
                                                    &mut domain_parsed, &mut links_pages).await;
                                                (page, links, links_pages)
                                            }
                                        } else {
                                            let client = match &rotator {
                                                Some(r) => r.next(),
                                                None => &shared.0,
                                            };
                                            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                            let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                            let mut selectors = shared.1.clone();
                                            let mut r_settings = shared.7;
                                            r_settings.ssg_build = true;
                                            let mut domain_parsed = None;
                                            let page = Page::new_page_streaming(
                                                target_url, client, only_html,
                                                &mut selectors, external_domains_caseless,
                                                &r_settings, &mut links, None, &shared.8,
                                                &mut domain_parsed, &mut links_pages).await;
                                            (page, links, links_pages)
                                        }
                                    };

                                    // Plain single-client fetch when hedging is compiled out.
                                    #[cfg(not(feature = "hedge"))]
                                    let (mut page, mut links, mut links_pages) = {
                                        let client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };
                                        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                        let mut links_pages = if return_page_links {
                                            Some(links.clone())
                                        } else {
                                            None
                                        };
                                        let mut relative_selectors = shared.1.clone();
                                        let mut r_settings = shared.7;
                                        r_settings.ssg_build = true;
                                        let mut domain_parsed = None;
                                        let page = Page::new_page_streaming(
                                            target_url,
                                            client, only_html,
                                            &mut relative_selectors,
                                            external_domains_caseless,
                                            &r_settings,
                                            &mut links,
                                            None,
                                            &shared.8,
                                            &mut domain_parsed,
                                            &mut links_pages).await;
                                        (page, links, links_pages)
                                    };

                                    let mut retry_count = shared.5;

                                    // Retry loop; gateway timeouts get a bounded backoff window.
                                    while page.should_retry && retry_count > 0 {
                                        retry_count -= 1;

                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }

                                        let retry_client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };

                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                let mut domain_parsed = None;
                                                let mut retry_r_settings = shared.7;
                                                retry_r_settings.ssg_build = true;
                                                let next_page = Page::new_page_streaming(
                                                    target_url,
                                                    retry_client, only_html,
                                                    &mut shared.1.clone(),
                                                    external_domains_caseless,
                                                    &retry_r_settings,
                                                    &mut links,
                                                    None,
                                                    &shared.8,
                                                    &mut domain_parsed,
                                                    &mut links_pages).await;

                                                page.clone_from(&next_page);
                                            }).await
                                            {
                                                log::warn!("Handler timeout exceeded {elasped}");
                                            }
                                        } else {
                                            let mut domain_parsed = None;
                                            let mut retry_r_settings = shared.7;
                                            retry_r_settings.ssg_build = true;
                                            page.clone_from(&Page::new_page_streaming(
                                                target_url,
                                                retry_client,
                                                only_html,
                                                &mut shared.1.clone(),
                                                external_domains_caseless,
                                                &retry_r_settings,
                                                &mut links,
                                                None,
                                                &shared.8,
                                                &mut domain_parsed,
                                                &mut links_pages).await);
                                        }
                                    }

                                    if return_page_links {
                                        page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
                                    }

                                    // Optional remote multimodal extraction over the fetched HTML.
                                    #[cfg(all(feature = "agent", feature = "serde"))]
                                    if shared.10.is_some() {
                                        let html = page.get_html();
                                        if !html.is_empty() {
                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
                                                &shared.10,
                                                &html,
                                                target_url,
                                                title,
                                            ).await {
                                                match page.remote_multimodal_usage.as_mut() {
                                                    Some(v) => v.push(result.usage.clone()),
                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
                                                }
                                                if result.extracted.is_some() || result.screenshot.is_some() {
                                                    let automation_result = result.to_automation_results();
                                                    match page.extra_remote_multimodal_data.as_mut() {
                                                        Some(v) => v.push(automation_result),
                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    if let Some(ref cb) = on_should_crawl_callback {
                                        if !cb.call(&page) {
                                            page.blocked_crawl = true;
                                            channel_send_page(&shared.2, page, &shared.4);
                                            drop(permit);
                                            return Default::default()
                                        }
                                    }

                                    let signature = page.signature;

                                    channel_send_page(&shared.2, page, &shared.4);

                                    drop(permit);

                                    (links, signature)
                                });
                            }

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        },
                        // Fold finished task results back into the link frontier,
                        // deduplicating by page signature when available.
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res.1 {
                                    Some(signature) => {
                                        if self.is_signature_allowed(signature).await {
                                            self.insert_signature(signature).await;
                                            self.links_visited.extend_links(&mut links, res.0);
                                        }
                                    }
                                    _ => {
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                            } else {
                                break;
                            }
                        }
                        else => break,
                    }

                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                    if links.is_empty() && set.is_empty() || exceeded_budget {
                        if exceeded_budget {
                            // Preserve all remaining work for a later resume.
                            while let Some(links) = stream.next().await {
                                self.extra_links.insert(links);
                            }
                            while let Some(links) = set.join_next().await {
                                if let Ok(links) = links {
                                    self.extra_links.extend(links.0);
                                }
                            }
                        }
                        break 'outer;
                    }
                }

                self.subscription_guard().await;
                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                if links.is_empty() && set.is_empty() {
                    break;
                }
            }

            if !links.is_empty() {
                self.extra_links.extend(links);
            }
        }
    }
5620
    /// Concurrent crawl using a chrome browser for rendering.
    ///
    /// Boots a browser, establishes the first page through it, then drives the
    /// same frontier/JoinSet/semaphore loop as the raw crawler with each fetch
    /// running through a fresh browser page (plus optional request interception
    /// and cache fast-path). The browser is always disposed on exit.
    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        use crate::features::chrome::attempt_navigation;
        self.start();

        // The cache phase may fully satisfy the crawl without network work.
        if self.crawl_cache_phase(client).await {
            return;
        }
        // Extra links carried over from a prior run mean the initial page was
        // already handled.
        if !self.extra_links.is_empty() {
            self.skip_initial = true;
        }

        match self.setup_browser().await {
            Some(mut b) => {
                // Probe the browser with a blank navigation before crawling.
                match attempt_navigation(
                    "about:blank",
                    &b.browser.0,
                    &self.configuration.request_timeout,
                    &b.browser.2,
                    &self.configuration.viewport,
                )
                .await
                {
                    Ok(new_page) => {
                        let mut selectors = self.setup_selectors();
                        self.status = CrawlStatus::Active;

                        if self.single_page() {
                            self.crawl_establish(client, &mut selectors, false, &new_page)
                                .await;
                            drop(new_page);
                            self.subscription_guard().await;
                            b.dispose();
                        } else {
                            let semaphore: Arc<Semaphore> = self.setup_semaphore();
                            let (mut interval, throttle) = self.setup_crawl();

                            // Subscribe to the shared link queue, if configured.
                            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

                            let base_links = self
                                .crawl_establish(client, &mut selectors, false, &new_page)
                                .await;

                            drop(new_page);

                            let mut links: HashSet<CaseInsensitiveString> =
                                self.drain_extra_links().collect();

                            links.extend(base_links);

                            self.configuration.configure_allowlist();

                            // Each task returns discovered links plus an optional signature.
                            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
                                JoinSet::new();

                            // Shared state moved into every fetch task:
                            // 0 client, 1 selectors, 2 channel, 3 external domains,
                            // 4 channel guard, 5 browser handle, 6 configuration,
                            // 7 root url string, 8 browser context, 9 parsed domain,
                            // 10 on_link_find callback.
                            let shared = Arc::new((
                                client.to_owned(),
                                selectors,
                                self.channel.clone(),
                                self.configuration.external_domains_caseless.clone(),
                                self.channel_guard.clone(),
                                b.browser.0.clone(),
                                self.configuration.clone(),
                                self.url.inner().to_string(),
                                b.browser.2.clone(),
                                self.domain_parsed.clone(),
                                self.on_link_find_callback.clone(),
                            ));

                            let add_external = !shared.3.is_empty();
                            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
                            let full_resources = self.configuration.full_resources;
                            let return_page_links = self.configuration.return_page_links;
                            let mut exceeded_budget = false;
                            let concurrency = throttle.is_zero();

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                            if !concurrency && !links.is_empty() {
                                tokio::time::sleep(*throttle).await;
                            }

                            // Only track elapsed time when a crawl timeout is configured.
                            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                                Some(Instant::now())
                            } else {
                                None
                            };

                            'outer: loop {
                                #[cfg(all(feature = "agent", feature = "serde"))]
                                self.apply_url_prefilter(&mut links).await;

                                let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
                                    links.drain().collect(),
                                );

                                loop {
                                    if !concurrency {
                                        tokio::time::sleep(*throttle).await;
                                    }

                                    let semaphore =
                                        get_semaphore(&semaphore, !self.configuration.shared_queue)
                                            .await;

                                    tokio::select! {
                                        biased;
                                        // Pull the next link while permits remain and the
                                        // crawl timeout has not expired.
                                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                                            // handle_process returns false on shutdown; the
                                            // closure tears down in-flight tasks first.
                                            if !self
                                                .handle_process(
                                                    handle,
                                                    &mut interval,
                                                    async {
                                                        emit_log_shutdown(link.inner());
                                                        let permits = set.len();
                                                        set.shutdown().await;
                                                        semaphore.add_permits(permits);
                                                    },
                                                )
                                                .await
                                            {
                                                break 'outer;
                                            }

                                            let allowed = self.is_allowed(&link);

                                            if allowed
                                                .eq(&ProcessLinkStatus::BudgetExceeded)
                                            {
                                                exceeded_budget = true;
                                                break;
                                            }
                                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                                continue;
                                            }

                                            emit_log(link.inner());

                                            self.insert_link(link.clone()).await;

                                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                                let shared = shared.clone();
                                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                                spawn_set("page_fetch", &mut set, async move {
                                                    // Allow the link-find callback to rewrite the link.
                                                    let link_result =
                                                        match &shared.10 {
                                                            Some(cb) => cb(link, None),
                                                            _ => (link, None),
                                                        };

                                                    let target_url_string = link_result.0.as_ref().to_string();

                                                    // Cache fast-path: serve and parse a cached
                                                    // HTML body without launching a browser page.
                                                    #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                                    {
                                                        use crate::utils::{cache_skip_browser, get_cached_url, build_cached_html_page_response};
                                                        let cache_options = shared.6.get_cache_options();
                                                        if cache_skip_browser(&cache_options) {
                                                            if let Some(html) = get_cached_url(&target_url_string, cache_options.as_ref(), &shared.6.cache_policy).await {
                                                                let page_response = build_cached_html_page_response(&target_url_string, &html);
                                                                let mut page = build(&target_url_string, page_response);

                                                                if add_external {
                                                                    page.set_external(shared.3.clone());
                                                                }
                                                                page.set_url_parsed_direct();
                                                                let page_base = page.base.take().map(Box::new);
                                                                if return_page_links {
                                                                    page.page_links = Some(Default::default());
                                                                }
                                                                let links = if full_resources {
                                                                    page.links_full(&shared.1, &page_base).await
                                                                } else {
                                                                    page.links(&shared.1, &page_base).await
                                                                };
                                                                page.base = None;
                                                                if shared.6.normalize {
                                                                    page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                                                }
                                                                if let Some(ref cb) = on_should_crawl_callback {
                                                                    if !cb.call(&page) {
                                                                        page.blocked_crawl = true;
                                                                        channel_send_page(&shared.2, page, &shared.4);
                                                                        drop(permit);
                                                                        return Default::default();
                                                                    }
                                                                }
                                                                let signature = page.signature;
                                                                channel_send_page(&shared.2, page, &shared.4);
                                                                drop(permit);
                                                                return (links, signature);
                                                            }
                                                        }
                                                    }

                                                    // Open a fresh browser page per fetch.
                                                    let results = match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8, &shared.6.viewport).await {
                                                        Ok(new_page) => {
                                                            let (_, intercept_handle) = tokio::join!(
                                                                crate::features::chrome::setup_chrome_events(&new_page, &shared.6),
                                                                crate::features::chrome::setup_chrome_interception_base(
                                                                    &new_page,
                                                                    shared.6.chrome_intercept.enabled,
                                                                    &shared.6.auth_challenge_response,
                                                                    shared.6.chrome_intercept.block_visuals,
                                                                    &shared.7,
                                                                )
                                                            );

                                                            let target_url = target_url_string.as_str();

                                                            let mut page = Page::new(
                                                                target_url,
                                                                &shared.0,
                                                                &new_page,
                                                                &shared.6.wait_for,
                                                                &shared.6.screenshot,
                                                                false,
                                                                &shared.6.openai_config,
                                                                &shared.6.execution_scripts,
                                                                &shared.6.automation_scripts,
                                                                &shared.6.viewport,
                                                                &shared.6.request_timeout,
                                                                &shared.6.track_events,
                                                                shared.6.referer.clone(),
                                                                shared.6.max_page_bytes,
                                                                shared.6.get_cache_options(),
                                                                &shared.6.cache_policy,
                                                                &shared.6.remote_multimodal,
                                                            )
                                                            .await;

                                                            let mut retry_count = shared.6.retry;

                                                            // Retry loop; gateway timeouts get a
                                                            // bounded backoff window.
                                                            while page.should_retry && retry_count > 0 {
                                                                retry_count -= 1;
                                                                if let Some(timeout) = page.get_timeout() {
                                                                    tokio::time::sleep(timeout).await;
                                                                }
                                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                                                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                                        let p = Page::new(
                                                                            target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        ).await;
                                                                        page.clone_from(&p);
                                                                    }).await {
                                                                        log::info!("{target_url} backoff gateway timeout exceeded {elasped}");
                                                                    }
                                                                } else {
                                                                    page.clone_from(
                                                                        &Page::new(
                                                                            target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        )
                                                                        .await,
                                                                    );
                                                                }
                                                            }

                                                            // Let the interception task drain, aborting
                                                            // it if it exceeds the grace period.
                                                            if let Some(h) = intercept_handle {
                                                                let abort_handle = h.abort_handle();
                                                                if let Err(elasped) = tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await {
                                                                    log::warn!("Handler timeout exceeded {elasped}");
                                                                    abort_handle.abort();
                                                                }
                                                            }

                                                            if add_external {
                                                                page.set_external(shared.3.clone());
                                                            }

                                                            // Preserve the prior base so it can be
                                                            // restored after link extraction.
                                                            let prev_domain = page.base.take();

                                                            page.set_url_parsed_direct();
                                                            let page_base = page.base.take().map(Box::new);

                                                            if return_page_links {
                                                                page.page_links = Some(Default::default());
                                                            }

                                                            let links = if full_resources {
                                                                page.links_full(&shared.1, &page_base).await
                                                            } else {
                                                                page.links(&shared.1, &page_base).await
                                                            };

                                                            page.base = prev_domain;

                                                            if shared.6.normalize {
                                                                page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                                            }

                                                            if let Some(ref cb) = on_should_crawl_callback {
                                                                if !cb.call(&page) {
                                                                    page.blocked_crawl = true;
                                                                    channel_send_page(&shared.2, page, &shared.4);
                                                                    drop(permit);
                                                                    return Default::default()
                                                                }
                                                            }

                                                            let signature = page.signature;

                                                            channel_send_page(
                                                                &shared.2, page, &shared.4,
                                                            );

                                                            (links, signature)
                                                        }
                                                        _ => Default::default(),
                                                    };

                                                    drop(permit);

                                                    results
                                                });
                                            }

                                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                        }
                                        // Fold finished task results back into the frontier,
                                        // deduplicating by page signature when available.
                                        Some(result) = set.join_next(), if !set.is_empty() => {
                                            if let Ok(res) = result {
                                                match res.1 {
                                                    Some(signature) => {
                                                        if self.is_signature_allowed(signature).await {
                                                            self.insert_signature(signature).await;
                                                            self.links_visited.extend_links(&mut links, res.0);
                                                        }
                                                    }
                                                    _ => {
                                                        self.links_visited.extend_links(&mut links, res.0);
                                                    }
                                                }
                                            } else{
                                                break
                                            }
                                        }
                                        else => break,
                                    };

                                    if links.is_empty() && set.is_empty() || exceeded_budget {
                                        if exceeded_budget {
                                            // Drain remaining tasks before exiting.
                                            while set.join_next().await.is_some() {}
                                        }
                                        break 'outer;
                                    }
                                }

                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                                if links.is_empty() && set.is_empty() {
                                    break;
                                }
                            }

                            self.subscription_guard().await;
                            b.dispose();
                            if !links.is_empty() {
                                self.extra_links.extend(links);
                            }
                        }
                    }
                    Err(err) => {
                        b.dispose();
                        log::error!("{}", err)
                    }
                }
            }
            _ => log::error!("Chrome initialization failed."),
        }
    }
6032
    /// Concurrent raw (HTTP-only) crawl that operates on a clone of `self` and
    /// returns the crawled `Website`, leaving `self` unmodified.
    ///
    /// * `client` - HTTP client for the establishing request; per-link fetches may
    ///   instead use a client from `client_rotator` when one is configured.
    /// * `handle` - optional shared crawl-state flag polled through `handle_process`
    ///   so the crawl can be paused or shut down externally.
    /// * `url` - optional start-URL override. When the parsed domain already starts
    ///   with `u` only the URL field is swapped (`set_url_only`); otherwise the full
    ///   target is replaced (`set_url`).
    #[cfg(not(feature = "decentralized"))]
    #[cfg_attr(
        all(feature = "tracing", not(feature = "decentralized")),
        tracing::instrument(skip_all)
    )]
    async fn crawl_concurrent_raw_send(
        &self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        url: &Option<&str>,
    ) -> Website {
        // (base selector, subdomain/tld selectors, host) used for link extraction.
        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        // All mutation happens on a clone; the clone is the return value.
        let mut website = self.clone();

        if let Some(u) = url {
            match &website.domain_parsed {
                Some(domain_url) => {
                    if domain_url.as_str().starts_with(u) {
                        website.set_url_only(u);
                    } else {
                        website.set_url(u);
                    }
                }
                _ => {
                    website.set_url(u);
                }
            }
        }

        if !website.send_configured {
            website.configure_setup().await;
        }

        if self.single_page() {
            // Single-page mode: establish once and return immediately.
            website._crawl_establish(client, &mut selector, false).await;
            website
        } else {
            let client_rotator = self.client_rotator.clone();
            #[cfg(feature = "hedge")]
            let hedge_config = self.configuration.hedge.clone();
            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
            let full_resources = self.configuration.full_resources;
            let return_page_links = self.configuration.return_page_links;
            // full_resources implies non-HTML assets are wanted, so only_html is off.
            let only_html = self.configuration.only_html && !full_resources;
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

            let (mut interval, throttle) = self.setup_crawl();

            // Seed the frontier with queued extra links plus the establish page's links.
            let mut links: HashSet<CaseInsensitiveString> = website.drain_extra_links().collect();

            links.extend(website._crawl_establish(client, &mut selector, false).await);

            let semaphore = self.setup_semaphore();

            // Read-only state shared with every spawned fetch task:
            // .0 client, .1 selectors, .2 channel, .3 external domains, .4 channel guard,
            // .5 retry count, .6 full_resources, .7 PageLinkBuildSettings, .8 parsed domain,
            // .9 on_link_find_callback, .10 remote multimodal config.
            let shared = Arc::new((
                client.to_owned(),
                selector,
                self.channel.clone(),
                self.configuration.external_domains_caseless.clone(),
                self.channel_guard.clone(),
                self.configuration.retry,
                self.configuration.full_resources,
                PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                self.domain_parsed.clone(),
                self.on_link_find_callback.clone(),
                self.configuration.remote_multimodal.clone(),
            ));

            // Each task resolves to (discovered links, optional page signature).
            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

            let mut exceeded_budget = false;
            // A zero throttle means full concurrency with no inter-request delay.
            let concurrency = throttle.is_zero();

            website
                .dequeue(&mut q, &mut links, &mut exceeded_budget)
                .await;

            if !concurrency && !links.is_empty() {
                tokio::time::sleep(*throttle).await;
            }

            // Baseline instant for the optional crawl_timeout budget.
            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                Some(Instant::now())
            } else {
                None
            };

            'outer: loop {
                #[cfg(all(feature = "agent", feature = "serde"))]
                self.apply_url_prefilter(&mut links).await;

                // Drain the frontier into a stream for this pass.
                let mut stream =
                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

                loop {
                    if !concurrency {
                        tokio::time::sleep(*throttle).await;
                    }

                    let semaphore =
                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                    tokio::select! {
                        biased;
                        // Only pull a new link while permits remain and the crawl
                        // duration budget has not expired.
                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            // On shutdown: abort in-flight tasks and return their permits.
                            if !self.handle_process(handle, &mut interval, async {
                                emit_log_shutdown(link.inner());
                                let permits = set.len();
                                set.shutdown().await;
                                semaphore.add_permits(permits);
                            }).await {
                                break 'outer;
                            }
                            let allowed = website.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                exceeded_budget = true;
                                break;
                            }

                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                continue;
                            }

                            emit_log(link.inner());

                            // Mark visited before spawning to avoid duplicate fetches.
                            website.insert_link(link.clone()).await;

                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                let shared = shared.clone();
                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                let rotator = client_rotator.clone();
                                #[cfg(feature = "hedge")]
                                let hedge_cfg = hedge_config.clone();
                                spawn_set("page_fetch", &mut set, async move {
                                    // Allow the link-find callback to rewrite the target.
                                    let link_result = match &shared.9 {
                                        Some(cb) => cb(link, None),
                                        _ => (link, None),
                                    };

                                    let target_url = link_result.0.as_ref();
                                    let external_domains_caseless = &shared.3;

                                    // Hedged fetch: race a delayed duplicate request on a
                                    // second client when hedging is enabled and more than
                                    // one client is available; first completion wins.
                                    #[cfg(feature = "hedge")]
                                    let (mut page, mut links, mut links_pages) = {
                                        let should_hedge = if let Some(ref hcfg) = hedge_cfg {
                                            hcfg.enabled && rotator.as_ref().map_or(false, |r| r.len() > 1)
                                        } else {
                                            false
                                        };

                                        if should_hedge {
                                            let hcfg = hedge_cfg.as_ref().unwrap();
                                            let rot = rotator.as_ref().unwrap();
                                            let (primary_client, hedge_client_opt) = rot.next_pair();

                                            if let Some(hedge_client) = hedge_client_opt {
                                                let delay = hcfg.delay;

                                                let primary_fut = async {
                                                    let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                    let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                    let mut selectors = shared.1.clone();
                                                    let mut r_settings = shared.7;
                                                    r_settings.ssg_build = true;
                                                    let mut domain_parsed = None;
                                                    let page = Page::new_page_streaming(
                                                        target_url, primary_client, only_html,
                                                        &mut selectors, external_domains_caseless,
                                                        &r_settings, &mut links, None, &shared.8,
                                                        &mut domain_parsed, &mut links_pages).await;
                                                    (page, links, links_pages)
                                                };

                                                tokio::pin!(primary_fut);

                                                tokio::select! {
                                                    biased;
                                                    // Primary finished before the hedge delay fired.
                                                    result = &mut primary_fut => result,
                                                    _ = tokio::time::sleep(delay) => {
                                                        log::info!("[hedge] fired after {}ms url={}", delay.as_millis(), target_url);

                                                        let hedge_fut = async {
                                                            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                            let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                            let mut selectors = shared.1.clone();
                                                            let mut r_settings = shared.7;
                                                            r_settings.ssg_build = true;
                                                            let mut domain_parsed = None;
                                                            let page = Page::new_page_streaming(
                                                                target_url, hedge_client, only_html,
                                                                &mut selectors, external_domains_caseless,
                                                                &r_settings, &mut links, None, &shared.8,
                                                                &mut domain_parsed, &mut links_pages).await;
                                                            (page, links, links_pages)
                                                        };

                                                        tokio::pin!(hedge_fut);

                                                        // Race primary vs hedge; keep whichever completes first.
                                                        tokio::select! {
                                                            biased;
                                                            result = &mut primary_fut => {
                                                                log::info!("[hedge] winner: primary url={}", target_url);
                                                                result
                                                            }
                                                            result = &mut hedge_fut => {
                                                                log::info!("[hedge] winner: hedge url={}", target_url);
                                                                result
                                                            }
                                                        }
                                                    }
                                                }
                                            } else {
                                                // Only one client pair available: plain primary fetch.
                                                let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                let mut selectors = shared.1.clone();
                                                let mut r_settings = shared.7;
                                                r_settings.ssg_build = true;
                                                let mut domain_parsed = None;
                                                let page = Page::new_page_streaming(
                                                    target_url, primary_client, only_html,
                                                    &mut selectors, external_domains_caseless,
                                                    &r_settings, &mut links, None, &shared.8,
                                                    &mut domain_parsed, &mut links_pages).await;
                                                (page, links, links_pages)
                                            }
                                        } else {
                                            // Hedging disabled: use the rotator (or base client) directly.
                                            let client = match &rotator {
                                                Some(r) => r.next(),
                                                None => &shared.0,
                                            };
                                            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                            let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                            let mut selectors = shared.1.clone();
                                            let mut r_settings = shared.7;
                                            r_settings.ssg_build = true;
                                            let mut domain_parsed = None;
                                            let page = Page::new_page_streaming(
                                                target_url, client, only_html,
                                                &mut selectors, external_domains_caseless,
                                                &r_settings, &mut links, None, &shared.8,
                                                &mut domain_parsed, &mut links_pages).await;
                                            (page, links, links_pages)
                                        }
                                    };

                                    // Non-hedge build: single streaming fetch.
                                    #[cfg(not(feature = "hedge"))]
                                    let (mut page, mut links, mut links_pages) = {
                                        let client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };
                                        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                        let mut links_pages = if return_page_links {
                                            Some(links.clone())
                                        } else {
                                            None
                                        };
                                        let mut relative_selectors = shared.1.clone();
                                        let mut r_settings = shared.7;
                                        r_settings.ssg_build = true;
                                        let mut domain_parsed = None;
                                        let page = Page::new_page_streaming(
                                            target_url,
                                            client, only_html,
                                            &mut relative_selectors,
                                            external_domains_caseless,
                                            &r_settings,
                                            &mut links,
                                            None,
                                            &shared.8,
                                            &mut domain_parsed,
                                            &mut links_pages).await;
                                        (page, links, links_pages)
                                    };

                                    // Retry loop: re-fetch while the page signals retry, up to
                                    // the configured retry count (shared.5).
                                    let mut retry_count = shared.5;

                                    while page.should_retry && retry_count > 0 {
                                        retry_count -= 1;

                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }

                                        let retry_client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };

                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                            // 504s get a bounded backoff window so one slow
                                            // upstream cannot wedge the task indefinitely.
                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                let mut domain_parsed = None;
                                                let mut retry_r_settings = shared.7;
                                                retry_r_settings.ssg_build = true;
                                                let next_page = Page::new_page_streaming(
                                                    target_url,
                                                    retry_client, only_html,
                                                    &mut shared.1.clone(),
                                                    external_domains_caseless,
                                                    &retry_r_settings,
                                                    &mut links,
                                                    None,
                                                    &shared.8,
                                                    &mut domain_parsed,
                                                    &mut links_pages).await;

                                                page.clone_from(&next_page);

                                            }).await
                                            {
                                                log::warn!("Handler timeout exceeded {elasped}");
                                            }

                                        } else {
                                            let mut domain_parsed = None;
                                            let mut retry_r_settings = shared.7;
                                            retry_r_settings.ssg_build = true;
                                            page.clone_from(&Page::new_page_streaming(
                                                target_url,
                                                retry_client,
                                                only_html,
                                                &mut shared.1.clone(),
                                                external_domains_caseless,
                                                &retry_r_settings,
                                                &mut links,
                                                None,
                                                &shared.8,
                                                &mut domain_parsed,
                                                &mut links_pages).await);
                                        }
                                    }

                                    if return_page_links {
                                        // Only attach page links when something was collected.
                                        page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
                                    }

                                    // Optional remote multimodal extraction on the fetched HTML.
                                    #[cfg(all(feature = "agent", feature = "serde"))]
                                    if shared.10.is_some() {
                                        let html = page.get_html();
                                        if !html.is_empty() {
                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
                                                &shared.10,
                                                &html,
                                                target_url,
                                                title,
                                            ).await {
                                                match page.remote_multimodal_usage.as_mut() {
                                                    Some(v) => v.push(result.usage.clone()),
                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
                                                }
                                                if result.extracted.is_some() || result.screenshot.is_some() {
                                                    let automation_result = result.to_automation_results();
                                                    match page.extra_remote_multimodal_data.as_mut() {
                                                        Some(v) => v.push(automation_result),
                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    // Veto hook: the page is still sent to subscribers (marked
                                    // blocked), but no links from it enter the frontier.
                                    if let Some(ref cb) = on_should_crawl_callback {
                                        if !cb.call(&page) {
                                            page.blocked_crawl = true;
                                            channel_send_page(&shared.2, page, &shared.4);
                                            drop(permit);
                                            return Default::default()
                                        }
                                    }

                                    let signature = page.signature;

                                    channel_send_page(&shared.2, page, &shared.4);

                                    drop(permit);

                                    (links, signature)
                                });
                            }

                            website.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        },
                        // Merge finished task output back into the frontier, honoring
                        // the dedup signature when one was produced.
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res.1 {
                                    Some(signature) => {
                                        if website.is_signature_allowed(signature).await {
                                            website.insert_signature(signature).await;
                                            website.links_visited.extend_links(&mut links, res.0);
                                        }
                                    }
                                    _ => {
                                        website.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                            } else {
                                break;
                            }
                        }
                        else => break,
                    }

                    website
                        .dequeue(&mut q, &mut links, &mut exceeded_budget)
                        .await;

                    if links.is_empty() && set.is_empty() || exceeded_budget {
                        if exceeded_budget {
                            // Drain remaining tasks so their pages still get delivered.
                            while set.join_next().await.is_some() {}
                        }
                        break 'outer;
                    }
                }

                website.subscription_guard().await;
                website
                    .dequeue(&mut q, &mut links, &mut exceeded_budget)
                    .await;

                if links.is_empty() && set.is_empty() {
                    break;
                }
            }
            website
        }
    }
6480
    /// Concurrent Chrome-driven crawl that operates on a clone of `self` and
    /// returns the crawled `Website`. Each link gets its own browser tab via
    /// `attempt_navigation`; on any browser/setup failure a clone of `self` is
    /// returned unchanged.
    ///
    /// * `client` - HTTP client passed through to `Page::new` alongside the tab.
    /// * `handle` - optional shared crawl-state flag polled via `handle_process`.
    /// * `url` - optional start-URL override (prefix match keeps the parsed domain).
    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent_send(
        &self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        url: &Option<&str>,
    ) -> Website {
        use crate::features::chrome::attempt_navigation;

        match self.setup_browser().await {
            Some(mut b) => {
                // Warm-up navigation to a blank tab validates the browser session.
                match attempt_navigation(
                    "about:blank",
                    &b.browser.0,
                    &self.configuration.request_timeout,
                    &b.browser.2,
                    &self.configuration.viewport,
                )
                .await
                {
                    Ok(new_page) => {
                        let mut selectors = self.setup_selectors();
                        // Work on a clone; the clone is the return value.
                        let mut website = self.to_owned();

                        if let Some(u) = url {
                            match &website.domain_parsed {
                                Some(domain_url) => {
                                    if domain_url.as_str().starts_with(u) {
                                        website.set_url_only(u);
                                    } else {
                                        website.set_url(u);
                                    }
                                }
                                _ => {
                                    website.set_url(u);
                                }
                            }
                        }

                        if !website.send_configured {
                            website.configure_setup().await;
                        }

                        let base_links = website
                            .crawl_establish(client, &mut selectors, false, &new_page)
                            .await;

                        // The establish tab is no longer needed; each task opens its own.
                        drop(new_page);

                        if self.single_page() {
                            website.subscription_guard().await;
                            b.dispose();
                            website
                        } else {
                            let semaphore: Arc<Semaphore> = self.setup_semaphore();
                            let (mut interval, throttle) = self.setup_crawl();

                            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

                            let mut links: HashSet<CaseInsensitiveString> =
                                *self.extra_links.clone();

                            links.extend(base_links);

                            // Each task resolves to (discovered links, optional signature).
                            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
                                JoinSet::new();

                            // Shared task state: .0 client, .1 selectors, .2 channel,
                            // .3 external domains, .4 channel guard, .5 browser handle,
                            // .6 configuration, .7 root url string, .8 browser context,
                            // .9 parsed domain, .10 on_link_find_callback.
                            let shared = Arc::new((
                                client.to_owned(),
                                selectors,
                                self.channel.clone(),
                                self.configuration.external_domains_caseless.clone(),
                                self.channel_guard.clone(),
                                b.browser.0.clone(),
                                self.configuration.clone(),
                                self.url.inner().to_string(),
                                b.browser.2.clone(),
                                self.domain_parsed.clone(),
                                self.on_link_find_callback.clone(),
                            ));

                            let add_external = !shared.3.is_empty();
                            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
                            let full_resources = self.configuration.full_resources;
                            let return_page_links = self.configuration.return_page_links;
                            let mut exceeded_budget = false;
                            // Zero throttle means no inter-request delay.
                            let concurrency = throttle.is_zero();

                            website
                                .dequeue(&mut q, &mut links, &mut exceeded_budget)
                                .await;

                            if !concurrency && !links.is_empty() {
                                tokio::time::sleep(*throttle).await;
                            }

                            // Baseline instant for the optional crawl_timeout budget.
                            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                                Some(Instant::now())
                            } else {
                                None
                            };

                            'outer: loop {
                                #[cfg(all(feature = "agent", feature = "serde"))]
                                self.apply_url_prefilter(&mut links).await;

                                let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
                                    links.drain().collect(),
                                );

                                loop {
                                    if !concurrency {
                                        tokio::time::sleep(*throttle).await;
                                    }

                                    let semaphore =
                                        get_semaphore(&semaphore, !self.configuration.shared_queue)
                                            .await;

                                    tokio::select! {
                                        biased;
                                        // Only pull a new link while permits remain and the
                                        // crawl duration budget has not expired.
                                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                                            // On shutdown: abort in-flight tasks and return permits.
                                            if !self
                                                .handle_process(
                                                    handle,
                                                    &mut interval,
                                                    async {
                                                        emit_log_shutdown(link.inner());
                                                        let permits = set.len();
                                                        set.shutdown().await;
                                                        semaphore.add_permits(permits);
                                                    },
                                                )
                                                .await
                                            {
                                                break 'outer;
                                            }

                                            let allowed = website.is_allowed(&link);

                                            if allowed
                                                .eq(&ProcessLinkStatus::BudgetExceeded)
                                            {
                                                exceeded_budget = true;
                                                break;
                                            }
                                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                                continue;
                                            }

                                            emit_log(link.inner());

                                            // Mark visited before spawning to avoid duplicates.
                                            website.insert_link(link.clone()).await;

                                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                                let shared = shared.clone();
                                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                                spawn_set("page_fetch", &mut set, async move {
                                                    // Fresh tab per link; failure yields empty results.
                                                    let results = match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8, &shared.6.viewport).await {
                                                        Ok(new_page) => {
                                                            // Configure page events and request interception together.
                                                            let (_, intercept_handle) = tokio::join!(
                                                                crate::features::chrome::setup_chrome_events(&new_page, &shared.6),
                                                                crate::features::chrome::setup_chrome_interception_base(
                                                                    &new_page,
                                                                    shared.6.chrome_intercept.enabled,
                                                                    &shared.6.auth_challenge_response,
                                                                    shared.6.chrome_intercept.block_visuals,
                                                                    &shared.7,
                                                                )
                                                            );

                                                            // Allow the link-find callback to rewrite the target.
                                                            let link_result =
                                                                match &shared.10 {
                                                                    Some(cb) => cb(link, None),
                                                                    _ => (link, None),
                                                                };

                                                            let target_url = link_result.0.as_ref();

                                                            let mut page = Page::new(
                                                                target_url,
                                                                &shared.0,
                                                                &new_page,
                                                                &shared.6.wait_for,
                                                                &shared.6.screenshot,
                                                                false,
                                                                &shared.6.openai_config,
                                                                &shared.6.execution_scripts,
                                                                &shared.6.automation_scripts,
                                                                &shared.6.viewport,
                                                                &shared.6.request_timeout,
                                                                &shared.6.track_events,
                                                                shared.6.referer.clone(),
                                                                shared.6.max_page_bytes,
                                                                shared.6.get_cache_options(),
                                                                &shared.6.cache_policy,
                                                                &shared.6.remote_multimodal,
                                                            )
                                                            .await;

                                                            // Retry while the page signals retry, up to the
                                                            // configured count; 504s get a bounded backoff window.
                                                            let mut retry_count = shared.6.retry;

                                                            while page.should_retry && retry_count > 0 {
                                                                retry_count -= 1;
                                                                if let Some(timeout) = page.get_timeout() {
                                                                    tokio::time::sleep(timeout).await;
                                                                }
                                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                                                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                                        let p = Page::new(
                                                                            target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        ).await;
                                                                        page.clone_from(&p);

                                                                    }).await {
                                                                        log::info!("{target_url} backoff gateway timeout exceeded {elasped}");
                                                                    }
                                                                } else {
                                                                    page.clone_from(
                                                                        &Page::new(
                                                                            target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        )
                                                                        .await,
                                                                    );
                                                                }
                                                            }

                                                            // Give the interception task 10s to finish, then abort it.
                                                            if let Some(h) = intercept_handle {
                                                                let abort_handle = h.abort_handle();
                                                                if let Err(elasped) = tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await {
                                                                    log::warn!("Handler timeout exceeded {elasped}");
                                                                    abort_handle.abort();
                                                                }
                                                            }

                                                            if add_external {
                                                                page.set_external(shared.3.clone());
                                                            }

                                                            // Temporarily swap in the freshly parsed base for link
                                                            // extraction, then restore the original afterwards.
                                                            let prev_domain = page.base.take();

                                                            page.set_url_parsed_direct();
                                                            let page_base = page.base.take().map(Box::new);

                                                            if return_page_links {
                                                                page.page_links = Some(Default::default());
                                                            }

                                                            let links = if full_resources {
                                                                page.links_full(&shared.1, &page_base).await
                                                            } else {
                                                                page.links(&shared.1, &page_base).await
                                                            };

                                                            page.base = prev_domain;

                                                            if shared.6.normalize {
                                                                page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                                            }

                                                            // Veto hook: page is still delivered (marked blocked),
                                                            // but its links never enter the frontier.
                                                            if let Some(ref cb) = on_should_crawl_callback {
                                                                if !cb.call(&page) {
                                                                    page.blocked_crawl = true;
                                                                    channel_send_page(&shared.2, page, &shared.4);
                                                                    drop(permit);
                                                                    return Default::default()
                                                                }
                                                            }

                                                            let signature = page.signature;

                                                            channel_send_page(
                                                                &shared.2, page, &shared.4,
                                                            );

                                                            (links, signature)
                                                        }
                                                        _ => Default::default(),
                                                    };

                                                    drop(permit);

                                                    results
                                                });
                                            }

                                            website.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                        }
                                        // Merge finished task output, honoring dedup signatures.
                                        Some(result) = set.join_next(), if !set.is_empty() => {
                                            if let Ok(res) = result {
                                                match res.1 {
                                                    Some(signature) => {
                                                        if website.is_signature_allowed(signature).await {
                                                            website.insert_signature(signature).await;
                                                            website.links_visited.extend_links(&mut links, res.0);
                                                        }
                                                    }
                                                    _ => {
                                                        website.links_visited.extend_links(&mut links, res.0);
                                                    }
                                                }
                                            } else{
                                                break
                                            }
                                        }
                                        else => break,
                                    };

                                    if links.is_empty() && set.is_empty() || exceeded_budget {
                                        if exceeded_budget {
                                            // Drain remaining tasks so their pages still deliver.
                                            while set.join_next().await.is_some() {}
                                        }
                                        break 'outer;
                                    }
                                }

                                website
                                    .dequeue(&mut q, &mut links, &mut exceeded_budget)
                                    .await;

                                if links.is_empty() && set.is_empty() {
                                    break;
                                }
                            }

                            website.subscription_guard().await;
                            b.dispose();

                            website
                        }
                    }
                    Err(err) => {
                        b.dispose();
                        log::error!("{}", err);
                        self.clone()
                    }
                }
            }
            _ => {
                log::error!("Chrome initialization failed.");
                self.clone()
            }
        }
    }
6862
6863 #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
6865 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6866 async fn _fetch_chrome(&self, client: &Client, url: &Option<&str>) {
6867 use crate::features::chrome::attempt_navigation;
6868
6869 match self.setup_browser().await {
6870 Some(mut b) => {
6871 match attempt_navigation(
6872 "about:blank",
6873 &b.browser.0,
6874 &self.configuration.request_timeout,
6875 &b.browser.2,
6876 &self.configuration.viewport,
6877 )
6878 .await
6879 {
6880 Ok(new_page) => {
6881 let mut selectors = self.setup_selectors();
6882 self.crawl_establish_chrome_one(client, &mut selectors, url, &new_page)
6883 .await;
6884 self.subscription_guard().await;
6885 b.dispose();
6886 }
6887 Err(err) => {
6888 b.dispose();
6889 log::error!("{}", err);
6890 }
6891 }
6892 }
6893 _ => {
6894 log::error!("Chrome initialization failed.");
6895 }
6896 }
6897 }
6898
6899 #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
6901 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6902 async fn _fetch_chrome_persisted(
6903 &self,
6904 client: &Client,
6905 url: &Option<&str>,
6906 b: &crate::features::chrome::BrowserController,
6907 ) {
6908 use crate::features::chrome::attempt_navigation;
6909 match attempt_navigation(
6910 "about:blank",
6911 &b.browser.0,
6912 &self.configuration.request_timeout,
6913 &b.browser.2,
6914 &self.configuration.viewport,
6915 )
6916 .await
6917 {
6918 Ok(new_page) => {
6919 let mut selectors = self.setup_selectors();
6920 self.crawl_establish_chrome_one(client, &mut selectors, url, &new_page)
6921 .await;
6922 self.subscription_guard().await;
6923 }
6924 Err(err) => {
6925 log::error!("{}", err);
6926 }
6927 }
6928 }
6929
    /// Concurrent crawl driven through a WebDriver session (build without
    /// `chrome`). Mutates `self` directly: visited links, signatures, and any
    /// leftover frontier links (pushed back into `extra_links`) all land on `self`.
    ///
    /// * `client` - HTTP client used for the establishing request.
    /// * `handle` - optional shared crawl-state flag polled via `handle_process`.
    #[cfg(all(
        not(feature = "decentralized"),
        not(feature = "chrome"),
        feature = "webdriver"
    ))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent_webdriver(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
    ) {
        self.start();

        match self.setup_webdriver().await {
            Some(mut controller) => {
                let driver = controller.driver();
                let mut selectors = self.setup_selectors();
                self.status = CrawlStatus::Active;

                if self.single_page() {
                    // Single-page mode: establish once and tear down.
                    self.crawl_establish_webdriver_one(client, &mut selectors, &None, driver)
                        .await;
                    self.subscription_guard().await;
                    controller.dispose();
                } else {
                    let semaphore: Arc<Semaphore> = self.setup_semaphore();
                    let (mut interval, throttle) = self.setup_crawl();

                    let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

                    let base_links = self
                        .crawl_establish_webdriver_one(client, &mut selectors, &None, driver)
                        .await;

                    // Seed the frontier with queued extras plus the establish links.
                    let mut links: HashSet<CaseInsensitiveString> =
                        self.drain_extra_links().collect();

                    links.extend(base_links);

                    self.configuration.configure_allowlist();

                    // Per-navigation timeout from the webdriver config, if any.
                    let timeout = self
                        .configuration
                        .webdriver_config
                        .as_ref()
                        .and_then(|c| c.timeout);

                    // Each task resolves to (discovered links, optional signature).
                    let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
                        JoinSet::new();

                    // Shared task state: .0 client, .1 selectors, .2 channel,
                    // .3 external domains, .4 channel guard, .5 driver,
                    // .6 configuration, .7 root url string, .8 parsed domain,
                    // .9 on_link_find_callback, .10 timeout.
                    let shared = Arc::new((
                        client.to_owned(),
                        selectors,
                        self.channel.clone(),
                        self.configuration.external_domains_caseless.clone(),
                        self.channel_guard.clone(),
                        driver.clone(),
                        self.configuration.clone(),
                        self.url.inner().to_string(),
                        self.domain_parsed.clone(),
                        self.on_link_find_callback.clone(),
                        timeout,
                    ));

                    let add_external = !shared.3.is_empty();
                    let on_should_crawl_callback = self.on_should_crawl_callback.clone();
                    let full_resources = self.configuration.full_resources;
                    let return_page_links = self.configuration.return_page_links;
                    let mut exceeded_budget = false;
                    // Zero throttle means no inter-request delay.
                    let concurrency = throttle.is_zero();

                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                    if !concurrency && !links.is_empty() {
                        tokio::time::sleep(*throttle).await;
                    }

                    // Baseline instant for the optional crawl_timeout budget.
                    let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                        Some(Instant::now())
                    } else {
                        None
                    };

                    'outer: loop {
                        #[cfg(all(feature = "agent", feature = "serde"))]
                        self.apply_url_prefilter(&mut links).await;

                        let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
                            links.drain().collect(),
                        );

                        loop {
                            if !concurrency {
                                tokio::time::sleep(*throttle).await;
                            }

                            let semaphore =
                                get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                            tokio::select! {
                                biased;
                                // Only pull a new link while permits remain and the
                                // crawl duration budget has not expired.
                                Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                                    // On shutdown: abort in-flight tasks and return permits.
                                    if !self
                                        .handle_process(
                                            handle,
                                            &mut interval,
                                            async {
                                                emit_log_shutdown(link.inner());
                                                let permits = set.len();
                                                set.shutdown().await;
                                                semaphore.add_permits(permits);
                                            },
                                        )
                                        .await
                                    {
                                        break 'outer;
                                    }

                                    let allowed = self.is_allowed(&link);

                                    if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                        exceeded_budget = true;
                                        break;
                                    }
                                    if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                        continue;
                                    }

                                    emit_log(link.inner());

                                    // Mark visited before spawning to avoid duplicates.
                                    self.insert_link(link.clone()).await;

                                    if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                        let shared = shared.clone();
                                        let on_should_crawl_callback = on_should_crawl_callback.clone();

                                        spawn_set("page_fetch_webdriver", &mut set, async move {
                                            // Allow the link-find callback to rewrite the target.
                                            let link_result = match &shared.9 {
                                                Some(cb) => cb(link, None),
                                                _ => (link, None),
                                            };

                                            let target_url = link_result.0.as_ref();

                                            crate::features::webdriver::setup_driver_events(&shared.5, &shared.6).await;

                                            let mut page = Page::new_page_webdriver(
                                                target_url,
                                                &shared.5,
                                                shared.10,
                                            )
                                            .await;

                                            // Retry while the page signals retry, up to the
                                            // configured count; 504s get a bounded backoff window.
                                            let mut retry_count = shared.6.retry;

                                            while page.should_retry && retry_count > 0 {
                                                retry_count -= 1;
                                                if let Some(timeout_duration) = page.get_timeout() {
                                                    tokio::time::sleep(timeout_duration).await;
                                                }
                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                                    if let Err(elapsed) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                        let p = Page::new_page_webdriver(
                                                            target_url,
                                                            &shared.5,
                                                            shared.10,
                                                        ).await;
                                                        page.clone_from(&p);
                                                    }).await {
                                                        log::info!("{target_url} backoff gateway timeout exceeded {elapsed}");
                                                    }
                                                } else {
                                                    page.clone_from(
                                                        &Page::new_page_webdriver(
                                                            target_url,
                                                            &shared.5,
                                                            shared.10,
                                                        )
                                                        .await,
                                                    );
                                                }
                                            }

                                            if add_external {
                                                page.set_external(shared.3.clone());
                                            }

                                            // Temporarily swap in the freshly parsed base for link
                                            // extraction, then restore the original afterwards.
                                            let prev_domain = page.base.take();

                                            page.set_url_parsed_direct();
                                            let page_base = page.base.take().map(Box::new);

                                            if return_page_links {
                                                page.page_links = Some(Default::default());
                                            }

                                            let links = if full_resources {
                                                page.links_full(&shared.1, &page_base).await
                                            } else {
                                                page.links(&shared.1, &page_base).await
                                            };

                                            page.base = prev_domain;

                                            if shared.6.normalize {
                                                page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                            }

                                            // Veto hook: page is still delivered (marked blocked),
                                            // but its links never enter the frontier.
                                            if let Some(ref cb) = on_should_crawl_callback {
                                                if !cb.call(&page) {
                                                    page.blocked_crawl = true;
                                                    channel_send_page(&shared.2, page, &shared.4);
                                                    drop(permit);
                                                    return Default::default();
                                                }
                                            }

                                            let signature = page.signature;

                                            channel_send_page(&shared.2, page, &shared.4);

                                            drop(permit);

                                            (links, signature)
                                        });
                                    }

                                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                }
                                // Merge finished task output, honoring dedup signatures.
                                Some(result) = set.join_next(), if !set.is_empty() => {
                                    if let Ok(res) = result {
                                        match res.1 {
                                            Some(signature) => {
                                                if self.is_signature_allowed(signature).await {
                                                    self.insert_signature(signature).await;
                                                    self.links_visited.extend_links(&mut links, res.0);
                                                }
                                            }
                                            _ => {
                                                self.links_visited.extend_links(&mut links, res.0);
                                            }
                                        }
                                    } else {
                                        break
                                    }

                                    // NOTE(review): this check duplicates the one directly after
                                    // the select! below; it only makes the arm exit earlier and
                                    // looks redundant — confirm before removing.
                                    if links.is_empty() && set.is_empty() || exceeded_budget {
                                        if exceeded_budget {
                                            while set.join_next().await.is_some() {}
                                        }
                                        break 'outer;
                                    }
                                }
                                else => break,
                            };

                            if links.is_empty() && set.is_empty() || exceeded_budget {
                                if exceeded_budget {
                                    // Drain remaining tasks so their pages still deliver.
                                    while set.join_next().await.is_some() {}
                                }
                                break 'outer;
                            }
                        }

                        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                        if links.is_empty() && set.is_empty() {
                            break;
                        }
                    }

                    self.subscription_guard().await;
                    controller.dispose();

                    // Preserve any never-crawled frontier links for a later run.
                    if !links.is_empty() {
                        self.extra_links.extend(links);
                    }
                }
            }
            None => {
                log::error!("WebDriver initialization failed.");
            }
        }
    }
7218
7219 #[cfg(all(
7221 not(feature = "decentralized"),
7222 not(feature = "chrome"),
7223 feature = "webdriver"
7224 ))]
7225 pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
7226 if self.configuration.webdriver_config.is_some() {
7228 self.crawl_concurrent_webdriver(client, handle).await
7229 } else {
7230 self.crawl_concurrent_raw(client, handle).await
7231 }
7232 }
7233
    /// Start a concurrent crawl using the raw HTTP pipeline.
    ///
    /// This variant is compiled when neither `decentralized`, `chrome`,
    /// nor `webdriver` features are enabled, so the plain client path is
    /// the only available strategy.
    #[cfg(all(
        not(feature = "decentralized"),
        not(feature = "chrome"),
        not(feature = "webdriver")
    ))]
    pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        self.crawl_concurrent_raw(client, handle).await
    }
7243
    /// Start a concurrent crawl by delegating page fetches to decentralized
    /// worker nodes (`SPIDER_WORKER`), merging discovered links back into the
    /// local frontier until it drains or the budget is exceeded.
    #[cfg(feature = "decentralized")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        // Optional external queue of links pushed in by subscribers.
        let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

        self.configuration.configure_allowlist();
        let domain = self.url.inner().as_str();
        let mut interval = Box::pin(tokio::time::interval(Duration::from_millis(10)));
        let throttle = Box::pin(self.get_delay());
        let on_link_find_callback = self.on_link_find_callback.clone();
        // Worker endpoint scheme: defaults to plain "http:" when SPIDER_WORKER is unset.
        let http_worker = std::env::var("SPIDER_WORKER")
            .unwrap_or_else(|_| "http:".to_string())
            .starts_with("http:");

        // Seed the frontier from the entry URL.
        let mut links: HashSet<CaseInsensitiveString> = self
            .crawl_establish(client, &(domain.into(), Default::default()), http_worker)
            .await;

        let mut set: JoinSet<HashSet<CaseInsensitiveString>> = JoinSet::new();
        let mut exceeded_budget = false;

        'outer: loop {
            #[cfg(all(feature = "agent", feature = "serde"))]
            self.apply_url_prefilter(&mut links).await;

            // Drain the current frontier into a throttled stream.
            let stream =
                tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect())
                    .throttle(*throttle);
            tokio::pin!(stream);

            loop {
                match stream.next().await {
                    Some(link) => {
                        // Honor pause/shutdown signals; on shutdown abort in-flight tasks.
                        if !self
                            .handle_process(handle, &mut interval, async {
                                emit_log_shutdown(link.inner());
                                set.shutdown().await;
                            })
                            .await
                        {
                            break 'outer;
                        }

                        let allowed = self.is_allowed(&link);

                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                            exceeded_budget = true;
                            break;
                        }
                        if allowed.eq(&ProcessLinkStatus::Blocked)
                            || !self.is_allowed_disk(&link).await
                        {
                            continue;
                        }

                        emit_log(link.inner());

                        self.insert_link(link.clone()).await;

                        if let Ok(permit) = SEM.acquire().await {
                            let client = client.clone();
                            let on_link_find_callback = on_link_find_callback.clone();

                            spawn_set("page_fetch", &mut set, async move {
                                let link_results = match &on_link_find_callback.clone() {
                                    Some(cb) => cb(link, None),
                                    _ => (link, None),
                                };
                                let link_results = link_results.0.as_ref();
                                // Downgrade https -> http when the worker speaks plain http.
                                let page = Page::new_links_only(
                                    &if http_worker && link_results.starts_with("https") {
                                        link_results.replacen("https", "http", 1).to_string()
                                    } else {
                                        link_results.to_string()
                                    },
                                    &client,
                                )
                                .await;

                                drop(permit);

                                page.links
                            });

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        }
                    }
                    _ => break,
                }
                if exceeded_budget {
                    break;
                }
            }

            // Join all spawned fetches, folding newly discovered links into the frontier.
            while let Some(res) = set.join_next().await {
                if let Ok(msg) = res {
                    self.links_visited.extend_links(&mut links, msg);
                }
            }

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if links.is_empty() || exceeded_budget {
                break;
            }
        }

        // Preserve any unprocessed links for a later run.
        if !links.is_empty() {
            self.extra_links.extend(links);
        }
    }
7357
    /// Warm up the Gemini model by opening a temporary browser page on
    /// `about:blank` and issuing a best-effort warm call.
    #[cfg(all(feature = "chrome", feature = "real_browser"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn warm_up_gemini(&mut self) {
        use crate::features::chrome::attempt_navigation;

        if let Some(mut b) = self.setup_browser().await {
            if let Ok(page) = attempt_navigation(
                "about:blank",
                &b.browser.0,
                &self.configuration.request_timeout,
                &b.browser.2,
                &self.configuration.viewport,
            )
            .await
            {
                // Best-effort: a failed warm-up is intentionally ignored.
                let _ = crate::features::solvers::warm_gemini_model(&page).await;
                // NOTE(review): `b.dispose()` only runs on the successful
                // navigation path — confirm whether a failed navigation
                // should also dispose the browser to avoid leaking it.
                b.dispose();
            }
        }
    }
7379
    /// Crawl the website concurrently in "smart" mode: pages are fetched over
    /// HTTP first and escalated to a headless-chrome render (lazily booted via
    /// a `OnceCell`) on retries. Discovered links feed back into the frontier
    /// until it drains, the budget is exceeded, or the crawl timeout fires.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn crawl_concurrent_smart(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
    ) {
        use tokio::sync::OnceCell;
        self.start();
        self.status = CrawlStatus::Active;
        // Browser is created at most once, on first demand.
        let browser: crate::features::chrome::OnceBrowser = OnceCell::new();

        let mut selectors: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        if self.single_page() {
            // Single-page mode: fetch the entry URL only.
            self.subscription_guard().await;
            self.crawl_establish_smart(&client, &mut selectors, &browser)
                .await;
        } else {
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

            let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

            let (mut interval, throttle) = self.setup_crawl();
            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
            let return_page_links = self.configuration.return_page_links;

            links.extend(
                self.crawl_establish_smart(&client, &mut selectors, &browser)
                    .await,
            );

            self.configuration.configure_allowlist();

            // Each task resolves to (discovered links, optional page signature).
            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();
            let semaphore = self.setup_semaphore();

            // State shared by all spawned fetch tasks; indices referenced as
            // shared.N below: 0 client, 1 selectors, 2 channel, 3 channel guard,
            // 4 configuration, 5 parsed domain, 6 browser cell, 7 link-find
            // callback, 8 cookie jar.
            let shared = Arc::new((
                client.to_owned(),
                selectors,
                self.channel.clone(),
                self.channel_guard.clone(),
                self.configuration.clone(),
                self.domain_parsed.clone(),
                browser,
                self.on_link_find_callback.clone(),
                self.cookie_jar.clone(),
            ));

            let add_external = self.configuration.external_domains_caseless.len() > 0;
            let mut exceeded_budget = false;
            let concurrency = throttle.is_zero();

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if !concurrency && !links.is_empty() {
                tokio::time::sleep(*throttle).await;
            }

            // Anchor point for the optional crawl timeout.
            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                Some(Instant::now())
            } else {
                None
            };

            'outer: loop {
                #[cfg(all(feature = "agent", feature = "serde"))]
                self.apply_url_prefilter(&mut links).await;

                let mut stream =
                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

                loop {
                    if !concurrency {
                        tokio::time::sleep(*throttle).await;
                    }

                    let semaphore =
                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                    tokio::select! {
                        biased;
                        // Pull the next frontier link only while permits are free
                        // and the crawl timeout has not elapsed.
                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            if !self
                                .handle_process(
                                    handle,
                                    &mut interval,
                                    async {
                                        emit_log_shutdown(&link.inner());
                                        // Return the permits held by aborted tasks.
                                        let permits = set.len();
                                        set.shutdown().await;
                                        semaphore.add_permits(permits);

                                    },
                                )
                                .await
                            {
                                break 'outer;
                            }

                            let allowed = self.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                exceeded_budget = true;
                                break;
                            }
                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                continue;
                            }

                            emit_log(&link.inner());
                            self.insert_link(link.clone()).await;

                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                let shared = shared.clone();
                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                spawn_set("page_fetch", &mut set, async move {
                                    let link_result = match &shared.7 {
                                        Some(cb) => cb(link, None),
                                        _ => (link, None),
                                    };

                                    let url = link_result.0.as_ref();
                                    let mut page = Page::new_page_with_cache(
                                        &url,
                                        &shared.0,
                                        shared.4.get_cache_options(),
                                        &shared.4.cache_policy,
                                    )
                                    .await;

                                    let mut retry_count = shared.4.retry;

                                    // Retry loop: alternate between a chrome render
                                    // (on power-of-two counts) and a plain refetch.
                                    while page.should_retry && retry_count > 0 {
                                        retry_count -= 1;

                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }

                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {

                                            // Cap gateway-timeout backoff retries at BACKOFF_MAX_DURATION.
                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                if retry_count.is_power_of_two() {
                                                    Website::render_chrome_page(
                                                        &shared.4, &shared.0,
                                                        &mut page, url,
                                                        &shared.5,
                                                        &shared.6,
                                                    )
                                                    .await;
                                                } else {
                                                    let next_page = Page::new_page_with_cache(
                                                        url,
                                                        &shared.0,
                                                        shared.4.get_cache_options(),
                                                        &shared.4.cache_policy,
                                                    )
                                                    .await;

                                                    page.clone_from(&next_page)
                                                };

                                            }).await
                                            {
                                                log::info!("backoff gateway timeout exceeded {elasped}");
                                            }

                                        } else {

                                            if retry_count.is_power_of_two() {
                                                Website::render_chrome_page(
                                                    &shared.4, &shared.0,
                                                    &mut page, url,
                                                    &shared.5,
                                                    &shared.6,
                                                )
                                                .await;
                                            } else {
                                                page.clone_from(
                                                    &Page::new_page_with_cache(
                                                        url,
                                                        &shared.0,
                                                        shared.4.get_cache_options(),
                                                        &shared.4.cache_policy,
                                                    )
                                                    .await,
                                                );
                                            }
                                        }
                                    }

                                    if add_external {
                                        page.set_external(
                                            shared
                                                .4
                                                .external_domains_caseless
                                                .clone(),
                                        );
                                    }

                                    // Temporarily swap the base so link extraction
                                    // resolves against the fetched URL, then restore.
                                    let prev_domain = page.base.take();

                                    page.set_url_parsed_direct();
                                    let page_base = page.base.take().map(Box::new);

                                    if return_page_links {
                                        page.page_links = Some(Default::default());
                                    }

                                    let (links, bytes_transferred ) = page
                                        .smart_links(
                                            &shared.1, &shared.4, &page_base, &shared.6, Some(&shared.8)
                                        )
                                        .await;

                                    page.base = prev_domain;
                                    page.bytes_transferred = bytes_transferred;

                                    if shared.4.normalize {
                                        page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
                                    }

                                    // Optional remote multimodal extraction pass.
                                    #[cfg(all(feature = "agent", feature = "serde"))]
                                    if shared.4.remote_multimodal.is_some() {
                                        let html = page.get_html();
                                        if !html.is_empty() {
                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
                                                &shared.4.remote_multimodal,
                                                &html,
                                                url,
                                                title,
                                            ).await {
                                                match page.remote_multimodal_usage.as_mut() {
                                                    Some(v) => v.push(result.usage.clone()),
                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
                                                }
                                                if result.extracted.is_some() || result.screenshot.is_some() {
                                                    let automation_result = result.to_automation_results();
                                                    match page.extra_remote_multimodal_data.as_mut() {
                                                        Some(v) => v.push(automation_result),
                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    // User veto: still publish the page, but stop expanding from it.
                                    if let Some(ref cb) = on_should_crawl_callback {
                                        if !cb.call(&page) {
                                            page.blocked_crawl = true;
                                            channel_send_page(&shared.2, page, &shared.3);
                                            drop(permit);
                                            return Default::default()
                                        }
                                    }

                                    let signature = page.signature;

                                    channel_send_page(&shared.2, page, &shared.3);

                                    drop(permit);

                                    (links, signature)
                                });
                            }

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        }
                        // Harvest finished tasks, de-duplicating by signature when present.
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res.1 {
                                    Some(signature) => {
                                        if self.is_signature_allowed(signature).await {
                                            self.insert_signature(signature).await;
                                            self.links_visited.extend_links(&mut links, res.0);
                                        }
                                    }
                                    _ => {
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                            } else{
                                break
                            }
                        }
                        else => break,
                    }

                    if links.is_empty() && set.is_empty() || exceeded_budget {
                        if exceeded_budget {
                            // Let in-flight tasks finish before breaking out.
                            while set.join_next().await.is_some() {}
                        }
                        break 'outer;
                    }
                }

                self.subscription_guard().await;
                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                if links.is_empty() && set.is_empty() {
                    break;
                }
            }

            // Preserve any unprocessed links for a later run.
            if !links.is_empty() {
                self.extra_links.extend(links);
            }
        }
    }
7702
    /// No-op stub: sitemap crawling requires the `sitemap` feature flag.
    #[cfg(not(feature = "sitemap"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn sitemap_crawl(
        &mut self,
        _client: &Client,
        _handle: &Option<Arc<AtomicI8>>,
        _scrape: bool,
    ) {
    }
7713
    /// No-op stub: the sitemap crawl chain requires the `sitemap` feature flag.
    #[cfg(not(feature = "sitemap"))]
    pub async fn sitemap_crawl_chain(
        &mut self,
        _client: &Client,
        _handle: &Option<Arc<AtomicI8>>,
        _scrape: bool,
    ) {
    }
7723
7724 #[cfg(feature = "sitemap")]
7726 pub(crate) fn get_sitemap_setup(&self, domain: &str) -> (&str, bool) {
7727 let (sitemap_path, needs_trailing) = match &self.configuration.sitemap_url {
7728 Some(sitemap_path) => {
7729 let sitemap_path = sitemap_path.as_str();
7730 if domain.ends_with('/') && sitemap_path.starts_with('/') {
7731 (&sitemap_path[1..], false)
7732 } else if !domain.ends_with('/')
7733 && !sitemap_path.is_empty()
7734 && !sitemap_path.starts_with('/')
7735 {
7736 (sitemap_path, true)
7737 } else {
7738 (sitemap_path, false)
7739 }
7740 }
7741 _ => ("sitemap.xml", !domain.ends_with("/")),
7742 };
7743
7744 (sitemap_path, needs_trailing)
7745 }
7746
    /// Crawl the website's sitemap(s) over plain HTTP: fetch each sitemap,
    /// expand nested sitemap indexes into `sitemaps`, stream resulting pages
    /// through an mpsc channel for link extraction, and merge discovered
    /// links into `extra_links`.
    #[cfg(feature = "sitemap")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub(crate) async fn sitemap_crawl_raw(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        let mut exceeded_budget = self.is_over_wild_budget(&self.configuration.budget);

        if !exceeded_budget {
            let selectors = self.setup_selectors();
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
            let domain = self.url.inner().as_str();
            self.domain_parsed = parse_absolute_url(domain);

            // Only persist pages when this is the initial crawl pass.
            let persist_links = self.status == CrawlStatus::Start;

            let mut interval: Interval = tokio::time::interval(Duration::from_millis(15));

            let (sitemap_path, needs_trailing) = self.get_sitemap_setup(domain);

            self.configuration.sitemap_url = Some(Box::new(
                string_concat!(domain, if needs_trailing { "/" } else { "" }, sitemap_path).into(),
            ));

            self.configuration.configure_allowlist();

            let domain_parsed_ref = self.domain_parsed.as_deref().cloned().map(Box::new);

            // Shared by the page-consumer task: 0 channel, 1 channel guard,
            // 2 selectors, 3 parsed domain.
            let shared = Arc::new((
                self.channel.clone(),
                self.channel_guard.clone(),
                selectors,
                domain_parsed_ref,
            ));
            let mut sitemaps = match &self.configuration.sitemap_url {
                Some(sitemap) => Vec::from([sitemap.to_owned()]),
                _ => Default::default(),
            };

            let return_page_links = self.configuration.return_page_links;

            let mut extra_links = self.extra_links.clone();
            self.dequeue(&mut q, &mut extra_links, &mut exceeded_budget)
                .await;
            self.extra_links.clone_from(&extra_links);

            // Temporarily widen the whitelist so sitemap URLs pass the filter.
            let whitelist_changes = self.configuration.add_sitemap_to_whitelist();

            if whitelist_changes.modified() {
                self.configuration.set_whitelist();
            }

            'outer: loop {
                let stream =
                    tokio_stream::iter::<Vec<Box<CompactString>>>(sitemaps.drain(..).collect());
                tokio::pin!(stream);

                let mut first_request = false;
                let mut attempted_correct = false;

                while let Some(mut sitemap_url) = stream.next().await {
                    if !self.handle_process(handle, &mut interval, async {}).await {
                        break 'outer;
                    }

                    let link = <CompactString as Clone>::clone(&(*sitemap_url)).into();

                    let allowed = self.is_allowed_budgetless(&link);

                    if allowed.eq(&ProcessLinkStatus::Blocked) {
                        continue;
                    }

                    self.insert_link(link).await;

                    // Consumer task: extracts links from each received page and
                    // republishes it on the subscription channel.
                    let (tx, mut rx) = tokio::sync::mpsc::channel::<Page>(100);

                    let shared = shared.clone();

                    let handles = crate::utils::spawn_task("page_fetch", async move {
                        let mut pages = Vec::new();

                        while let Some(mut page) = rx.recv().await {
                            if page.page_links.is_none() {
                                let links = page.links(&shared.2, &shared.3).await;
                                page.page_links = Some(links.into());
                            }

                            if scrape || persist_links {
                                pages.push(page.clone());
                            };

                            if !return_page_links {
                                page.page_links = None;
                            }

                            if shared.0.is_some() {
                                channel_send_page(&shared.0, page, &shared.1);
                            }
                        }

                        pages
                    });

                    // Keep requesting until a sitemap responds or path
                    // correction (sitemap_parse) gives up.
                    while !first_request {
                        match client.get(sitemap_url.as_str()).send().await {
                            Ok(response) => {
                                let limit = *crate::utils::MAX_SIZE_BYTES as u64;

                                // Skip bodies too large to parse safely.
                                if let Some(response_content_length) = response.content_length() {
                                    if limit > 0 && response_content_length >= limit {
                                        first_request = true;
                                        log::info!(
                                            "{} exceeded parse limit: {:?}",
                                            sitemap_url,
                                            limit
                                        );
                                        break;
                                    }
                                }

                                if response.status() == 404 {
                                    // Try to discover the real sitemap location.
                                    if !self
                                        .sitemap_parse(
                                            client,
                                            &mut first_request,
                                            &mut sitemap_url,
                                            &mut attempted_correct,
                                        )
                                        .await
                                    {
                                        break;
                                    }
                                } else {
                                    match response.bytes().await {
                                        Ok(b) => {
                                            first_request = true;
                                            self.sitemap_parse_crawl(
                                                client,
                                                handle,
                                                b,
                                                &mut interval,
                                                &mut exceeded_budget,
                                                &tx,
                                                &mut sitemaps,
                                                true,
                                            )
                                            .await;
                                        }
                                        Err(err) => {
                                            first_request = true;
                                            log::info!("http parse error: {:?}", err.to_string())
                                        }
                                    };
                                }
                            }
                            Err(err) => {
                                if attempted_correct {
                                    first_request = true;
                                    break;
                                }

                                log::info!("attempting to find sitemap path: {}", err.to_string());

                                if !self
                                    .sitemap_parse(
                                        client,
                                        &mut first_request,
                                        &mut sitemap_url,
                                        &mut attempted_correct,
                                    )
                                    .await
                                {
                                    break;
                                }
                            }
                        };
                    }

                    // Close the channel so the consumer task can finish.
                    drop(tx);

                    if let Ok(mut handle) = handles.await {
                        for page in handle.iter_mut() {
                            if let Some(mut links) = page.page_links.clone() {
                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                self.extra_links.extend(*links)
                            }
                        }
                        if scrape {
                            if let Some(p) = self.pages.as_mut() {
                                p.extend(handle);
                            }
                        }
                    }

                    if exceeded_budget {
                        break;
                    }
                }

                if sitemaps.is_empty() || exceeded_budget {
                    break;
                }
            }

            // Restore the whitelist to its pre-sitemap state.
            self.configuration
                .remove_sitemap_from_whitelist(whitelist_changes);
        }
    }
7963
    /// Crawl the website's sitemap(s) using a chrome browser to render each
    /// entry. XML sitemaps are parsed with the `sitemap` reader (nested
    /// sitemap indexes are queued); HTML fallbacks are link-extracted
    /// directly. Rendered pages are published on the subscription channel.
    #[cfg(all(
        feature = "sitemap",
        feature = "chrome",
        not(feature = "decentralized")
    ))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub(crate) async fn sitemap_crawl_chrome(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        use crate::features::chrome::attempt_navigation;
        use sitemap::{
            reader::{SiteMapEntity, SiteMapReader},
            structs::Location,
        };

        let mut exceeded_budget = self.is_over_wild_budget(&self.configuration.budget);

        if !exceeded_budget {
            if let Some(mut b) = self.setup_browser().await {
                let selectors = self.setup_selectors();
                let semaphore: Arc<Semaphore> = self.setup_semaphore();
                let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
                let domain = self.url.inner().as_str();
                self.domain_parsed = parse_absolute_url(&domain);
                // Only persist pages when this is the initial crawl pass.
                let persist_links = self.status == CrawlStatus::Start;

                let mut interval = tokio::time::interval(Duration::from_millis(15));

                let (sitemap_path, needs_trailing) = self.get_sitemap_setup(&domain);

                self.configuration.sitemap_url = Some(Box::new(
                    string_concat!(domain, if needs_trailing { "/" } else { "" }, sitemap_path)
                        .into(),
                ));

                self.configuration.configure_allowlist();

                let domain_parsed_ref = self.domain_parsed.as_deref().cloned().map(Box::new);

                // Shared by spawned render tasks: 0 channel, 1 channel guard,
                // 2 browser handle, 3 configuration, 4 root url string,
                // 5 browser context, 6 selectors, 7 parsed domain.
                let shared = Arc::new((
                    self.channel.clone(),
                    self.channel_guard.clone(),
                    b.browser.0.clone(),
                    self.configuration.clone(),
                    self.url.inner().to_string(),
                    b.browser.2.clone(),
                    selectors.clone(),
                    domain_parsed_ref,
                ));

                let mut sitemaps = match &self.configuration.sitemap_url {
                    Some(sitemap) => Vec::from([sitemap.to_owned()]),
                    _ => Default::default(),
                };

                // Anchor point for the optional crawl timeout.
                let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                    Some(Instant::now())
                } else {
                    None
                };

                let mut extra_links = self.extra_links.clone();
                self.dequeue(&mut q, &mut *extra_links, &mut exceeded_budget)
                    .await;
                self.extra_links.clone_from(&extra_links);
                let mut set: JoinSet<Option<Page>> = JoinSet::new();

                // Temporarily widen the whitelist so sitemap URLs pass the filter.
                let whitelist_changes = self.configuration.add_sitemap_to_whitelist();

                if whitelist_changes.modified() {
                    self.configuration.set_whitelist();
                }

                'outer: loop {
                    let stream: tokio_stream::Iter<std::vec::IntoIter<Box<CompactString>>> =
                        tokio_stream::iter::<Vec<Box<CompactString>>>(sitemaps.drain(..).collect());
                    tokio::pin!(stream);

                    tokio::select! {
                        biased;
                        // Pull the next sitemap only while permits are free and
                        // the crawl timeout has not elapsed.
                        Some(sitemap_url) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            if !self.handle_process(handle, &mut interval, async {}).await {
                                break 'outer;
                            }

                            let link = <CompactString as Clone>::clone(&(*sitemap_url)).into();

                            let allowed = self.is_allowed_budgetless(&link);

                            if allowed.eq(&ProcessLinkStatus::Blocked) {
                                continue;
                            }

                            self.insert_link(link).await;

                            // Render the sitemap itself in a fresh tab.
                            match attempt_navigation(
                                "about:blank",
                                &shared.2,
                                &self.configuration.request_timeout,
                                &shared.5,
                                &self.configuration.viewport,
                            )
                            .await {
                                Ok(new_page) => {
                                    let (_, intercept_handle) = tokio::join!(
                                        crate::features::chrome::setup_chrome_events(
                                            &new_page,
                                            &self.configuration
                                        ),
                                        self.setup_chrome_interception(&new_page)
                                    );

                                    let mut page = Page::new(
                                        &sitemap_url,
                                        &client,
                                        &new_page,
                                        &self.configuration.wait_for,
                                        &self.configuration.screenshot,
                                        false, &self.configuration.openai_config,
                                        &self.configuration.execution_scripts,
                                        &self.configuration.automation_scripts,
                                        &self.configuration.viewport,
                                        &self.configuration.request_timeout,
                                        &self.configuration.track_events,
                                        self.configuration.referer.clone(),
                                        self.configuration.max_page_bytes,
                                        self.configuration.get_cache_options(),
                                        &self.configuration.cache_policy,
                                        &self.configuration.remote_multimodal,
                                    )
                                    .await;

                                    // Bound how long the interception handler may run.
                                    if let Some(h) = intercept_handle {
                                        let abort_handle = h.abort_handle();
                                        if let Err(elasped) =
                                            tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                                        {
                                            log::warn!("Handler timeout exceeded {elasped}");
                                            abort_handle.abort();
                                        }
                                    }

                                    drop(new_page);

                                    // XML detection: starts like XML and was not
                                    // rewrapped into an HTML document by the browser.
                                    let is_xml_entry = page.get_html_bytes_u8().starts_with(b"<?xml");
                                    let is_xml = is_xml_entry
                                        && !page.get_html_bytes_u8().ends_with(b"</html>");

                                    if is_xml {
                                        let reader = SiteMapReader::new(&*page.get_html_bytes_u8());
                                        let mut stream = tokio_stream::iter(reader);

                                        while let Some(entity) = stream.next().await {
                                            if !self.handle_process(handle, &mut interval, async {}).await {
                                                break;
                                            }
                                            match entity {
                                                SiteMapEntity::Url(url_entry) => match url_entry.loc {
                                                    Location::Url(url) => {
                                                        let link: CaseInsensitiveString = url.as_str().into();

                                                        let allowed = self.is_allowed(&link);

                                                        if allowed.eq(&ProcessLinkStatus::Blocked) {
                                                            continue;
                                                        }
                                                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                                            exceeded_budget = true;
                                                            break;
                                                        }

                                                        self.insert_link(link.clone()).await;

                                                        let client = client.clone();
                                                        let shared = shared.clone();

                                                        // Render each sitemap URL concurrently.
                                                        spawn_set("page_fetch", &mut set, async move {
                                                            if let Ok(new_page) = attempt_navigation(
                                                                "about:blank",
                                                                &shared.2,
                                                                &shared.3.request_timeout,
                                                                &shared.5,
                                                                &shared.3.viewport,
                                                            )
                                                            .await
                                                            {
                                                                let (_, intercept_handle) = tokio::join!(
                                                                    crate::features::chrome::setup_chrome_events(
                                                                        &new_page, &shared.3,
                                                                    ),
                                                                    crate::features::chrome::setup_chrome_interception_base(
                                                                        &new_page,
                                                                        shared.3.chrome_intercept.enabled,
                                                                        &shared.3.auth_challenge_response,
                                                                        shared.3.chrome_intercept.block_visuals,
                                                                        &shared.4,
                                                                    )
                                                                );

                                                                let mut page = Page::new(
                                                                    &link.inner(),
                                                                    &client,
                                                                    &new_page,
                                                                    &shared.3.wait_for,
                                                                    &shared.3.screenshot,
                                                                    false,
                                                                    &shared.3.openai_config,
                                                                    &shared.3.execution_scripts,
                                                                    &shared.3.automation_scripts,
                                                                    &shared.3.viewport,
                                                                    &shared.3.request_timeout,
                                                                    &shared.3.track_events,
                                                                    shared.3.referer.clone(),
                                                                    shared.3.max_page_bytes,
                                                                    shared.3.get_cache_options(),
                                                                    &shared.3.cache_policy,
                                                                    &shared.3.remote_multimodal,
                                                                )
                                                                .await;

                                                                if let Some(intercept_handle) = intercept_handle
                                                                {
                                                                    let abort_handle =
                                                                        intercept_handle.abort_handle();

                                                                    if let Err(elasped) = tokio::time::timeout(
                                                                        tokio::time::Duration::from_secs(10),
                                                                        async { intercept_handle.await },
                                                                    )
                                                                    .await
                                                                    {
                                                                        log::warn!("Handler timeout exceeded {elasped}");
                                                                        abort_handle.abort();
                                                                    }
                                                                }

                                                                if page.page_links.is_none() {
                                                                    let links =
                                                                        page.links(&shared.6, &shared.7).await;
                                                                    page.page_links = Some(links.into());
                                                                }

                                                                Some(page)
                                                            } else {
                                                                None
                                                            }
                                                        });
                                                    }
                                                    Location::None | Location::ParseErr(_) => (),
                                                },
                                                SiteMapEntity::SiteMap(sitemap_entry) => {
                                                    // Nested sitemap index: queue for a later pass.
                                                    match sitemap_entry.loc {
                                                        Location::Url(url) => {
                                                            sitemaps.push(Box::new(CompactString::new(
                                                                &url.as_str(),
                                                            )));
                                                        }
                                                        Location::None | Location::ParseErr(_) => (),
                                                    }
                                                }
                                                SiteMapEntity::Err(err) => {
                                                    log::info!("incorrect sitemap error: {:?}", err.msg(),)
                                                }
                                            };

                                            if exceeded_budget {
                                                break;
                                            }
                                        }
                                    } else {

                                        // HTML fallback: extract links directly from the page.
                                        if is_xml_entry {
                                            page.modify_xml_html();
                                        }

                                        let links = page.links(&shared.6, &shared.7).await;

                                        let mut stream = tokio_stream::iter(links);

                                        while let Some(link) = stream.next().await {
                                            if !self.handle_process(handle, &mut interval, async {}).await {
                                                break;
                                            }

                                            // Treat .xml links as further sitemaps.
                                            if link.ends_with(".xml") {
                                                sitemaps.push(Box::new(link.inner().clone()));
                                                continue;
                                            }

                                            let allowed = self.is_allowed(&link);

                                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                                exceeded_budget = true;
                                                break;
                                            }
                                            if allowed.eq(&ProcessLinkStatus::Blocked) {
                                                continue;
                                            }

                                            self.insert_link(link.clone()).await;

                                            let client = client.clone();
                                            let shared = shared.clone();

                                            spawn_set("page_fetch", &mut set, async move {
                                                match attempt_navigation(
                                                    "about:blank",
                                                    &shared.2,
                                                    &shared.3.request_timeout,
                                                    &shared.5,
                                                    &shared.3.viewport,
                                                )
                                                .await {
                                                    Ok(new_page) => {
                                                        let (_, intercept_handle) = tokio::join!(
                                                            crate::features::chrome::setup_chrome_events(
                                                                &new_page, &shared.3,
                                                            ),
                                                            crate::features::chrome::setup_chrome_interception_base(
                                                                &new_page,
                                                                shared.3.chrome_intercept.enabled,
                                                                &shared.3.auth_challenge_response,
                                                                shared.3.chrome_intercept.block_visuals,
                                                                &shared.4,
                                                            )
                                                        );

                                                        let mut page = Page::new(
                                                            &link.inner(),
                                                            &client,
                                                            &new_page,
                                                            &shared.3.wait_for,
                                                            &shared.3.screenshot,
                                                            false,
                                                            &shared.3.openai_config,
                                                            &shared.3.execution_scripts,
                                                            &shared.3.automation_scripts,
                                                            &shared.3.viewport,
                                                            &shared.3.request_timeout,
                                                            &shared.3.track_events,
                                                            shared.3.referer.clone(),
                                                            shared.3.max_page_bytes,
                                                            shared.3.get_cache_options(),
                                                            &shared.3.cache_policy,
                                                            &shared.3.remote_multimodal,
                                                        )
                                                        .await;

                                                        if let Some(intercept_handle) = intercept_handle {
                                                            let abort_handle = intercept_handle.abort_handle();

                                                            if let Err(elasped) = tokio::time::timeout(
                                                                tokio::time::Duration::from_secs(10),
                                                                async { intercept_handle.await },
                                                            )
                                                            .await
                                                            {
                                                                log::warn!("Handler timeout exceeded {elasped}");
                                                                abort_handle.abort();
                                                            }
                                                        }

                                                        if page.page_links.is_none() {
                                                            let links = page.links(&shared.6, &shared.7).await;
                                                            page.page_links = Some(links.into());
                                                        }

                                                        Some(page)
                                                    }
                                                    Err(err) => {
                                                        log::error!("chrome failed to open: {:?}", err);
                                                        None
                                                    }
                                                }
                                            });

                                            if exceeded_budget {
                                                break;
                                            }
                                        }
                                    }
                                }
                                Err(err) => {
                                    log::error!("chrome failed to open: {:?}", err);
                                }
                            }


                        },
                        // Harvest finished renders, de-duplicating by signature.
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res {
                                    Some(page) => {
                                        if let Some(signature) = page.signature {
                                            if self.is_signature_allowed(signature).await {
                                                if let Some(mut links) = page.page_links.clone() {
                                                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                                    self.extra_links.extend(*links)
                                                }
                                                self.insert_signature(signature).await;

                                                channel_send_page(
                                                    &shared.0, page.clone(), &shared.1,
                                                );

                                                if scrape || persist_links {
                                                    if let Some(p) = self.pages.as_mut() {
                                                        p.push(page);
                                                    }
                                                }
                                            }
                                        } else {
                                            if let Some(mut links) = page.page_links.clone() {
                                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                                self.extra_links.extend(*links)
                                            }
                                            channel_send_page(
                                                &shared.0, page.clone(), &shared.1,
                                            );
                                            if scrape || persist_links {
                                                if let Some(p) = self.pages.as_mut() {
                                                    p.push(page);
                                                }
                                            }
                                        }
                                    }
                                    _ => ()
                                }
                            } else {
                                break;
                            }
                        }
                        else => break,
                    }

                    if sitemaps.len() == 0 || exceeded_budget {
                        break;
                    }
                }

                // Drain any still-running render tasks before disposing the browser.
                while let Some(result) = set.join_next().await {
                    if let Ok(res) = result {
                        match res {
                            Some(page) => {
                                if let Some(signature) = page.signature {
                                    if self.is_signature_allowed(signature).await {
                                        if let Some(mut links) = page.page_links.clone() {
                                            self.dequeue(&mut q, &mut links, &mut exceeded_budget)
                                                .await;
                                            self.extra_links.extend(*links)
                                        }
                                        self.insert_signature(signature).await;
                                        channel_send_page(&shared.0, page.clone(), &shared.1);
                                        if scrape || persist_links {
                                            if let Some(p) = self.pages.as_mut() {
                                                p.push(page);
                                            }
                                        }
                                    }
                                } else {
                                    if let Some(mut links) = page.page_links.clone() {
                                        self.dequeue(&mut q, &mut links, &mut exceeded_budget)
                                            .await;
                                        self.extra_links.extend(*links)
                                    }
                                    channel_send_page(&shared.0, page.clone(), &shared.1);
                                    if scrape || persist_links {
                                        if let Some(p) = self.pages.as_mut() {
                                            p.push(page);
                                        }
                                    }
                                }
                            }
                            _ => (),
                        }
                    }
                }
                b.dispose();
                // Restore the whitelist to its pre-sitemap state.
                self.configuration
                    .remove_sitemap_from_whitelist(whitelist_changes);
            }
        }
    }
8452
    /// Crawl the website's sitemap(s) over plain HTTP.
    ///
    /// Delegates to [`Website::sitemap_crawl_raw`]; `scrape` controls whether
    /// the resulting pages are retained.
    #[cfg(feature = "sitemap")]
    pub async fn sitemap_crawl(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        self.sitemap_crawl_raw(client, handle, scrape).await
    }
8463
    /// Chain a sitemap crawl onto a regular crawl (HTTP/decentralized build).
    /// No-op when `ignore_sitemap` is set in the configuration.
    #[cfg(all(
        feature = "sitemap",
        any(not(feature = "chrome"), feature = "decentralized")
    ))]
    async fn sitemap_crawl_chain(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        if !self.configuration.ignore_sitemap {
            self.sitemap_crawl_raw(client, handle, scrape).await
        }
    }
8479
    /// Chain a sitemap crawl onto a regular crawl (chrome build) using the
    /// chrome-backed sitemap crawler. No-op when `ignore_sitemap` is set.
    /// NOTE(review): this cfg variant is `pub` while the non-chrome variant is
    /// private — confirm the visibility difference is intended.
    #[cfg(all(
        feature = "sitemap",
        feature = "chrome",
        not(feature = "decentralized")
    ))]
    pub async fn sitemap_crawl_chain(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        if !self.configuration.ignore_sitemap {
            self.sitemap_crawl_chrome(client, handle, scrape).await
        }
    }
8496
    /// Attempt to discover the site's sitemap by fetching the root page and
    /// scanning it for a `<link rel="sitemap" href="…">` element.
    ///
    /// On success, `sitemap_url` is replaced with the discovered href and
    /// `attempted_correct` is set so the discovery is not repeated. When the
    /// page is too large, the request fails, or no link is found,
    /// `first_request` is set so the caller falls back to the default path.
    /// Returns `valid`: whether the root fetch itself was usable.
    #[cfg(feature = "sitemap")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn sitemap_parse(
        &mut self,
        client: &Client,
        first_request: &mut bool,
        sitemap_url: &mut Box<CompactString>,
        attempted_correct: &mut bool,
    ) -> bool {
        // Only run discovery once per crawl.
        let mut valid = !*attempted_correct;

        if valid {
            if let Some(domain) = &self.domain_parsed {
                match client.get(domain.as_str()).send().await {
                    Ok(response) => {
                        let limit = *crate::utils::MAX_SIZE_BYTES as u64;

                        // Skip bodies larger than the configured parse limit.
                        if let Some(response_content_length) = response.content_length() {
                            if limit > 0 && response_content_length >= limit {
                                log::info!("{} exceeded parse limit: {:?}", domain, limit);
                                *first_request = true;
                                *attempted_correct = true;
                                valid = false;
                            }
                        }

                        if valid {
                            // Set exactly once by the element handler below.
                            let cell = tokio::sync::OnceCell::new();

                            let rewriter_settings = lol_html::Settings {
                                element_content_handlers: vec![lol_html::element!(
                                    r#"link[rel="sitemap"]"#,
                                    |el| {
                                        if let Some(href) = el.get_attribute("href") {
                                            let _ = cell.set(href);
                                        }
                                        Ok(())
                                    }
                                )],
                                adjust_charset_on_meta_tag: false,
                                ..lol_html::send::Settings::new_for_handler_types()
                            };

                            // Output sink is a no-op: we only want the handler side effect.
                            let mut rewriter = lol_html::send::HtmlRewriter::new(
                                rewriter_settings,
                                |_c: &[u8]| {},
                            );

                            let mut wrote_error = false;
                            let mut stream = response.bytes_stream();

                            // Stream the body chunk-by-chunk; stop as soon as a
                            // sitemap link is found or the rewriter errors.
                            while let Some(chunk) = stream.next().await {
                                if let Ok(chunk) = chunk {
                                    if rewriter.write(&chunk).is_err() {
                                        wrote_error = true;
                                        break;
                                    }
                                }
                                if cell.initialized() {
                                    break;
                                }
                            }

                            if !wrote_error {
                                let _ = rewriter.end();
                            }

                            if let Some(sitemap) = cell.get() {
                                // Empty or un-joinable hrefs still record the value
                                // but flag the caller to retry the default path.
                                if sitemap.is_empty() {
                                    *first_request = true;
                                }

                                if domain.join(sitemap).is_err() {
                                    *first_request = true;
                                }
                                *sitemap_url = Box::new(sitemap.into());
                                *attempted_correct = true;
                            } else {
                                *first_request = true;
                            }
                        }
                    }
                    Err(err) => {
                        *first_request = true;
                        valid = false;
                        log::info!("http parse error: {:?}", err.to_string())
                    }
                };
            }
        }

        valid
    }
    /// Parse a fetched sitemap XML body and process its entries.
    ///
    /// URL entries are budget/blocklist-checked, recorded as visited, and —
    /// when `crawl` is true — fetched on spawned tasks (with retry) and sent
    /// through `tx`. Nested sitemap entries are pushed onto `sitemaps` for the
    /// caller to process. Bodies that do not start with `<?xml` are ignored.
    /// Sets `exceeded_budget` and stops early when the crawl budget is hit.
    #[cfg(feature = "sitemap")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn sitemap_parse_crawl(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        b: bytes::Bytes,
        interval: &mut Interval,
        exceeded_budget: &mut bool,
        tx: &tokio::sync::mpsc::Sender<Page>,
        sitemaps: &mut Vec<Box<CompactString>>,
        crawl: bool,
    ) {
        use sitemap::reader::{SiteMapEntity, SiteMapReader};
        use sitemap::structs::Location;

        // Cheap sniff: only parse documents that look like XML.
        if !b.is_empty() && b.starts_with(b"<?xml") {
            let mut stream = tokio_stream::iter(SiteMapReader::new(&*b));

            let retry = self.configuration.retry;

            while let Some(entity) = stream.next().await {
                // Respect pause/shutdown signals between entries.
                if !self.handle_process(handle, interval, async {}).await {
                    break;
                }
                match entity {
                    SiteMapEntity::Url(url_entry) => match url_entry.loc {
                        Location::Url(url) => {
                            let link: CaseInsensitiveString = url.as_str().into();

                            let allowed = self.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::Blocked) {
                                continue;
                            }

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                *exceeded_budget = true;
                                break;
                            }

                            // Mark visited before fetching to avoid duplicates.
                            self.insert_link(link.clone()).await;

                            if crawl {
                                let client = client.clone();
                                let tx = tx.clone();
                                let cache_options = self.configuration.get_cache_options();
                                let cache_policy = self.configuration.cache_policy.clone();

                                // Fetch each sitemap URL concurrently on its own task.
                                crate::utils::spawn_task("page_fetch", async move {
                                    let mut page = Page::new_page_with_cache(
                                        link.inner(),
                                        &client,
                                        cache_options.clone(),
                                        &cache_policy,
                                    )
                                    .await;

                                    let mut retry_count = retry;

                                    // Retry transient failures, honoring any
                                    // server-suggested backoff from the page.
                                    while page.should_retry && retry_count > 0 {
                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }
                                        page.clone_from(
                                            &Page::new_page_with_cache(
                                                link.inner(),
                                                &client,
                                                cache_options.clone(),
                                                &cache_policy,
                                            )
                                            .await,
                                        );
                                        retry_count -= 1;
                                    }

                                    // Backpressure: wait for channel capacity.
                                    if let Ok(permit) = tx.reserve().await {
                                        permit.send(page);
                                    }
                                });
                            }
                        }
                        Location::None | Location::ParseErr(_) => (),
                    },
                    SiteMapEntity::SiteMap(sitemap_entry) => match sitemap_entry.loc {
                        Location::Url(url) => {
                            sitemaps.push(Box::new(CompactString::new(url.as_str())));
                        }
                        Location::None | Location::ParseErr(_) => (),
                    },
                    SiteMapEntity::Err(err) => {
                        log::info!("incorrect sitemap error: {:?}", err.msg())
                    }
                };

                if *exceeded_budget {
                    break;
                }
            }
        }
    }
8696
    /// The starting URL of the crawl (regex build: full case-insensitive form).
    #[cfg(feature = "regex")]
    pub fn get_base_link(&self) -> &CaseInsensitiveString {
        &self.url
    }

    /// The starting URL of the crawl (inner compact-string form).
    #[cfg(not(feature = "regex"))]
    pub fn get_base_link(&self) -> &CompactString {
        self.url.inner()
    }
8708
    /// Wait on the channel guard while subscribers still have pending pages.
    /// Only blocks when a broadcast channel exists, its receiver side is
    /// non-empty, and a guard counter has been installed via `subscribe_guard`.
    // NOTE(review): `guard_counter.lock().await` is assumed to wait until the
    // guard is released — confirm against `ChannelGuard`'s definition.
    pub async fn subscription_guard(&self) {
        if let Some(channel) = &self.channel {
            if !channel.1.is_empty() {
                if let Some(guard_counter) = &self.channel_guard {
                    guard_counter.lock().await
                }
            }
        }
    }
8719
    /// Launch a Chrome browser for the given configuration and wrap it in a
    /// `BrowserController`. Returns `None` when the launch fails.
    ///
    /// * `url_parsed` - the target URL used to scope cookies/context.
    /// * `jar` - optional cookie jar shared with the HTTP client.
    #[cfg(feature = "chrome")]
    pub async fn setup_browser_base(
        config: &Configuration,
        url_parsed: &Option<Box<Url>>,
        jar: Option<&Arc<crate::client::cookie::Jar>>,
    ) -> Option<crate::features::chrome::BrowserController> {
        match crate::features::chrome::launch_browser_cookies(config, url_parsed, jar).await {
            Some((browser, browser_handle, context_id)) => {
                // Share the browser handle across tasks via Arc.
                let browser: Arc<chromiumoxide::Browser> = Arc::new(browser);
                let b = (browser, Some(browser_handle), context_id);

                Some(crate::features::chrome::BrowserController::new(b))
            }
            _ => None,
        }
    }
8737
    /// Launch a browser controller for this website using its own
    /// configuration, parsed URL, and cookie jar.
    #[cfg(feature = "chrome")]
    pub async fn setup_browser(&self) -> Option<crate::features::chrome::BrowserController> {
        Website::setup_browser_base(
            &self.configuration,
            self.get_url_parsed(),
            Some(&self.cookie_jar),
        )
        .await
    }
8748
    /// Launch a WebDriver session based on this website's configuration.
    /// Returns `None` when the driver cannot be started.
    #[cfg(feature = "webdriver")]
    pub async fn setup_webdriver(&self) -> Option<crate::features::webdriver::WebDriverController> {
        crate::features::webdriver::launch_driver(&self.configuration).await
    }
8754
    /// Navigate the WebDriver to `url` and return the rendered page HTML.
    /// Returns `None` on navigation or content-retrieval failure (logged).
    #[cfg(feature = "webdriver")]
    pub async fn render_webdriver_page(
        &self,
        url: &str,
        driver: &std::sync::Arc<thirtyfour::WebDriver>,
    ) -> Option<String> {
        use crate::features::webdriver::{
            attempt_navigation, get_page_content, setup_driver_events,
        };

        // Per-navigation timeout from the webdriver config, when present.
        let timeout = self
            .configuration
            .webdriver_config
            .as_ref()
            .and_then(|c| c.timeout);

        if let Err(e) = attempt_navigation(url, driver, &timeout).await {
            log::error!("WebDriver navigation failed: {:?}", e);
            return None;
        }

        // Install configured event hooks after a successful navigation.
        setup_driver_events(driver, &self.configuration).await;

        match get_page_content(driver).await {
            Ok(content) => Some(content),
            Err(e) => {
                log::error!("Failed to get WebDriver page content: {:?}", e);
                None
            }
        }
    }
8790
    /// Respect `robots.txt` rules during the crawl.
    pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
        self.configuration
            .with_respect_robots_txt(respect_robots_txt);
        self
    }

    /// Include subdomains of the start URL in the crawl.
    pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
        self.configuration.with_subdomains(subdomains);
        self
    }

    /// Bypass Content-Security-Policy restrictions.
    pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_csp_bypass(enabled);
        self
    }

    /// Configure WebDriver-based rendering for the crawl.
    #[cfg(feature = "webdriver")]
    pub fn with_webdriver(
        &mut self,
        webdriver_config: crate::features::webdriver_common::WebDriverConfig,
    ) -> &mut Self {
        self.configuration
            .with_webdriver_config(Some(webdriver_config));
        self
    }

    /// No-op stub when the `webdriver` feature is disabled.
    #[cfg(not(feature = "webdriver"))]
    pub fn with_webdriver(&mut self, _webdriver_config: ()) -> &mut Self {
        self
    }

    /// Enable or disable sqlite disk persistence; disabling drops any handle.
    #[cfg(feature = "disk")]
    pub fn with_sqlite(&mut self, sqlite: bool) -> &mut Self {
        if sqlite {
            self.enable_sqlite = true;
        } else {
            self.enable_sqlite = false;
            self.sqlite = None;
        };
        self
    }

    /// No-op stub when the `disk` feature is disabled.
    #[cfg(not(feature = "disk"))]
    pub fn with_sqlite(&mut self, _sqlite: bool) -> &mut Self {
        self
    }
8845
    /// Allow crawling across all top-level domains of the host.
    pub fn with_tld(&mut self, tld: bool) -> &mut Self {
        self.configuration.with_tld(tld);
        self
    }

    /// Set an overall wall-clock limit for the whole crawl.
    pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
        self.configuration.with_crawl_timeout(crawl_timeout);
        self
    }

    /// Assume HTTP/2 without ALPN negotiation (prior knowledge).
    pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
        self.configuration
            .with_http2_prior_knowledge(http2_prior_knowledge);
        self
    }

    /// Delay between requests, in milliseconds (presumably — confirm units in `Configuration::with_delay`).
    pub fn with_delay(&mut self, delay: u64) -> &mut Self {
        self.configuration.with_delay(delay);
        self
    }

    /// Per-request timeout; `None` clears it.
    pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
        self.configuration.with_request_timeout(request_timeout);
        self
    }

    /// Accept invalid TLS certificates (dangerous; testing only).
    pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
        self.configuration
            .with_danger_accept_invalid_certs(accept_invalid_certs);
        self
    }

    /// Set the User-Agent header; `None` uses the default.
    pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
        self.configuration.with_user_agent(user_agent);
        self
    }

    /// Preserve the Host header across redirects/requests.
    pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
        self.configuration.with_preserve_host_header(preserve);
        self
    }
8895
    /// Set an explicit sitemap URL to crawl.
    #[cfg(feature = "sitemap")]
    pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
        self.configuration.with_sitemap(sitemap_url);
        self
    }

    /// No-op stub when the `sitemap` feature is disabled.
    #[cfg(not(feature = "sitemap"))]
    pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
        self
    }

    /// Set proxy server URLs to route requests through.
    pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
        self.configuration.with_proxies(proxies);
        self
    }

    /// Set proxies using fully-specified `RequestProxy` values.
    pub fn with_proxies_direct(
        &mut self,
        proxies: Option<Vec<crate::configuration::RequestProxy>>,
    ) -> &mut Self {
        self.configuration.with_proxies_direct(proxies);
        self
    }

    /// Cap the number of concurrent in-flight requests.
    pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
        self.configuration.with_concurrency_limit(limit);
        self
    }

    /// No-op stub when the `control` feature is disabled.
    #[cfg(not(feature = "control"))]
    pub fn with_crawl_id(&mut self, _crawl_id: String) -> &mut Self {
        self
    }

    /// Set the crawl id used by the control channel.
    #[cfg(feature = "control")]
    pub fn with_crawl_id(&mut self, crawl_id: String) -> &mut Self {
        self.crawl_id = crawl_id.into();
        self
    }
8942
    /// Set URL patterns that must never be crawled.
    pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
    where
        Vec<CompactString>: From<Vec<T>>,
    {
        self.configuration.with_blacklist_url(blacklist_url);
        self
    }

    /// Number of retry attempts for failed requests.
    pub fn with_retry(&mut self, retry: u8) -> &mut Self {
        self.configuration.with_retry(retry);
        self
    }

    /// Disable the control thread for the crawl.
    pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
        self.configuration.with_no_control_thread(no_control_thread);
        self
    }

    /// Restrict crawling to URLs matching these patterns.
    pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
    where
        Vec<CompactString>: From<Vec<T>>,
    {
        self.configuration.with_whitelist_url(whitelist_url);
        self
    }
8972
    /// Track Chrome DevTools events during browser crawls.
    #[cfg(feature = "chrome")]
    pub fn with_event_tracker(
        &mut self,
        track_events: Option<crate::configuration::ChromeEventTracker>,
    ) -> &mut Self {
        self.configuration.with_event_tracker(track_events);
        self
    }

    /// Set custom HTTP headers sent with every request.
    pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
        self.configuration.with_headers(headers);
        self
    }

    /// Allow per-request header modification.
    pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
        self.configuration.with_modify_headers(modify_headers);
        self
    }

    /// Allow modification of the underlying HTTP client's headers.
    pub fn with_modify_http_client_headers(
        &mut self,
        modify_http_client_headers: bool,
    ) -> &mut Self {
        self.configuration
            .with_modify_http_client_headers(modify_http_client_headers);
        self
    }

    /// Set per-path crawl budgets (max pages per path prefix).
    pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self {
        self.configuration.with_budget(budget);
        self
    }

    /// Directly replace the crawl budget map (non-chaining setter).
    pub fn set_crawl_budget(&mut self, budget: Option<HashMap<CaseInsensitiveString, u32>>) {
        self.configuration.budget = budget;
    }

    /// Maximum crawl depth from the start URL.
    pub fn with_depth(&mut self, depth: usize) -> &mut Self {
        self.configuration.with_depth(depth);
        self
    }

    /// Additional external domains allowed during the crawl.
    pub fn with_external_domains<'a, 'b>(
        &mut self,
        external_domains: Option<impl Iterator<Item = String> + 'a>,
    ) -> &mut Self {
        self.configuration.with_external_domains(external_domains);
        self
    }
9030
9031 pub fn with_on_link_find_callback(
9033 &mut self,
9034 on_link_find_callback: Option<OnLinkFindCallback>,
9035 ) -> &mut Self {
9036 match on_link_find_callback {
9037 Some(callback) => self.on_link_find_callback = Some(callback),
9038 _ => self.on_link_find_callback = None,
9039 };
9040 self
9041 }
9042
    /// Set a link-find callback from any closure, wrapping it in an `Arc`.
    /// The closure receives the found link and optional context and returns
    /// the (possibly rewritten) pair.
    pub fn set_on_link_find<F>(&mut self, f: F)
    where
        F: Fn(CaseInsensitiveString, Option<String>) -> (CaseInsensitiveString, Option<String>)
            + Send
            + Sync
            + 'static,
    {
        self.on_link_find_callback = Some(Arc::new(f));
    }
9053
9054 pub fn with_on_should_crawl_callback(
9056 &mut self,
9057 on_should_crawl_callback: Option<fn(&Page) -> bool>,
9058 ) -> &mut Self {
9059 match on_should_crawl_callback {
9060 Some(callback) => {
9061 self.on_should_crawl_callback = Some(OnShouldCrawlCallback::Fn(callback))
9062 }
9063 _ => self.on_should_crawl_callback = None,
9064 };
9065 self
9066 }
9067
9068 pub fn with_on_should_crawl_callback_closure<F: OnShouldCrawlClosure>(
9072 &mut self,
9073 on_should_crawl_closure: Option<F>,
9074 ) -> &mut Self {
9075 match on_should_crawl_closure {
9076 Some(callback) => {
9077 self.on_should_crawl_callback =
9078 Some(OnShouldCrawlCallback::Closure(Arc::new(callback)))
9079 }
9080 _ => self.on_should_crawl_callback = None,
9081 };
9082 self
9083 }
9084
    /// Set a cookie string sent with requests.
    pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
        self.configuration.with_cookies(cookie_str);
        self
    }

    /// Configure a cron schedule and its run type for recurring crawls.
    pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
        self.configuration.with_cron(cron_str, cron_type);
        self
    }

    /// Set the locale (e.g. Accept-Language / browser locale).
    pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
        self.configuration.with_locale(locale);
        self
    }

    /// Enable basic stealth/anti-bot evasion.
    pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
        self.configuration.with_stealth(stealth_mode);
        self
    }

    /// Select a specific stealth tier (chrome builds only).
    #[cfg(feature = "chrome")]
    pub fn with_stealth_advanced(
        &mut self,
        stealth_mode: spider_fingerprint::configs::Tier,
    ) -> &mut Self {
        self.configuration.with_stealth_advanced(stealth_mode);
        self
    }

    /// Set the basic HTTP cache policy for page fetches.
    pub fn with_cache_policy(
        &mut self,
        cache_policy: Option<crate::utils::BasicCachePolicy>,
    ) -> &mut Self {
        self.configuration.with_cache_policy(cache_policy);

        self
    }

    /// Configure OpenAI-driven automation for the crawl.
    pub fn with_openai(&mut self, openai_configs: Option<configuration::GPTConfigs>) -> &mut Self {
        self.configuration.with_openai(openai_configs);
        self
    }

    /// Configure a remote multimodal model for automation (chrome builds).
    #[cfg(feature = "chrome")]
    pub fn with_remote_multimodal(
        &mut self,
        cfg: Option<crate::features::automation::RemoteMultimodalConfigs>,
    ) -> &mut Self {
        self.configuration.with_remote_multimodal(cfg);
        self
    }

    /// Configure Gemini-driven automation for the crawl.
    pub fn with_gemini(
        &mut self,
        gemini_configs: Option<configuration::GeminiConfigs>,
    ) -> &mut Self {
        self.configuration.with_gemini(gemini_configs);
        self
    }
9224
    /// Enable HTTP response caching for requests.
    pub fn with_caching(&mut self, cache: bool) -> &mut Self {
        self.configuration.with_caching(cache);
        self
    }

    /// Skip the cache when fetching via the browser.
    pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self {
        self.configuration.with_cache_skip_browser(skip);
        self
    }

    /// Allow service workers to run during browser rendering.
    pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_service_worker_enabled(enabled);
        self
    }

    /// Automatically answer geolocation prompts/requests.
    pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_auto_geolocation(enabled);
        self
    }

    /// Use a fully-specified browser fingerprint (chrome builds).
    #[cfg(feature = "chrome")]
    pub fn with_fingerprint_advanced(
        &mut self,
        fingerprint: crate::configuration::Fingerprint,
    ) -> &mut Self {
        self.configuration.with_fingerprint_advanced(fingerprint);
        self
    }

    /// Enable fingerprint randomization/spoofing.
    pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
        self.configuration.with_fingerprint(fingerprint);
        self
    }

    /// Set the browser viewport dimensions.
    pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
        self.configuration.with_viewport(viewport);
        self
    }

    /// Wait for the network to go idle before treating a page as loaded.
    pub fn with_wait_for_idle_network(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_idle_network(wait_for_idle_network);
        self
    }

    /// Wait for a fully idle network (zero in-flight requests).
    pub fn with_wait_for_idle_network0(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_idle_network0(wait_for_idle_network);
        self
    }

    /// Wait for a nearly idle network before proceeding.
    pub fn with_wait_for_almost_idle_network0(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_almost_idle_network0(wait_for_idle_network);
        self
    }

    /// Wait for a CSS selector to appear before capturing the page.
    pub fn with_wait_for_selector(
        &mut self,
        wait_for_selector: Option<crate::configuration::WaitForSelector>,
    ) -> &mut Self {
        self.configuration.with_wait_for_selector(wait_for_selector);
        self
    }

    /// Wait for the DOM to stop mutating before capturing the page.
    pub fn with_wait_for_idle_dom(
        &mut self,
        wait_for_selector: Option<crate::configuration::WaitForSelector>,
    ) -> &mut Self {
        self.configuration.with_wait_for_idle_dom(wait_for_selector);
        self
    }

    /// Wait a fixed delay before capturing the page.
    pub fn with_wait_for_delay(
        &mut self,
        wait_for_delay: Option<crate::configuration::WaitForDelay>,
    ) -> &mut Self {
        self.configuration.with_wait_for_delay(wait_for_delay);
        self
    }
9327
    /// Default TCP connect timeout for the HTTP client.
    pub fn with_default_http_connect_timeout(
        &mut self,
        default_http_connect_timeout: Option<Duration>,
    ) -> &mut Self {
        self.configuration
            .with_default_http_connect_timeout(default_http_connect_timeout);

        self
    }

    /// Default read timeout for the HTTP client.
    pub fn with_default_http_read_timeout(
        &mut self,
        default_http_read_timeout: Option<Duration>,
    ) -> &mut Self {
        self.configuration
            .with_default_http_read_timeout(default_http_read_timeout);

        self
    }

    /// Maximum number of redirects to follow per request.
    pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
        self.configuration.with_redirect_limit(redirect_limit);
        self
    }

    /// Set the redirect-following policy (e.g. strict/loose).
    pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
        self.configuration.with_redirect_policy(policy);
        self
    }

    /// Configure Chrome request interception, scoped to the parsed domain.
    pub fn with_chrome_intercept(
        &mut self,
        chrome_intercept: RequestInterceptConfiguration,
    ) -> &mut Self {
        self.configuration
            .with_chrome_intercept(chrome_intercept, &self.domain_parsed);
        self
    }

    /// Set the Referer header value.
    // NOTE(review): `with_referer` and `with_referrer` both exist — presumably
    // one is a spelling alias; confirm they delegate to the same setting.
    pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
        self.configuration.with_referer(referer);
        self
    }

    /// Set the Referer header value (alternate spelling).
    pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
        self.configuration.with_referrer(referer);
        self
    }

    /// Crawl all resources (assets), not just HTML pages.
    pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
        self.configuration.with_full_resources(full_resources);
        self
    }
9389
9390 pub fn with_dismiss_dialogs(&mut self, full_resources: bool) -> &mut Self {
9392 self.configuration.with_dismiss_dialogs(full_resources);
9393 self
9394 }
9395
    /// Set the TLS/browser emulation profile (wreq builds).
    #[cfg(feature = "wreq")]
    pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
        self.configuration.with_emulation(emulation);
        self
    }

    /// Skip sitemap crawling entirely.
    pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
        self.configuration.with_ignore_sitemap(ignore_sitemap);
        self
    }

    /// Set the browser timezone identifier (e.g. "America/New_York").
    pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
        self.configuration.with_timezone_id(timezone_id);
        self
    }
9414
    /// JavaScript evaluated on every new document before page scripts run.
    pub fn with_evaluate_on_new_document(
        &mut self,
        evaluate_on_new_document: Option<Box<String>>,
    ) -> &mut Self {
        self.configuration
            .with_evaluate_on_new_document(evaluate_on_new_document);

        self
    }

    /// Maximum number of pages to crawl.
    pub fn with_limit(&mut self, limit: u32) -> &mut Self {
        self.configuration.with_limit(limit);
        self
    }

    /// Configure page screenshots during browser crawls.
    pub fn with_screenshot(
        &mut self,
        screenshot_config: Option<configuration::ScreenShotConfig>,
    ) -> &mut Self {
        self.configuration.with_screenshot(screenshot_config);
        self
    }

    /// Share one crawl queue across websites.
    pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
        self.configuration.with_shared_queue(shared_queue);
        self
    }

    /// Respond to HTTP auth challenges with the given credentials.
    pub fn with_auth_challenge_response(
        &mut self,
        auth_challenge_response: Option<configuration::AuthChallengeResponse>,
    ) -> &mut Self {
        self.configuration
            .with_auth_challenge_response(auth_challenge_response);
        self
    }

    /// Attach discovered links to each returned `Page`.
    pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
        self.configuration.with_return_page_links(return_page_links);
        self
    }

    /// Connect to an existing Chrome instance instead of launching one.
    pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
        self.configuration
            .with_chrome_connection(chrome_connection_url);
        self
    }

    /// Scripts executed on matching pages during the crawl.
    pub fn with_execution_scripts(
        &mut self,
        execution_scripts: Option<ExecutionScriptsMap>,
    ) -> &mut Self {
        self.configuration.with_execution_scripts(execution_scripts);
        self
    }

    /// Automation scripts run on matching pages during the crawl.
    pub fn with_automation_scripts(
        &mut self,
        automation_scripts: Option<AutomationScriptsMap>,
    ) -> &mut Self {
        self.configuration
            .with_automation_scripts(automation_scripts);
        self
    }

    /// Bind outbound requests to a specific network interface.
    pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
        self.configuration.with_network_interface(network_interface);
        self
    }

    /// Bind outbound requests to a specific local IP address.
    pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
        self.configuration.with_local_address(local_address);
        self
    }

    /// Block non-HTML asset requests (only crawl HTML).
    pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
        self.configuration.with_block_assets(only_html);
        self
    }

    /// Normalize URLs before visiting/deduplicating.
    pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
        self.configuration.with_normalize(normalize);
        self
    }

    /// Share crawl state between websites.
    pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
        self.configuration.with_shared_state(shared);
        self
    }

    /// Maximum bytes retained per page.
    pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
        self.configuration.with_max_page_bytes(max_page_bytes);
        self
    }

    /// Maximum total bytes allowed across the crawl.
    pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
        self.configuration.with_max_bytes_allowed(max_bytes_allowed);
        self
    }
9530
    /// Replace the entire configuration at once.
    pub fn with_config(&mut self, config: Configuration) -> &mut Self {
        self.configuration = config.into();
        self
    }

    /// Route the crawl through Spider Cloud using the given API key.
    #[cfg(feature = "spider_cloud")]
    pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
        self.configuration.with_spider_cloud(api_key);
        self
    }

    /// No-op stub when the `spider_cloud` feature is disabled.
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
        self
    }

    /// Configure Spider Cloud with a full configuration value.
    #[cfg(feature = "spider_cloud")]
    pub fn with_spider_cloud_config(
        &mut self,
        config: crate::configuration::SpiderCloudConfig,
    ) -> &mut Self {
        self.configuration.with_spider_cloud_config(config);
        self
    }

    /// No-op stub when the `spider_cloud` feature is disabled.
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
        self
    }

    /// Configure hedged requests (duplicate slow requests) for latency.
    #[cfg(feature = "hedge")]
    pub fn with_hedge(&mut self, config: crate::utils::hedge::HedgeConfig) -> &mut Self {
        self.configuration.with_hedge(config);
        self
    }

    /// No-op stub when the `hedge` feature is disabled.
    #[cfg(not(feature = "hedge"))]
    pub fn with_hedge(&mut self, _config: ()) -> &mut Self {
        self
    }
9578
9579 pub fn build(&self) -> Result<Self, Self> {
9581 if self.domain_parsed.is_none() {
9582 Err(self.to_owned())
9583 } else {
9584 Ok(self.to_owned())
9585 }
9586 }
9587
    /// Remove all configured custom HTTP headers (keeps the header map itself).
    pub fn clear_headers(&mut self) {
        if let Some(headers) = self.configuration.headers.as_mut() {
            headers.0.clear();
        }
    }
9594
9595 pub fn determine_limits(&mut self) {
9597 self.configuration.configure_budget();
9598 if self.configuration.inner_budget.is_some() {
9599 let wild_card_budget = match &self.configuration.inner_budget {
9600 Some(budget) => budget.contains_key(&*WILD_CARD_PATH),
9601 _ => false,
9602 };
9603 self.configuration.wild_card_budgeting = wild_card_budget;
9604 }
9605 if self.configuration.depth > 0 && self.domain_parsed.is_some() {
9606 if let Some(domain) = &self.domain_parsed {
9607 if let Some(segments) = domain.path_segments() {
9608 let segments_cnt = segments.count();
9609
9610 if segments_cnt > self.configuration.depth {
9611 self.configuration.depth_distance = self.configuration.depth
9612 + self.configuration.depth.abs_diff(segments_cnt);
9613 } else {
9614 self.configuration.depth_distance = self.configuration.depth;
9615 }
9616 }
9617 }
9618 }
9619 }
9620
    /// Subscription stub when the `sync` feature is disabled: always `None`.
    #[cfg(not(feature = "sync"))]
    pub fn subscribe(&mut self, _capacity: usize) -> Option<broadcast::Receiver<Page>> {
        None
    }

    /// Subscribe to pages as they are crawled.
    ///
    /// Lazily creates the broadcast channel on first call; `capacity == 0`
    /// falls back to `DEFAULT_PERMITS`, and the result is clamped to at least
    /// 1 since `broadcast::channel` panics on a zero capacity.
    #[cfg(feature = "sync")]
    pub fn subscribe(&mut self, capacity: usize) -> Option<broadcast::Receiver<Page>> {
        let channel = self.channel.get_or_insert_with(|| {
            let (tx, rx) = broadcast::channel(
                (if capacity == 0 {
                    *DEFAULT_PERMITS
                } else {
                    capacity
                })
                .max(1),
            );
            (tx, Arc::new(rx))
        });

        // Each call hands out a fresh receiver on the shared sender.
        let rx2 = channel.0.subscribe();

        Some(rx2)
    }
9700
9701 #[cfg(feature = "sync")]
9703 pub fn queue(&mut self, capacity: usize) -> Option<broadcast::Sender<String>> {
9704 let channel = self.channel_queue.get_or_insert_with(|| {
9705 let (tx, rx) = broadcast::channel(capacity);
9706 (tx, Arc::new(rx))
9707 });
9708
9709 Some(channel.0.to_owned())
9710 }
9711
    /// Queue stub when the `sync` feature is disabled: always `None`.
    // NOTE(review): this stub's return type (Page channel pair) differs from
    // the sync variant's `Sender<String>` — confirm the asymmetry is intended.
    #[cfg(not(feature = "sync"))]
    pub fn queue(
        &mut self,
        _capacity: usize,
    ) -> Option<Arc<(broadcast::Sender<Page>, broadcast::Receiver<Page>)>> {
        None
    }

    /// Unsubscribe stub when the `sync` feature is disabled.
    #[cfg(not(feature = "sync"))]
    pub fn unsubscribe(&mut self) {}

    /// Drop the page broadcast channel, ending all active subscriptions.
    #[cfg(feature = "sync")]
    pub fn unsubscribe(&mut self) {
        self.channel.take();
    }
9730
    /// The page broadcast channel, if a subscription has been created.
    pub fn get_channel(
        &self,
    ) -> &Option<(broadcast::Sender<Page>, Arc<broadcast::Receiver<Page>>)> {
        &self.channel
    }

    /// The channel guard, if one has been installed via `subscribe_guard`.
    pub fn get_channel_guard(&self) -> &Option<ChannelGuard> {
        &self.channel_guard
    }
9742
    /// Guard stub when the `sync` feature is disabled: always `None`.
    #[cfg(not(feature = "sync"))]
    pub fn subscribe_guard(&mut self) -> Option<ChannelGuard> {
        None
    }

    /// Get (lazily creating) the channel guard used to pace page delivery.
    #[cfg(feature = "sync")]
    pub fn subscribe_guard(&mut self) -> Option<ChannelGuard> {
        let channel_guard = self.channel_guard.get_or_insert_with(ChannelGuard::new);
        Some(channel_guard.clone())
    }
9826
    /// Start the configured cron runner with this website as its job.
    /// The website is cloned into the runner, so later mutations to `self`
    /// do not affect scheduled runs.
    #[cfg(feature = "cron")]
    pub async fn run_cron(&self) -> Runner {
        async_job::Runner::new()
            .add(Box::new(self.clone()))
            .run()
            .await
    }

    /// Crawl-id stub when the `control` feature is disabled: always `None`.
    #[cfg(not(feature = "control"))]
    pub fn get_crawl_id(&self) -> Option<&Box<String>> {
        None
    }

    /// The crawl id, or `None` when unset (empty).
    #[cfg(feature = "control")]
    pub fn get_crawl_id(&self) -> Option<&Box<String>> {
        if self.crawl_id.is_empty() {
            None
        } else {
            Some(&self.crawl_id)
        }
    }
9851
    /// Attach arbitrary extra information to the website.
    #[cfg(feature = "extra_information")]
    pub fn set_extra_info(&mut self, info: Option<String>) {
        self.extra_info = info.map(|f| f.into());
    }

    /// The attached extra information, if any.
    #[cfg(feature = "extra_information")]
    pub fn get_extra_info(&self) -> Option<&Box<String>> {
        self.extra_info.as_ref()
    }

    /// Seed the crawl with pre-fetched HTML instead of fetching the start URL.
    pub fn set_seeded_html(&mut self, html: Option<String>) {
        self.seed_html = html;
    }

    /// The seeded HTML, if any was set.
    pub fn get_seeded_html(&self) -> &Option<String> {
        &self.seed_html
    }
9873
    /// Apply a prompt-derived configuration onto this website, overriding only
    /// the options that are `Some` in `config`; everything else is untouched.
    ///
    /// Returns `&mut Self` so calls can be chained.
    #[cfg(feature = "serde")]
    pub fn apply_prompt_configuration(
        &mut self,
        config: &crate::features::automation::PromptConfiguration,
    ) -> &mut Self {
        // Crawl-scope toggles.
        if let Some(v) = config.respect_robots_txt {
            self.configuration.respect_robots_txt = v;
        }
        if let Some(v) = config.subdomains {
            self.configuration.subdomains = v;
        }
        if let Some(v) = config.tld {
            self.configuration.tld = v;
        }
        if let Some(v) = config.depth {
            self.configuration.depth = v;
        }
        if let Some(v) = config.delay {
            self.configuration.delay = v;
        }
        // Timeouts arrive as milliseconds.
        if let Some(ms) = config.request_timeout_ms {
            self.configuration.request_timeout =
                Some(Box::new(std::time::Duration::from_millis(ms)));
        }
        if let Some(ms) = config.crawl_timeout_ms {
            self.configuration.crawl_timeout = Some(std::time::Duration::from_millis(ms));
        }

        // URL deny/allow lists.
        if let Some(ref urls) = config.blacklist_url {
            self.configuration.blacklist_url =
                Some(urls.iter().map(|s| s.as_str().into()).collect());
        }
        if let Some(ref urls) = config.whitelist_url {
            self.configuration.whitelist_url =
                Some(urls.iter().map(|s| s.as_str().into()).collect());
        }
        // External domains are tracked case-insensitively.
        if let Some(ref domains) = config.external_domains {
            for domain in domains {
                self.configuration
                    .external_domains_caseless
                    .insert(case_insensitive_string::CaseInsensitiveString::new(domain));
            }
        }

        // HTTP client options.
        if let Some(ref ua) = config.user_agent {
            self.configuration.user_agent = Some(Box::new(ua.as_str().into()));
        }
        if let Some(v) = config.http2_prior_knowledge {
            self.configuration.http2_prior_knowledge = v;
        }
        if let Some(v) = config.accept_invalid_certs {
            self.configuration.accept_invalid_certs = v;
        }

        if let Some(v) = config.redirect_limit {
            self.configuration.redirect_limit = Box::new(v);
        }
        // Per-path crawl budget, keyed case-insensitively.
        if let Some(ref budget_map) = config.budget {
            let mut budget = hashbrown::HashMap::new();
            for (k, v) in budget_map {
                budget.insert(case_insensitive_string::CaseInsensitiveString::new(k), *v);
            }
            self.configuration.budget = Some(budget);
        }
        if let Some(v) = config.max_page_bytes {
            self.configuration.max_page_bytes = Some(v);
        }

        // Page-handling behavior.
        if let Some(v) = config.full_resources {
            self.configuration.full_resources = v;
        }
        if let Some(v) = config.only_html {
            self.configuration.only_html = v;
        }
        if let Some(v) = config.return_page_links {
            self.configuration.return_page_links = v;
        }

        // NOTE(review): intentionally empty today — `use_chrome` maps to no
        // configuration field here; confirm whether it should toggle anything.
        #[cfg(feature = "chrome")]
        if let Some(true) = config.use_chrome {
        }
        if let Some(ref mode) = config.stealth_mode {
            // Unrecognized values fall back to no stealth.
            self.configuration.stealth_mode = match mode.to_lowercase().as_str() {
                "basic" => spider_fingerprint::configs::Tier::Basic,
                "low" => spider_fingerprint::configs::Tier::Low,
                "mid" => spider_fingerprint::configs::Tier::Mid,
                "full" => spider_fingerprint::configs::Tier::Full,
                _ => spider_fingerprint::configs::Tier::None,
            };
        }
        // A partially-specified viewport is completed with 800x600 defaults.
        if config.viewport_width.is_some() || config.viewport_height.is_some() {
            let width = config.viewport_width.unwrap_or(800);
            let height = config.viewport_height.unwrap_or(600);
            self.configuration.viewport = Some(crate::configuration::Viewport::new(width, height));
        }
        // Chrome wait conditions: only persisted when at least one was set.
        #[cfg(feature = "chrome")]
        {
            let mut wait_for = self.configuration.wait_for.take().unwrap_or_default();

            if let Some(true) = config.wait_for_idle_network {
                wait_for.idle_network =
                    Some(crate::features::chrome_common::WaitForIdleNetwork::new(
                        Some(std::time::Duration::from_secs(30)),
                    ));
            }
            if let Some(ms) = config.wait_for_delay_ms {
                wait_for.delay = Some(crate::features::chrome_common::WaitForDelay::new(Some(
                    std::time::Duration::from_millis(ms),
                )));
            }
            if let Some(ref selector) = config.wait_for_selector {
                wait_for.selector = Some(crate::features::chrome_common::WaitForSelector::new(
                    Some(std::time::Duration::from_secs(30)),
                    selector.clone(),
                ));
            }

            if wait_for.idle_network.is_some()
                || wait_for.delay.is_some()
                || wait_for.selector.is_some()
            {
                self.configuration.wait_for = Some(wait_for);
            }
        }
        #[cfg(feature = "chrome")]
        if let Some(ref js) = config.evaluate_on_new_document {
            self.configuration.evaluate_on_new_document = Some(Box::new(js.clone()));
        }

        // Runtime behavior.
        if let Some(v) = config.shared_queue {
            self.configuration.shared_queue = v;
        }
        if let Some(v) = config.retry {
            self.configuration.retry = v;
        }

        self
    }
10040
10041 #[cfg(all(feature = "agent", feature = "serde"))]
10066 pub async fn configure_from_prompt(
10067 &mut self,
10068 api_url: &str,
10069 model_name: &str,
10070 api_key: Option<&str>,
10071 prompt: &str,
10072 ) -> Result<&mut Self, crate::features::automation::EngineError> {
10073 let config = crate::features::automation::configure_crawler_from_prompt(
10074 api_url, model_name, api_key, prompt,
10075 )
10076 .await?;
10077 Ok(self.apply_prompt_configuration(&config))
10078 }
10079}
10080
10081pub fn channel_send_page(
10083 channel: &Option<(
10084 tokio::sync::broadcast::Sender<Page>,
10085 std::sync::Arc<tokio::sync::broadcast::Receiver<Page>>,
10086 )>,
10087 page: Page,
10088 channel_guard: &Option<ChannelGuard>,
10089) {
10090 if let Some(c) = channel {
10091 if c.0.send(page).is_ok() {
10092 if let Some(guard) = channel_guard {
10093 ChannelGuard::inc_guard(&guard.0 .1)
10094 }
10095 }
10096 }
10097}
10098
/// Shared guard state for the page subscription channel:
/// `(active flag, sent counter, processed counter)`.
// NOTE(review): counter roles inferred from `lock`/`inc`/`inc_guard` usage —
// confirm against subscriber call sites.
#[derive(Debug, Clone)]
pub struct ChannelGuard(Arc<(AtomicBool, AtomicUsize, AtomicUsize)>);
10102
impl ChannelGuard {
    /// Create a new guard: active, with both counters at zero.
    #[cfg(feature = "sync")]
    pub(crate) fn new() -> ChannelGuard {
        ChannelGuard(Arc::new((
            AtomicBool::new(true),
            AtomicUsize::new(0),
            AtomicUsize::new(0),
        )))
    }
    /// If the guard is active, snapshot the sent counter (slot 1) and spin —
    /// yielding to the runtime each iteration — until the processed counter
    /// (slot 2) reaches that snapshot, atomically resetting it to zero.
    /// The trailing fence orders subsequent reads after the successful CAS.
    // NOTE(review): `compare_exchange_weak` may fail spuriously; that only
    // causes an extra yield/retry here, which is benign.
    pub(crate) async fn lock(&self) {
        if self.0 .0.load(Ordering::Relaxed) {
            let old = self.0 .1.load(Ordering::Relaxed);

            while self
                .0
                .2
                .compare_exchange_weak(old, 0, Ordering::Acquire, Ordering::Relaxed)
                .is_err()
            {
                tokio::task::yield_now().await;
            }
            std::sync::atomic::fence(Ordering::Acquire);
        }
    }

    /// Enable or disable the guard's active flag (slot 0).
    pub fn guard(&mut self, guard: bool) {
        self.0 .0.store(guard, Ordering::Release);
    }

    /// Increment the processed counter (slot 2).
    pub fn inc(&mut self) {
        self.0 .2.fetch_add(1, std::sync::atomic::Ordering::Release);
    }

    /// Increment a raw counter — used by `channel_send_page` for the sent
    /// counter (slot 1) after a successful broadcast.
    pub(crate) fn inc_guard(guard: &AtomicUsize) {
        guard.fetch_add(1, std::sync::atomic::Ordering::Release);
    }
}
10146
impl Drop for ChannelGuard {
    /// Deactivate the guard so future `lock` calls skip waiting.
    fn drop(&mut self) {
        self.0 .0.store(false, Ordering::Release);
    }
}
10152
/// Launch a cron runner that takes ownership of `website` and executes it on
/// its configured schedule.
#[cfg(feature = "cron")]
pub async fn run_cron(website: Website) -> Runner {
    let runner = async_job::Runner::new();
    runner.add(Box::new(website)).run().await
}
10158
#[cfg(feature = "cron")]
#[async_trait]
impl Job for Website {
    /// Parse the configured cron expression, logging and yielding `None` on
    /// failure.
    fn schedule(&self) -> Option<async_job::Schedule> {
        self.configuration
            .cron_str
            .parse()
            .map_err(|e| log::error!("{:?}", e))
            .ok()
    }

    /// Run one scheduled pass: crawl or scrape depending on the cron type.
    async fn handle(&mut self) {
        log::info!(
            "CRON: {} - cron job running {}",
            self.get_url().as_ref(),
            self.now()
        );
        match self.configuration.cron_type {
            CronType::Crawl => self.crawl().await,
            _ => self.scrape().await,
        }
    }
}
10184
10185impl std::fmt::Display for Website {
10186 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
10187 write!(
10188 f,
10189 "Website:\n URL: {}\n ID: {:?}\n Configuration: {:?}",
10190 self.get_url(),
10191 self.get_crawl_id(),
10192 self.configuration
10193 )
10194 }
10195}
10196
/// Marker impl so `Website` can flow through `Box<dyn std::error::Error>`
/// APIs; `Display`/`Debug` above supply the message.
impl std::error::Error for Website {}
10198
10199#[tokio::test]
10200#[cfg(not(feature = "decentralized"))]
10201async fn crawl() {
10202 let url = "https://choosealicense.com";
10203 let mut website: Website = Website::new(url);
10204 website.crawl().await;
10205 assert!(
10206 website
10207 .links_visited
10208 .contains(&"https://choosealicense.com/licenses/".into()),
10209 "{:?}",
10210 website.links_visited
10211 );
10212}
10213
/// Cron-driven crawling via the website's own runner should broadcast pages.
#[tokio::test]
#[cfg(feature = "cron")]
async fn crawl_cron() {
    let target = "https://choosealicense.com";
    let mut website: Website = Website::new(target)
        .with_cron("1/5 * * * * *", Default::default())
        .build()
        .unwrap();
    let mut receiver = website.subscribe(16).unwrap();

    // Collect every page announced over the subscription channel.
    let listener = tokio::spawn(async move {
        let mut seen = HashSet::new();
        while let Ok(page) = receiver.recv().await {
            seen.insert(CaseInsensitiveString::new(page.get_url()));
        }
        assert!(
            seen.contains(&CaseInsensitiveString::from(
                "https://choosealicense.com/licenses/"
            )),
            "{:?}",
            seen
        );
    });

    let mut runner = website.run_cron().await;
    log::debug!("Starting the Runner for 10 seconds");
    tokio::time::sleep(Duration::from_secs(10)).await;
    runner.stop().await;
    listener.abort();
    let _ = listener.await;
}
10247
/// Cron-driven crawling through the free `run_cron` helper, which takes
/// ownership of the website.
#[tokio::test]
#[cfg(feature = "cron")]
async fn crawl_cron_own() {
    let target = "https://choosealicense.com";
    let mut website: Website = Website::new(target)
        .with_cron("1/5 * * * * *", Default::default())
        .build()
        .unwrap();
    let mut receiver = website.subscribe(16).unwrap();

    // Collect every page announced over the subscription channel.
    let listener = tokio::spawn(async move {
        let mut seen = HashSet::new();
        while let Ok(page) = receiver.recv().await {
            seen.insert(CaseInsensitiveString::new(page.get_url()));
        }
        assert!(
            seen.contains(&CaseInsensitiveString::from(
                "https://choosealicense.com/licenses/"
            )),
            "{:?}",
            seen
        );
    });

    let mut runner = run_cron(website).await;
    log::debug!("Starting the Runner for 10 seconds");
    tokio::time::sleep(Duration::from_secs(10)).await;
    let _ = tokio::join!(runner.stop(), listener);
}
10279
10280#[tokio::test]
10281#[cfg(not(feature = "decentralized"))]
10282async fn scrape() {
10283 let mut website: Website = Website::new("https://choosealicense.com");
10284 website.scrape().await;
10285 assert!(
10286 website
10287 .links_visited
10288 .contains(&"https://choosealicense.com/licenses/".into()),
10289 "{:?}",
10290 website.links_visited
10291 );
10292
10293 assert!(!website.get_pages().unwrap()[0].get_html().is_empty());
10294}
10295
10296#[tokio::test]
10297#[cfg(not(feature = "decentralized"))]
10298async fn crawl_invalid() {
10299 let mut website: Website = Website::new("https://w.com");
10300 website.crawl().await;
10301 assert!(website.links_visited.len() <= 1); }
10303
/// Decentralized mode still records the root (and possibly the sitemap) for an
/// unreachable host.
#[tokio::test]
#[cfg(feature = "decentralized")]
async fn crawl_invalid() {
    let domain = "https://w.com";
    let mut website: Website = Website::new(domain);
    website.crawl().await;

    let visited = website.links_visited.get_links();
    let root = CaseInsensitiveString::from(format!("{}/", domain));
    assert!(visited.contains(&root), "{:?}", visited);

    #[cfg(feature = "sitemap")]
    {
        // With sitemaps enabled the sitemap URL may also have been attempted.
        let sitemap = CaseInsensitiveString::from(format!("{}/sitemap.xml", domain));
        assert!(visited.len() <= 2, "{:?}", visited);
        if visited.len() == 2 {
            assert!(visited.contains(&sitemap), "{:?}", visited);
        }
    }

    #[cfg(not(feature = "sitemap"))]
    {
        assert_eq!(visited.len(), 1, "{:?}", visited);
    }
}
10329
10330#[tokio::test]
10331async fn not_crawl_blacklist() {
10332 let mut website: Website = Website::new("https://choosealicense.com");
10333 website.configuration.blacklist_url = Some(Vec::from([CompactString::from(
10334 "https://choosealicense.com/licenses/",
10335 )]));
10336
10337 website.crawl().await;
10338 assert!(
10339 !website
10340 .links_visited
10341 .contains(&"https://choosealicense.com/licenses/".into()),
10342 "{:?}",
10343 website.links_visited
10344 );
10345}
10346
/// A regex blacklist matching the whole domain should block every page.
#[tokio::test]
#[cfg(feature = "regex")]
async fn not_crawl_blacklist_regex() {
    let mut website = Website::new("https://choosealicense.com");
    website.with_blacklist_url(Some(Vec::from(["choosealicense.com".into()])));
    website.crawl().await;

    assert_eq!(website.links_visited.len(), 0);
}
10355
/// The generated user agent must never be empty.
#[test]
#[cfg(feature = "ua_generator")]
fn randomize_website_agent() {
    let agent = get_ua(false);
    assert!(!agent.is_empty());
}
10361
10362#[tokio::test]
10363#[cfg(not(feature = "decentralized"))]
10364async fn test_respect_robots_txt() {
10365 let mut website: Website = Website::new("https://stackoverflow.com");
10366 website.configuration.respect_robots_txt = true;
10367 website.configuration.user_agent = Some(Box::new("*".into()));
10368
10369 let (client, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
10370 website.setup().await;
10371
10372 website.configure_robots_parser(&client).await;
10373
10374 assert_eq!(website.configuration.delay, 0);
10375
10376 assert!(!&website
10377 .is_allowed(&"https://stackoverflow.com/posts/".into())
10378 .eq(&ProcessLinkStatus::Allowed));
10379
10380 let mut website_second: Website = Website::new("https://www.mongodb.com");
10382 website_second.configuration.respect_robots_txt = true;
10383 website_second.configuration.user_agent = Some(Box::new("bingbot".into()));
10384
10385 let (client_second, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
10386 website_second.setup().await;
10387 website_second.configure_robots_parser(&client_second).await;
10388
10389 assert!(!&website
10390 .is_allowed(&"https://www.mongodb.com/community/forums/auth/".into())
10391 .eq(&ProcessLinkStatus::Allowed));
10392
10393 }
10395
10396#[tokio::test]
10397#[cfg(not(feature = "decentralized"))]
10398#[ignore]
10399async fn test_crawl_subdomains() {
10400 let mut website: Website = Website::new("https://choosealicense.com");
10401 website.configuration.subdomains = true;
10402 website.crawl().await;
10403 assert!(
10404 website
10405 .links_visited
10406 .contains(&"https://choosealicense.com/licenses/".into()),
10407 "{:?}",
10408 website.links_visited
10409 );
10410}
10411
10412#[tokio::test]
10413#[cfg(all(
10414 not(feature = "regex"),
10415 not(feature = "openai"),
10416 not(feature = "gemini")
10417))]
10418async fn test_with_configuration() {
10419 let mut website = Website::new("https://choosealicense.com");
10420
10421 website
10422 .with_respect_robots_txt(true)
10423 .with_subdomains(true)
10424 .with_tld(false)
10425 .with_delay(0)
10426 .with_request_timeout(None)
10427 .with_http2_prior_knowledge(false)
10428 .with_user_agent(Some(crate::page::TEST_AGENT_NAME))
10429 .with_headers(None)
10430 .with_proxies(None);
10431
10432 let mut configuration = Box::new(configuration::Configuration::new());
10433
10434 configuration.respect_robots_txt = true;
10435 configuration.subdomains = true;
10436 configuration.tld = false;
10437 configuration.delay = 0;
10438 configuration.request_timeout = None;
10439 configuration.http2_prior_knowledge = false;
10440 configuration.user_agent = Some(Box::new(CompactString::new(crate::page::TEST_AGENT_NAME)));
10441 configuration.headers = None;
10442 configuration.proxies = None;
10443
10444 assert!(
10445 website.configuration == configuration,
10446 "Left\n{:?}\n\nRight\n{:?}",
10447 website.configuration,
10448 configuration
10449 );
10450}
10451
/// Glob URL expansion should reach the licenses index over http or https.
#[tokio::test]
#[cfg(all(feature = "glob", not(feature = "decentralized")))]
async fn test_crawl_glob() {
    let mut website =
        Website::new("https://choosealicense.com/licenses/{mit,apache-2.0,mpl-2.0}/");
    website.crawl().await;

    let visited = &website.links_visited;
    let https_hit = visited.contains(&"https://choosealicense.com/licenses/".into());
    let http_hit = visited.contains(&"http://choosealicense.com/licenses/".into());
    assert!(https_hit || http_hit, "{:?}", visited);
}
10471
10472#[tokio::test]
10473#[ignore]
10474#[cfg(not(feature = "decentralized"))]
10475async fn test_crawl_tld() {
10476 let mut website: Website = Website::new("https://choosealicense.com");
10477 website.configuration.tld = true;
10478 website.with_limit(10);
10479 website.crawl().await;
10480
10481 assert!(
10482 website.links_visited.len() > 1,
10483 "expected more than 1 link visited with tld enabled, got {:?}",
10484 website.links_visited
10485 );
10486}
10487
/// Every visited link should be broadcast exactly once to subscribers.
#[tokio::test]
#[cfg(all(feature = "sync", not(feature = "decentralized")))]
async fn test_crawl_subscription() {
    let mut website = Website::new("https://choosealicense.com");
    let mut receiver = website.subscribe(100).unwrap();

    // Count every page announced over the channel until it closes.
    let counter = tokio::spawn(async move {
        let mut received = 0;
        while receiver.recv().await.is_ok() {
            received += 1;
        }
        received
    });

    website.crawl().await;
    website.unsubscribe();
    let visited = website.get_links().len();
    let received = counter.await.unwrap();

    assert!(received == visited, "{:?}", true);
}
10511
/// Crawling through a local socks5 proxy should still reach a licenses page.
#[tokio::test]
#[cfg(all(feature = "socks", not(feature = "decentralized")))]
async fn test_crawl_proxy() {
    let mut website = Website::new("https://choosealicense.com");
    website
        .configuration
        .proxies
        .get_or_insert(Default::default())
        .push("socks5://127.0.0.1:1080".into());

    website.crawl().await;

    // Look for any licenses page among the visited links.
    let mut license_found = false;
    for link in website.get_links() {
        if link.as_ref().contains("/licenses/") {
            license_found = true;
            break;
        }
    }

    assert!(license_found, "{:?}", website.links_visited);
}
10536
10537#[tokio::test]
10538async fn test_link_duplicates() {
10539 fn has_unique_elements<T>(iter: T) -> bool
10540 where
10541 T: IntoIterator,
10542 T::Item: Eq + std::hash::Hash,
10543 {
10544 let mut uniq = HashSet::new();
10545 iter.into_iter().all(move |x| uniq.insert(x))
10546 }
10547
10548 let mut website: Website = Website::new("http://0.0.0.0:8000");
10549 website.crawl().await;
10550
10551 assert!(has_unique_elements(website.links_visited.get_links()));
10552}
10553
10554#[tokio::test]
10555async fn test_crawl_budget() {
10556 let mut website: Website = Website::new("https://choosealicense.com");
10557 website.with_budget(Some(HashMap::from([("*", 1), ("/licenses", 1)])));
10558 website.crawl().await;
10559
10560 assert!(website.links_visited.len() <= 1);
10561}
10562
/// Pausing for five seconds must delay the crawl without losing pages.
#[tokio::test]
#[cfg(feature = "control")]
#[ignore]
async fn test_crawl_pause_resume() {
    use crate::utils::{pause, resume};

    let domain = "https://choosealicense.com/";
    let mut website: Website = Website::new(domain);

    let start = tokio::time::Instant::now();

    // Pause immediately, then resume after five seconds.
    tokio::spawn(async move {
        pause(domain).await;
        tokio::time::sleep(Duration::from_millis(5000)).await;
        resume(domain).await;
    });

    website.crawl().await;

    let elapsed = start.elapsed();
    assert!(elapsed.as_secs() >= 5, "{:?}", elapsed);

    let visited = &website.links_visited;
    assert!(
        visited.contains(&"https://choosealicense.com/licenses/".into()),
        "{:?}",
        visited
    );
}
10595
/// An immediate shutdown request should stop the crawl almost at once.
#[cfg(feature = "control")]
#[ignore]
#[tokio::test]
async fn test_crawl_shutdown() {
    use crate::utils::shutdown;

    let domain = "https://spider.cloud/";
    let mut website: Website = Website::new(domain);

    // Request shutdown right away so the crawl ends early.
    tokio::spawn(async move {
        shutdown(domain).await;
    });

    website.crawl().await;

    let visited_count = website.links_visited.len();
    assert!(visited_count <= 1, "{:?}", visited_count);
}
10615
/// A second crawl of a cached site should be at least 5x faster than the first.
#[tokio::test]
#[cfg(all(feature = "cache_request", not(feature = "decentralized")))]
async fn test_cache() {
    let domain = "https://choosealicense.com/";
    let mut website: Website = Website::new(&domain);
    website.configuration.cache = true;

    // First pass hits the network and fills the cache.
    let fresh_start = tokio::time::Instant::now();
    website.crawl().await;
    let fresh_duration = fresh_start.elapsed();

    // Second pass should be served from the cache.
    let cached_start = tokio::time::Instant::now();
    website.crawl().await;
    let cached_duration = cached_start.elapsed();

    assert!(
        fresh_duration.as_millis() > cached_duration.as_millis() * 5,
        "{:?}",
        cached_duration
    );
}
10638
/// Smart crawl with `cache_skip_browser` should serve a pre-seeded cache entry
/// without touching the (unreachable) network target.
#[tokio::test]
#[cfg(all(
    not(feature = "decentralized"),
    feature = "smart",
    feature = "cache_chrome_hybrid"
))]
async fn test_crawl_smart_uses_seeded_cache_with_skip_browser() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/cache-smart-test";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed the hybrid cache with a cacheable HTML response for the target.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><head><title>Cached Smart Test</title></head><body>cached</body></html>"
            .to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid cache test url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    let start = tokio::time::Instant::now();
    website.crawl_smart().await;
    let elapsed = start.elapsed();

    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
    assert!(!website.initial_page_should_retry);
    assert!(
        website.links_visited.contains(&target_url.into()),
        "expected smart crawl to visit the cached target"
    );

    eprintln!("crawl_smart cached latency: {}ms", elapsed.as_millis());
}
10692
/// A single-page crawl with a seeded cache entry should short-circuit: the
/// cached HTML is delivered quickly via the subscription channel.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_shortcircuit_single_page() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/shortcircuit-test";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed a cacheable HTML page for the target.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><head><title>Shortcircuit</title></head><body><h1>Cached!</h1></body></html>"
            .to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    let mut receiver = website.subscribe(4).unwrap();
    let first_page = tokio::spawn(async move { receiver.recv().await.ok() });

    let start = tokio::time::Instant::now();
    website.crawl().await;
    let elapsed = start.elapsed();

    let page = first_page.await.unwrap().expect("page received via channel");
    assert!(
        page.get_html().contains("Cached!"),
        "expected cached HTML content"
    );
    assert_eq!(page.status_code, StatusCode::OK);
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
    assert!(
        elapsed.as_millis() < 2000,
        "shortcircuit too slow: {elapsed:?}"
    );
    eprintln!(
        "shortcircuit single_page latency: {}ms",
        elapsed.as_millis()
    );
}
10754
/// With nothing seeded for the URL, the crawl must fall through to the
/// (unreachable) network without panicking.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_shortcircuit_miss_falls_through() {
    let mut website = Website::new("http://localhost:9/uncached-shortcircuit");
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    let single_page = HashMap::from([("*", 1)]);
    website.with_budget(Some(single_page));

    website.crawl_raw().await;
}
10766
/// With `skip_browser` disabled the cache-skip option must report false even
/// when a matching entry is seeded.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_shortcircuit_not_without_skip_browser() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/no-skip-shortcircuit";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed a cacheable response even though skip_browser stays disabled.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><body>No Skip</body></html>".to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(false);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    website.configuration.configure_budget();
    assert!(
        !crate::utils::cache_skip_browser(&website.configuration.get_cache_options()),
        "cache_skip_browser should be false when skip_browser is disabled"
    );

    website.crawl_raw().await;
}
10811
/// A multi-page budget must not take the single-page short-circuit path even
/// when the root is cached.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_shortcircuit_not_for_multi_page() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/multi-page-shortcircuit";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed a cacheable response for the root page.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><body>Multi Page</body></html>".to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 5)])));

    website.crawl_raw().await;
}
10850
/// `crawl_smart` should also short-circuit to a seeded cache entry and deliver
/// the cached HTML quickly over the subscription channel.
#[tokio::test]
#[cfg(all(
    not(feature = "decentralized"),
    feature = "smart",
    feature = "cache_chrome_hybrid"
))]
async fn test_cache_shortcircuit_crawl_smart() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/smart-shortcircuit-test";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed the hybrid cache with a cacheable HTML response.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><head><title>Smart Shortcircuit</title></head><body>Smart Cached</body></html>"
            .to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    let mut receiver = website.subscribe(4).unwrap();
    let first_page = tokio::spawn(async move { receiver.recv().await.ok() });

    let start = tokio::time::Instant::now();
    website.crawl_smart().await;
    let elapsed = start.elapsed();

    let page = first_page.await.unwrap().expect("page received");
    assert!(
        page.get_html().contains("Smart Cached"),
        "expected cached HTML in crawl_smart"
    );
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
    assert!(
        elapsed.as_millis() < 2000,
        "crawl_smart shortcircuit too slow: {elapsed:?}"
    );
    eprintln!(
        "crawl_smart shortcircuit latency: {}ms",
        elapsed.as_millis()
    );
}
10914
#[cfg(test)]
mod tests {

    /// The rotator hands out clients round-robin and keeps a running index.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_client_rotator_round_robin() {
        let clients: Vec<crate::Client> = (0..3)
            .map(|_| {
                #[cfg(not(feature = "cache_request"))]
                {
                    unsafe { crate::ClientBuilder::new().build().unwrap_unchecked() }
                }
                #[cfg(feature = "cache_request")]
                {
                    reqwest_middleware::ClientBuilder::new(unsafe {
                        reqwest::ClientBuilder::new().build().unwrap_unchecked()
                    })
                    .build()
                }
            })
            .collect();

        let rotator = crate::website::ClientRotator::new(clients);
        assert_eq!(rotator.len(), 3);
        assert!(!rotator.is_empty());

        // Four pulls from a three-client pool advance the index to four.
        for _ in 0..4 {
            let _ = rotator.next();
        }
        let current_idx = rotator.index.load(crate::website::Ordering::Relaxed);
        assert_eq!(current_idx, 4);
    }

    /// Three proxies are enough to build a rotating client set.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_with_multiple_proxies() {
        let mut website = crate::website::Website::new("http://example.com");
        let proxies = vec![
            "http://proxy1.example.com:8080".to_string(),
            "http://proxy2.example.com:8080".to_string(),
            "http://proxy3.example.com:8080".to_string(),
        ];
        website.configuration.with_proxies(Some(proxies));

        let rotator = website.build_rotated_clients();
        assert!(rotator.is_some(), "Should build rotator with 3 proxies");
        assert_eq!(rotator.unwrap().len(), 3);
    }

    /// A single proxy does not warrant rotation.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_single_proxy_returns_none() {
        let mut website = crate::website::Website::new("http://example.com");
        let proxies = vec!["http://proxy1.example.com:8080".to_string()];
        website.configuration.with_proxies(Some(proxies));

        assert!(
            website.build_rotated_clients().is_none(),
            "Should not build rotator with only 1 proxy"
        );
    }

    /// No proxies configured means no rotator at all.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_no_proxies_returns_none() {
        let website = crate::website::Website::new("http://example.com");
        assert!(
            website.build_rotated_clients().is_none(),
            "Should not build rotator with no proxies"
        );
    }
}
10996
/// All three pages (root plus two children) are pre-seeded into the hybrid
/// cache; a raw crawl with the browser skip enabled should serve every page
/// from cache and record the root's status code and HTML length.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_multi_page_all_cached() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-root";
    let sub1_url = "http://localhost:9/cache-phase-sub1";
    let sub2_url = "http://localhost:9/cache-phase-sub2";

    // Root links to both children so the crawl discovers them.
    let root_html = format!(
        "<html><head><title>Root</title></head><body>\
         <a href=\"{}\">Sub1</a>\
         <a href=\"{}\">Sub2</a>\
         </body></html>",
        sub1_url, sub2_url
    );
    let sub1_html =
        "<html><head><title>Sub1</title></head><body><h1>Sub1 Content</h1></body></html>";
    let sub2_html =
        "<html><head><title>Sub2</title></head><body><h1>Sub2 Content</h1></body></html>";

    let request_headers = StdHashMap::new();
    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed every page into the hybrid cache before crawling.
    for (url, html) in [
        (root_url, root_html.as_str()),
        (sub1_url, sub1_html),
        (sub2_url, sub2_html),
    ] {
        put_hybrid_cache(
            &create_cache_key_raw(url, None, None),
            HttpResponse {
                body: html.as_bytes().to_vec(),
                headers: response_headers.clone(),
                status: 200,
                url: Url::parse(url).expect("valid url"),
                version: HttpVersion::Http11,
            },
            "GET",
            request_headers.clone(),
        )
        .await;
    }

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 10)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    // Drain everything that was broadcast during the crawl.
    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    for (url, msg) in [
        (root_url, "root page should be served from cache"),
        (sub1_url, "sub1 page should be served from cache"),
        (sub2_url, "sub2 page should be served from cache"),
    ] {
        assert!(received.contains(&url.to_string()), "{}", msg);
    }
    assert_eq!(received.len(), 3, "exactly 3 pages expected");
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
}
11080
/// Only the root is cached; its child link points at an uncached URL. The
/// crawl should still serve the cached root and record its status code.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_partial_miss() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-partial-root";
    let sub_url = "http://localhost:9/cache-phase-partial-sub";

    let root_html = format!(
        "<html><head><title>Root</title></head><body>\
         <a href=\"{}\">Sub</a></body></html>",
        sub_url
    );

    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed only the root page; the child is a deliberate cache miss.
    put_hybrid_cache(
        &create_cache_key_raw(root_url, None, None),
        HttpResponse {
            body: root_html.as_bytes().to_vec(),
            headers: response_headers,
            status: 200,
            url: Url::parse(root_url).expect("valid url"),
            version: HttpVersion::Http11,
        },
        "GET",
        StdHashMap::new(),
    )
    .await;

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 10)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    assert!(
        received.contains(&root_url.to_string()),
        "root page should be served from cache"
    );
    assert_eq!(website.initial_status_code, StatusCode::OK);
}
11144
/// When `cache_skip_browser` is explicitly left off, the cache phase must not
/// engage even though a fresh entry exists for the URL; the crawl should still
/// complete without panicking (smoke check — no post-crawl assertions).
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_skipped_without_skip_browser() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-no-skip";

    let request_headers = StdHashMap::new();
    let response_headers = {
        let mut h = StdHashMap::new();
        h.insert("content-type".to_string(), "text/html".to_string());
        h.insert(
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        );
        h
    };

    // Seed the hybrid cache so a hit would be possible if the cache phase ran.
    let cache_key = create_cache_key_raw(root_url, None, None);
    let http_response = HttpResponse {
        body: b"<html><body>Cached</body></html>".to_vec(),
        headers: response_headers,
        status: 200,
        url: Url::parse(root_url).expect("valid url"),
        version: HttpVersion::Http11,
    };
    put_hybrid_cache(&cache_key, http_response, "GET", request_headers).await;

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    // NOTE: originally these two calls were jammed on one line; split per rustfmt.
    website.with_cache_skip_browser(false);
    website.with_budget(Some(HashMap::from([("*", 5)])));

    website.configuration.configure_budget();
    assert!(
        !crate::utils::cache_skip_browser(&website.configuration.get_cache_options()),
        "cache_skip_browser should be false"
    );

    website.crawl_raw().await;
}
11189
/// With a wildcard budget of 2 and three cached pages, the crawl must stop at
/// the budget while always including the root page.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_respects_budget() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-budget";
    let sub1_url = "http://localhost:9/cache-phase-budget-s1";
    let sub2_url = "http://localhost:9/cache-phase-budget-s2";

    let root_html = format!(
        "<html><body><a href=\"{}\">S1</a><a href=\"{}\">S2</a></body></html>",
        sub1_url, sub2_url
    );
    let sub_html = "<html><body>Sub</body></html>";

    let request_headers = StdHashMap::new();
    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed the root and both children into the hybrid cache.
    for (url, html) in [
        (root_url, root_html.as_str()),
        (sub1_url, sub_html),
        (sub2_url, sub_html),
    ] {
        put_hybrid_cache(
            &create_cache_key_raw(url, None, None),
            HttpResponse {
                body: html.as_bytes().to_vec(),
                headers: response_headers.clone(),
                status: 200,
                url: Url::parse(url).expect("valid url"),
                version: HttpVersion::Http11,
            },
            "GET",
            request_headers.clone(),
        )
        .await;
    }

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 2)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    assert!(
        received.len() <= 2,
        "budget should limit pages to at most 2, got {}",
        received.len()
    );
    assert!(
        received.contains(&root_url.to_string()),
        "root page should always be served"
    );
}
11258
/// Nothing is seeded in the cache for this URL: the initial lookup must miss
/// and the crawl must fall through without panicking (smoke check).
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_initial_miss_falls_through() {
    let mut site = Website::new("http://localhost:9/cache-phase-miss-initial");
    site.configuration.cache = true;
    site.with_cache_skip_browser(true);
    site.with_budget(Some(HashMap::from([("*", 5)])));
    site.crawl_raw().await;
}
11274
/// Two URLs share byte-identical HTML; with `normalize` enabled the content
/// signature dedup should keep duplicates from inflating the page count.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_dedup_signatures() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-dedup";
    let dup_url = "http://localhost:9/cache-phase-dedup-dup";

    let html = "<html><body><a href=\"http://localhost:9/cache-phase-dedup-dup\">Link</a><p>Same Content</p></body></html>";

    let request_headers = StdHashMap::new();
    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed both URLs with the exact same body.
    for url in [root_url, dup_url] {
        put_hybrid_cache(
            &create_cache_key_raw(url, None, None),
            HttpResponse {
                body: html.as_bytes().to_vec(),
                headers: response_headers.clone(),
                status: 200,
                url: Url::parse(url).expect("valid url"),
                version: HttpVersion::Http11,
            },
            "GET",
            request_headers.clone(),
        )
        .await;
    }

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.configuration.normalize = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 10)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    assert!(
        received.contains(&root_url.to_string()),
        "root page should be served"
    );
    assert!(
        received.len() <= 2,
        "signature dedup should limit duplicate content"
    );
}
11336
/// A single cached page should be delivered over the subscription channel and
/// the crawl should finish fast (the cache short-circuits the fetch path).
#[tokio::test]
#[cfg(all(
    not(feature = "decentralized"),
    any(feature = "cache_chrome_hybrid", feature = "cache_chrome_hybrid_mem")
))]
async fn test_cache_shortcircuit_single_page_mem() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/shortcircuit-mem-test";

    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed the single target page into the hybrid cache.
    put_hybrid_cache(
        &create_cache_key_raw(target_url, None, None),
        HttpResponse {
            body: b"<html><head><title>MemCached</title></head><body><h1>In-Memory Cached!</h1></body></html>"
                .to_vec(),
            headers: response_headers,
            status: 200,
            url: Url::parse(target_url).expect("valid url"),
            version: HttpVersion::Http11,
        },
        "GET",
        StdHashMap::new(),
    )
    .await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    // Collect the page on a separate task while the crawl runs.
    let mut rx = website.subscribe(4).unwrap();
    let handle = tokio::spawn(async move { rx.recv().await.ok() });

    let start = tokio::time::Instant::now();
    website.crawl().await;
    let elapsed = start.elapsed();

    let page = handle.await.unwrap().expect("page received via channel");
    assert!(
        page.get_html().contains("In-Memory Cached!"),
        "expected cached HTML content from mem cache"
    );
    assert_eq!(page.status_code, StatusCode::OK);
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
    assert!(
        elapsed.as_millis() < 2000,
        "shortcircuit too slow: {elapsed:?}"
    );
    eprintln!(
        "shortcircuit single_page (mem) latency: {}ms",
        elapsed.as_millis()
    );
}
11402
/// Mem-cache variant of the multi-page test: all three pages are pre-seeded
/// and a raw crawl should serve everything from cache.
#[tokio::test]
#[cfg(all(
    not(feature = "decentralized"),
    any(feature = "cache_chrome_hybrid", feature = "cache_chrome_hybrid_mem")
))]
async fn test_cache_phase_multi_page_all_cached_mem() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-mem-root";
    let sub1_url = "http://localhost:9/cache-phase-mem-sub1";
    let sub2_url = "http://localhost:9/cache-phase-mem-sub2";

    let root_html = format!(
        "<html><head><title>Root</title></head><body>\
         <a href=\"{}\">Sub1</a><a href=\"{}\">Sub2</a></body></html>",
        sub1_url, sub2_url
    );
    let sub1_html =
        "<html><head><title>Sub1</title></head><body><h1>Sub1 Mem Content</h1></body></html>";
    let sub2_html =
        "<html><head><title>Sub2</title></head><body><h1>Sub2 Mem Content</h1></body></html>";

    let request_headers = StdHashMap::new();
    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed all pages before crawling.
    for (url, html) in [
        (root_url, root_html.as_str()),
        (sub1_url, sub1_html),
        (sub2_url, sub2_html),
    ] {
        put_hybrid_cache(
            &create_cache_key_raw(url, None, None),
            HttpResponse {
                body: html.as_bytes().to_vec(),
                headers: response_headers.clone(),
                status: 200,
                url: Url::parse(url).expect("valid url"),
                version: HttpVersion::Http11,
            },
            "GET",
            request_headers.clone(),
        )
        .await;
    }

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 10)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    assert!(
        received.contains(&root_url.to_string()),
        "root page should be served from mem cache"
    );
    assert_eq!(received.len(), 3, "exactly 3 pages expected from mem cache");
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
}