Skip to main content

spider/
website.rs

1use crate::black_list::contains;
2use crate::client::redirect::Policy;
3use crate::compact_str::CompactString;
4use crate::configuration::{
5    self, get_ua, AutomationScriptsMap, Configuration, ExecutionScriptsMap, RedirectPolicy,
6    SerializableHeaderMap,
7};
8use crate::{page::build, utils::PageResponse};
9
10#[cfg(feature = "smart")]
11use crate::features::chrome::OnceBrowser;
12use crate::features::chrome_common::RequestInterceptConfiguration;
13#[cfg(feature = "disk")]
14use crate::features::disk::DatabaseHandler;
15use crate::packages::robotparser::parser::RobotFileParser;
16use crate::page::{
17    AntiBotTech, Page, PageLinkBuildSettings, CHROME_UNKNOWN_STATUS_ERROR, UNKNOWN_STATUS_ERROR,
18};
19use crate::utils::abs::{convert_abs_url, parse_absolute_url};
20use crate::utils::interner::ListBucket;
21use crate::utils::{
22    crawl_duration_expired, emit_log, emit_log_shutdown, get_path_from_url, get_semaphore,
23    networking_capable, prepare_url, setup_website_selectors, spawn_set, AllowedDomainTypes,
24};
25use crate::{CaseInsensitiveString, Client, ClientBuilder, RelativeSelectors};
26#[cfg(feature = "cron")]
27use async_job::{async_trait, Job, Runner};
28use hashbrown::{HashMap, HashSet};
29use reqwest::header::REFERER;
30use reqwest::StatusCode;
31use std::fmt;
32use std::net::IpAddr;
33use std::sync::atomic::{AtomicBool, AtomicI8, AtomicUsize, Ordering};
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::{
37    sync::{broadcast, Semaphore},
38    task::JoinSet,
39    time::Interval,
40};
41use tokio_stream::StreamExt;
42use url::Url;
43
44#[cfg(feature = "cache_request")]
45use http_cache_reqwest::{Cache, CacheMode, HttpCache, HttpCacheOptions};
46
47#[cfg(feature = "cache_request")]
48pub use http_global_cache::CACACHE_MANAGER;
49
/// The max backoff duration in seconds (60s cap).
/// NOTE(review): consumers are outside this chunk — presumably caps retry backoff; confirm at call sites.
const BACKOFF_MAX_DURATION: tokio::time::Duration = tokio::time::Duration::from_secs(60);
52
53/// calculate the base limits
54pub fn calc_limits(multiplier: usize) -> usize {
55    let logical = num_cpus::get();
56    let physical = num_cpus::get_physical();
57
58    let sem_limit = if logical > physical {
59        (logical) / (physical)
60    } else {
61        logical
62    };
63
64    let (sem_limit, sem_max) = if logical == physical {
65        (sem_limit * physical, 30 * multiplier)
66    } else {
67        (sem_limit * 2, 20 * multiplier)
68    };
69
70    sem_limit.max(sem_max)
71}
72
/// Literal phrases that identify vendor JavaScript-challenge interstitial pages.
/// Matched via the `AC_JS_CHALLENGE` Aho-Corasick automaton.
static JS_SAFE_CHALLENGE_PATTERNS: &[&str] = &[
    r#"Enable JavaScript and cookies to continue"#, // Cloudflare
    r#"To continue, please enable JavaScript in your browser settings"#, // Akamai, F5
    r#"Please enable JavaScript to view the page content"#, // AWS WAF
];
79
80/// check if the page is a javascript challenge
81pub fn is_safe_javascript_challenge(page: &Page) -> bool {
82    let page = page.get_html_bytes_u8();
83
84    let page_size = page.len();
85
86    if page_size == 0 || page_size > 10_000 {
87        return false;
88    }
89
90    AC_JS_CHALLENGE.find(page).is_some()
91}
92
93#[cfg(all(
94    any(
95        target_os = "android",
96        target_os = "fuchsia",
97        target_os = "illumos",
98        target_os = "ios",
99        target_os = "linux",
100        target_os = "macos",
101        target_os = "solaris",
102        target_os = "tvos",
103        target_os = "visionos",
104        target_os = "watchos",
105    ),
106    not(feature = "wreq")
107))]
108/// Bind connections only on the specified network interface.
109pub fn set_interface(client: ClientBuilder, network_interface: &str) -> ClientBuilder {
110    client.interface(&network_interface)
111}
112
#[cfg(not(any(
    feature = "wreq",
    target_os = "android",
    target_os = "fuchsia",
    target_os = "illumos",
    target_os = "ios",
    target_os = "linux",
    target_os = "macos",
    target_os = "solaris",
    target_os = "tvos",
    target_os = "visionos",
    target_os = "watchos",
)))]
/// Bind connections only on the specified network interface.
///
/// No-op fallback: on unsupported targets (or with the `wreq` feature) the
/// interface argument is ignored and the builder is returned unchanged.
pub fn set_interface(client: ClientBuilder, _interface: &str) -> ClientBuilder {
    client
}
130
lazy_static! {
    /// Aho-Corasick automaton over `JS_SAFE_CHALLENGE_PATTERNS`, used to detect JS challenge pages.
    static ref AC_JS_CHALLENGE: aho_corasick::AhoCorasick =  aho_corasick::AhoCorasick::new(JS_SAFE_CHALLENGE_PATTERNS).expect("safe challenges");
    /// The default Semaphore limits.
    pub static ref DEFAULT_PERMITS: usize = calc_limits(1);
    /// The shared global Semaphore.
    /// Permit count is `DEFAULT_PERMITS` scaled by the `SEMAPHORE_MULTIPLIER` env var when set.
    pub(crate) static ref SEM_SHARED: Arc<Semaphore> = {
        let base_limit = match std::env::var("SEMAPHORE_MULTIPLIER") {
            Ok(multiplier) => match multiplier.parse::<isize>() {
                // NOTE(review): `wrapping_mul` can wrap on extreme multipliers before the
                // `.max(1)` clamp — `saturating_mul` may better match intent; confirm.
                Ok(parsed_value) => (*DEFAULT_PERMITS as isize)
                    .wrapping_mul(parsed_value)
                    .max(1) as usize,
                Err(_) => *DEFAULT_PERMITS,
            },
            _ => *DEFAULT_PERMITS,
        };
        Arc::new(Semaphore::const_new(base_limit))
    };
    /// The max links to store in memory.
    /// Defaults to 15_000; overridable via the `LINKS_VISITED_MEMORY_LIMIT` env var.
    pub(crate) static ref LINKS_VISITED_MEMORY_LIMIT: usize = {
        const DEFAULT_LIMIT: usize = 15_000;

        match std::env::var("LINKS_VISITED_MEMORY_LIMIT") {
            Ok(limit) => limit.parse::<usize>().unwrap_or(DEFAULT_LIMIT),
            _ => DEFAULT_LIMIT
        }
    };
    /// Wildcard path key used for whole-site budget lookups.
    static ref WILD_CARD_PATH: CaseInsensitiveString = CaseInsensitiveString::from("*");
}
159
#[cfg(not(feature = "decentralized"))]
lazy_static! {
    /// The global Semaphore.
    /// Permits default to `calc_limits(1)`, scaled by the `SEMAPHORE_MULTIPLIER` env var when set.
    static ref SEM: Semaphore = {
        let base_limit = calc_limits(1);

        let base_limit = match std::env::var("SEMAPHORE_MULTIPLIER") {
            Ok(multiplier) => match multiplier.parse::<isize>() {
                // Clamp to at least one permit; negative multipliers collapse to 1.
                Ok(parsed_value) => (base_limit as isize * parsed_value).max(1) as usize,
                Err(_) => base_limit,
            },
            _ => base_limit,
        };

        Semaphore::const_new(base_limit)
    };
}
177
#[cfg(feature = "decentralized")]
lazy_static! {
    /// The global worker count.
    /// Collected from the comma-separated `SPIDER_WORKER_SCRAPER` and `SPIDER_WORKER`
    /// env vars, defaulting to localhost endpoints when unset.
    static ref WORKERS: HashSet<String> = {
        let mut set: HashSet<_> = HashSet::new();

        for worker in std::env::var("SPIDER_WORKER_SCRAPER")
            .unwrap_or_else(|_| "http://127.0.0.1:3031".to_string())
            .split(",")
        {
            set.insert(worker.to_string());
        }

        for worker in std::env::var("SPIDER_WORKER")
            .unwrap_or_else(|_| "http://127.0.0.1:3030".to_string())
            .split(",")
        {
            set.insert(worker.to_string());
        }

        set
    };
    /// The global Semaphore, scaled by the number of configured workers.
    static ref SEM: Semaphore = {
        let sem_limit = calc_limits(3);
        Semaphore::const_new(sem_limit * WORKERS.len())
    };
}
205
206// const INVALID_URL: &str = "The domain should be a valid URL, refer to <https://www.w3.org/TR/2011/WD-html5-20110525/urls.html#valid-url>.";
207
/// The active status of the crawl. This is a coarse crawl-level state,
/// not an HTTP status code.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CrawlStatus {
    /// The crawl did not start yet.
    #[default]
    Start,
    /// The crawl is idle and has completed.
    Idle,
    /// The crawl is active.
    Active,
    /// The crawl blocked from network ratelimit, firewall, etc.
    Blocked,
    /// Crawl blocked from spider firewall.
    FirewallBlocked,
    /// The crawl failed from a server error.
    ServerError,
    /// The crawl failed from a connection error with proxy or dns.
    ConnectError,
    /// The crawl was rate limited.
    RateLimited,
    /// The initial request ran without returning html.
    Empty,
    /// The URL of the website is invalid. Crawl cannot commence.
    Invalid,
    #[cfg(feature = "control")]
    /// The crawl shutdown manually.
    Shutdown,
    #[cfg(feature = "control")]
    /// The crawl paused manually.
    Paused,
}
240
/// The link activity for the crawl: whether a discovered link may be processed.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProcessLinkStatus {
    /// The link can process.
    #[default]
    Allowed,
    /// The link is blocked (already visited, blacklisted, over depth, or disallowed by robots).
    Blocked,
    /// The budget is exceeded for the crawl.
    BudgetExceeded,
}
253
/// The type of cron job to run on a schedule.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CronType {
    #[default]
    /// Crawl collecting links, page data, and etc.
    Crawl,
    /// Scrape collecting links, page data as bytes to store, and etc.
    Scrape,
}
264
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
/// Generic website meta info for handling retries (hints about why the
/// initial request failed and how a retry should differ).
pub enum WebsiteMetaInfo {
    /// The page requires Javascript.
    RequiresJavascript,
    /// Standard apache 403 page that requires a special http header for access like a custom iframe server.
    Apache403,
    /// Standard Open Resty 403 page that requires a special http header for access like a custom iframe server.
    OpenResty403,
    /// No meta info.
    #[default]
    None,
}
279
/// On link find callback: rewrite a url if it meets a condition.
/// Receives the found link and an optional companion string, returning the
/// (possibly rewritten) pair.
/// NOTE(review): the semantics of the `Option<String>` are not visible in this chunk — confirm at call sites.
pub type OnLinkFindCallback = Arc<
    dyn Fn(CaseInsensitiveString, Option<String>) -> (CaseInsensitiveString, Option<String>)
        + Send
        + Sync,
>;
286
/// Callback closure that determines if a link should be crawled or not.
pub trait OnShouldCrawlClosure: Fn(&Page) -> bool + Send + Sync + 'static {}
// Blanket impl: any matching closure or fn automatically implements the trait.
impl<F: Fn(&Page) -> bool + Send + Sync + 'static> OnShouldCrawlClosure for F {}
290
/// Callback closure or function pointer that determines if a link should be crawled or not.
/// The `Fn` variant avoids an allocation for plain function pointers.
#[derive(Clone)]
pub enum OnShouldCrawlCallback {
    /// Static function pointer.
    Fn(fn(&Page) -> bool),

    /// Closure.
    Closure(Arc<dyn OnShouldCrawlClosure>),
}
300impl OnShouldCrawlCallback {
301    fn call(&self, page: &Page) -> bool {
302        match self {
303            Self::Fn(func) => func(page),
304            Self::Closure(closure) => closure(page),
305        }
306    }
307}
308
/// Round-robin client rotator for proxy rotation.
/// Each client is built with a single proxy, and `next()` cycles through them.
#[derive(Clone)]
pub struct ClientRotator {
    // The pool of pre-built clients, one per proxy.
    clients: Vec<Client>,
    // Monotonic cursor; shared via Arc so clones rotate over a single sequence.
    index: Arc<AtomicUsize>,
}
316
317impl ClientRotator {
318    /// Create a new rotator from a list of clients.
319    pub fn new(clients: Vec<Client>) -> Self {
320        Self {
321            clients,
322            index: Arc::new(AtomicUsize::new(0)),
323        }
324    }
325
326    /// Get the next client in round-robin order.
327    pub fn next(&self) -> &Client {
328        let idx = self.index.fetch_add(1, Ordering::Relaxed) % self.clients.len();
329        &self.clients[idx]
330    }
331
332    /// Number of clients in the rotator.
333    pub fn len(&self) -> usize {
334        self.clients.len()
335    }
336
337    /// Whether the rotator is empty.
338    pub fn is_empty(&self) -> bool {
339        self.clients.is_empty()
340    }
341}
342
/// Represents a website to crawl and gather all links or page content.
/// ```rust
/// use spider::website::Website;
/// let mut website = Website::new("http://example.com");
/// website.crawl();
/// // `Website` will be filled with links or pages when crawled. If you need pages with the resource
/// // call the `website.scrape` method with `website.get_pages` instead.
/// for link in website.get_links() {
///     // do something
/// }
/// ```
#[derive(Clone, Default)]
pub struct Website {
    /// Configuration properties for website.
    pub configuration: Box<Configuration>,
    /// The callback when a link is found.
    pub on_link_find_callback: Option<OnLinkFindCallback>,
    /// The callback to use if a page should be ignored. Return false to ensure that the discovered links are not crawled.
    pub on_should_crawl_callback: Option<OnShouldCrawlCallback>,
    /// Set the crawl ID to track. This allows explicit targeting for shutdown, pause, and etc.
    pub crawl_id: Box<String>,
    #[cfg(feature = "extra_information")]
    /// Extra information to store.
    pub extra_info: Option<Box<String>>,
    /// Seed the initial html for crawling.
    seed_html: Option<String>,
    /// All URLs visited (may spill to disk under memory pressure when the `disk` feature is on).
    links_visited: Box<ListBucket>,
    /// All page content signatures seen, used for duplicate detection.
    signatures: Box<HashSet<u64>>,
    /// Extra links to crawl.
    extra_links: Box<HashSet<CaseInsensitiveString>>,
    /// Pages visited.
    pages: Option<Vec<Page>>,
    /// Robot.txt parser.
    robot_file_parser: Option<Box<RobotFileParser>>,
    /// Base url of the crawl.
    url: Box<CaseInsensitiveString>,
    /// The domain url parsed.
    domain_parsed: Option<Box<Url>>,
    /// Subscribe and broadcast changes.
    channel: Option<(broadcast::Sender<Page>, Arc<broadcast::Receiver<Page>>)>,
    /// Guard counter for channel handling. This prevents things like the browser from closing after the crawl so that subscriptions can finalize events.
    channel_guard: Option<ChannelGuard>,
    /// Send links to process during the crawl.
    channel_queue: Option<(broadcast::Sender<String>, Arc<broadcast::Receiver<String>>)>,
    /// The status of the active crawl this is mapped to a general status and not the HTTP status code.
    status: CrawlStatus,
    /// The initial status code of the first request.
    initial_status_code: StatusCode,
    /// The initial anti-bot tech found.
    initial_anti_bot_tech: AntiBotTech,
    /// The initial bytes size of the first request.
    initial_html_length: usize,
    /// The initial page had a waf detection.
    initial_page_waf_check: bool,
    /// The initial page should retry.
    initial_page_should_retry: bool,
    /// The website was manually stopped.
    shutdown: bool,
    /// The request client. Stored for re-use between runs.
    client: Option<Client>,
    /// Round-robin client rotator for proxy rotation. Built when 2+ proxies are configured.
    client_rotator: Option<Arc<ClientRotator>>,
    /// The disk handler to use.
    #[cfg(feature = "disk")]
    sqlite: Option<Box<DatabaseHandler>>,
    /// Configure sqlite on start
    #[cfg(feature = "disk")]
    enable_sqlite: bool,
    /// Was the setup already configured for sync sendable thread use?
    send_configured: bool,
    /// The website requires javascript to load. This will be sent as a hint when http request.
    website_meta_info: WebsiteMetaInfo,
    /// Skip the initial link?
    skip_initial: bool,
    #[cfg(feature = "cookies")]
    /// Cookie jar between request.
    pub cookie_jar: Arc<reqwest::cookie::Jar>,
}
423
424impl fmt::Debug for Website {
425    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
426        let domain_str = self.domain_parsed.as_ref().map(|u| u.as_str().to_owned());
427        let pages_len = self.pages.as_ref().map(|p| p.len()).unwrap_or(0);
428
429        let mut ds = f.debug_struct("Website");
430
431        ds.field("url", &self.url.as_ref())
432            .field("crawl_id", &self.crawl_id)
433            .field("domain_parsed", &domain_str)
434            // callbacks – just show presence, avoids Fn: Debug bound
435            .field(
436                "on_link_find_callback",
437                &self.on_link_find_callback.is_some(),
438            )
439            .field(
440                "on_should_crawl_callback",
441                &self.on_should_crawl_callback.is_some(),
442            )
443            // state + counters
444            .field("status", &self.status)
445            .field("shutdown", &self.shutdown)
446            .field("extra_links_len", &self.extra_links.len())
447            .field("signatures_len", &self.signatures.len())
448            .field("pages_len", &pages_len)
449            // channels / sqlite / client: just booleans
450            .field("channel_present", &self.channel.is_some())
451            .field("channel_queue_present", &self.channel_queue.is_some())
452            .field("client_present", &self.client.is_some())
453            // initial page info
454            .field("initial_status_code", &self.initial_status_code)
455            .field("initial_html_length", &self.initial_html_length)
456            .field("initial_anti_bot_tech", &self.initial_anti_bot_tech)
457            .field("initial_page_waf_check", &self.initial_page_waf_check)
458            .field("initial_page_should_retry", &self.initial_page_should_retry)
459            // misc flags/meta
460            .field("send_configured", &self.send_configured)
461            .field("website_meta_info", &self.website_meta_info)
462            .field("skip_initial", &self.skip_initial);
463
464        #[cfg(feature = "disk")]
465        {
466            ds.field("sqlite_present", &self.sqlite.is_some())
467                .field("enable_sqlite", &self.enable_sqlite);
468        }
469
470        ds.finish()
471    }
472}
473
474impl Website {
475    /// Initialize the Website with a starting link to crawl and check the firewall base.
476    fn _new(url: &str, check_firewall: bool) -> Self {
477        let url = url.trim();
478        let url: Box<CaseInsensitiveString> = if networking_capable(url) {
479            CaseInsensitiveString::new(&url).into()
480        } else {
481            CaseInsensitiveString::new(&prepare_url(url)).into()
482        };
483
484        let domain_parsed: Option<Box<Url>> = parse_absolute_url(&url);
485        let mut status = CrawlStatus::Start;
486
487        if let Some(u) = &domain_parsed {
488            if check_firewall && crate::utils::abs::block_website(&u) {
489                status = CrawlStatus::FirewallBlocked;
490            }
491        }
492
493        Self {
494            configuration: Configuration::new().into(),
495            status,
496            domain_parsed,
497            url,
498            #[cfg(feature = "disk")]
499            enable_sqlite: true,
500            ..Default::default()
501        }
502    }
503
504    /// Initialize the Website with a starting link to crawl.
505    pub fn new(url: &str) -> Self {
506        Website::_new(url, true)
507    }
508
509    /// Initialize the Website with a starting link to crawl and check the firewall.
510    pub fn new_with_firewall(url: &str, check_firewall: bool) -> Self {
511        Website::_new(url, check_firewall)
512    }
513
    /// Setup a shared database.
    ///
    /// Builds a [`DatabaseHandler`] keyed by this crawl's target id
    /// (crawl_id + url).
    #[cfg(feature = "disk")]
    pub fn setup_database_handler(&self) -> Box<DatabaseHandler> {
        Box::new(DatabaseHandler::new(&Some(self.target_id())))
    }
519
    #[cfg(feature = "disk")]
    /// Setup the sqlite usage with an externally-built (shared) database handler.
    pub fn setup_shared_db(&mut self, db: Box<DatabaseHandler>) {
        self.sqlite = Some(db)
    }
525
    #[cfg(feature = "disk")]
    /// Setup the sqlite usage, building a handler only when none exists yet.
    pub fn setup_sqlite(&mut self) {
        if self.sqlite.is_none() {
            self.sqlite = Some(self.setup_database_handler())
        }
    }
533
534    /// Set the url of the website to re-use configuration and data.
535    pub fn set_url(&mut self, url: &str) -> &mut Self {
536        let url = if url.starts_with(' ') || url.ends_with(' ') {
537            url.trim()
538        } else {
539            url
540        };
541
542        let domain: Box<CaseInsensitiveString> = if networking_capable(url) {
543            CaseInsensitiveString::new(&url).into()
544        } else {
545            CaseInsensitiveString::new(&prepare_url(&url)).into()
546        };
547
548        self.domain_parsed = parse_absolute_url(&domain);
549        self.url = domain;
550        self
551    }
552
553    /// Set the direct url of the website to re-use configuration and data without parsing the domain.
554    pub fn set_url_only(&mut self, url: &str) -> &mut Self {
555        self.url = CaseInsensitiveString::new(&url).into();
556        self
557    }
558
    /// Get the target id for a crawl. This takes the crawl ID and the url and concats it without delimiters.
    /// Used to key per-crawl resources such as the disk database.
    pub fn target_id(&self) -> String {
        string_concat!(self.crawl_id, self.url.inner())
    }
563
564    /// Single page request.
565    pub fn single_page(&self) -> bool {
566        match &self.configuration.inner_budget {
567            Some(b) => match b.get(&*WILD_CARD_PATH) {
568                Some(b) => b.eq(&1),
569                _ => false,
570            },
571            _ => false,
572        }
573    }
574
    /// Setup SQLite for the crawl. Only available with the `disk` flag enabled.
    #[cfg(feature = "disk")]
    pub fn setup_disk(&mut self) {
        if self.enable_sqlite && self.sqlite.is_none() {
            self.setup_sqlite();
        }
        // run full on sqlite when the configuration is shared between crawls.
        if self.configuration.shared {
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.seeded = true;
                // sqlite.persist = true;
            }
        }
    }
589
590    #[cfg(feature = "disk")]
591    /// Set the sqlite disk persistance.
592    pub fn set_disk_persistance(&mut self, persist: bool) -> &mut Self {
593        if self.enable_sqlite {
594            if !self.sqlite.is_none() {
595                if let Some(sqlite) = self.sqlite.as_mut() {
596                    sqlite.persist = persist;
597                }
598            }
599        }
600        self
601    }
602
    /// Setup SQLite. No-op stub when the `disk` feature is disabled.
    #[cfg(not(feature = "disk"))]
    pub fn setup_disk(&mut self) {}
606
    /// Get the robots.txt parser.
    /// `None` until the parser is built elsewhere in the crawl setup (outside this chunk).
    pub fn get_robots_parser(&self) -> &Option<Box<RobotFileParser>> {
        &self.robot_file_parser
    }
611
612    /// Does the website require javascript to run?
613    pub fn get_requires_javascript(&self) -> bool {
614        self.website_meta_info == WebsiteMetaInfo::RequiresJavascript
615    }
616
    /// Get the website meta information that can help with retry handling.
    pub fn get_website_meta_info(&self) -> &WebsiteMetaInfo {
        &self.website_meta_info
    }
621
622    /// Check if URL exists (ignore case). This does nothing with `disk` flag enabled.
623    #[cfg(feature = "disk")]
624    pub async fn is_allowed_disk(&self, url_to_check: &str) -> bool {
625        match &self.sqlite {
626            Some(sqlite) => {
627                if !sqlite.ready() {
628                    true
629                } else {
630                    let db_pool = sqlite.get_db_pool().await;
631                    let allowed = sqlite.url_exists(db_pool, url_to_check).await;
632
633                    !allowed
634                }
635            }
636            _ => true,
637        }
638    }
639
    /// Check if URL exists on disk. Always allows (`true`) when the `disk`
    /// feature is disabled.
    #[cfg(not(feature = "disk"))]
    pub async fn is_allowed_disk(&self, _url_to_check: &str) -> bool {
        true
    }
645
646    /// Check if signature exists (ignore case). This does nothing with `disk` flag enabled.
647    #[cfg(feature = "disk")]
648    pub async fn is_allowed_signature_disk(&self, signature_to_check: u64) -> bool {
649        match &self.sqlite {
650            Some(sqlite) => {
651                if !sqlite.ready() {
652                    true
653                } else {
654                    let db_pool = sqlite.get_db_pool().await;
655
656                    !sqlite.signature_exists(db_pool, signature_to_check).await
657                }
658            }
659            _ => true,
660        }
661    }
662
    /// Check if signature exists on disk. Always allows (`true`) when the
    /// `disk` feature is disabled.
    #[cfg(not(feature = "disk"))]
    pub async fn is_allowed_signature_disk(&self, _signature_to_check: u64) -> bool {
        true
    }
668
669    /// Is the signature allowed.
670    pub async fn is_signature_allowed(&self, signature: u64) -> bool {
671        !self.signatures.contains(&signature) || self.is_allowed_signature_disk(signature).await
672    }
673
    /// Clear the disk store. Only effective with the `disk` flag enabled and
    /// an initialized pool; errors from the table clear are ignored.
    #[cfg(feature = "disk")]
    pub async fn clear_disk(&self) {
        if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                let _ = DatabaseHandler::clear_table(sqlite.get_db_pool().await).await;
            }
        }
    }
683
    /// Clear the disk. No-op stub when the `disk` feature is disabled.
    #[cfg(not(feature = "disk"))]
    pub async fn clear_disk(&self) {}
687
    /// Check if shared disk usage is enabled. Always `false` when the `disk`
    /// feature is disabled.
    #[cfg(not(feature = "disk"))]
    pub(crate) fn shared_disk_enabled(&self) -> bool {
        false
    }
693
    /// Check if shared disk usage is enabled: requires both a shared
    /// configuration and a configured sqlite handler.
    #[cfg(feature = "disk")]
    pub(crate) fn shared_disk_enabled(&self) -> bool {
        self.configuration.shared && self.sqlite.is_some()
    }
699
    /// Insert a new URL to disk if it doesn't exist. Only available with the
    /// `disk` flag enabled; no-op when sqlite is not configured.
    #[cfg(feature = "disk")]
    pub async fn insert_url_disk(&self, new_url: &str) {
        if let Some(sqlite) = &self.sqlite {
            sqlite.insert_url(sqlite.get_db_pool().await, new_url).await
        }
    }
707
    /// Insert a new signature to disk if it doesn't exist. Only available with
    /// the `disk` flag enabled; no-op when sqlite is not configured.
    #[cfg(feature = "disk")]
    pub async fn insert_signature_disk(&self, signature: u64) {
        if let Some(sqlite) = &self.sqlite {
            sqlite
                .insert_signature(sqlite.get_db_pool().await, signature)
                .await
        }
    }
717
    /// Insert a new URL if it doesn't exist, spilling to disk under memory
    /// pressure or when the in-memory limit is reached.
    #[cfg(feature = "disk")]
    pub async fn insert_link(&mut self, new_url: CaseInsensitiveString) {
        // mem_load: global memory pressure level; values 1 and 2 trigger
        // disk usage below — presumably 1 = elevated, 2 = critical; confirm
        // in `detect_system`.
        let mem_load = crate::utils::detect_system::get_global_memory_state().await;
        let beyond_memory_limits = self.links_visited.len() >= *LINKS_VISITED_MEMORY_LIMIT;
        let seed_check = mem_load == 2 || mem_load == 1 || beyond_memory_limits;

        if seed_check {
            // Seed the DB from the in-memory set the first time pressure hits.
            let mut seeded = false;
            if let Some(sqlite) = &self.sqlite {
                if !sqlite.ready() {
                    let _ = self.seed().await;
                    seeded = true;
                }
            }
            // NOTE(review): this also calls set_seeded(false) when no seed ran —
            // confirm that is intended and does not clear a prior seeded state.
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.set_seeded(seeded);
            }
        }

        if mem_load == 2 || beyond_memory_limits || self.shared_disk_enabled() {
            // Critical pressure, over the limit, or shared mode: go straight to disk.
            self.insert_url_disk(&new_url).await
        } else if mem_load == 1 {
            // Elevated pressure: keep only a small in-memory working set.
            if self.links_visited.len() <= 100 {
                self.links_visited.insert(new_url);
            } else {
                self.insert_url_disk(&new_url).await
            }
        } else {
            self.links_visited.insert(new_url);
        }
    }
750
    /// Insert a new URL if it doesn't exist. In-memory only when the `disk`
    /// feature is disabled.
    #[cfg(not(feature = "disk"))]
    pub async fn insert_link(&mut self, link: CaseInsensitiveString) {
        self.links_visited.insert(link);
    }
756
    /// Insert a new signature if it doesn't exist, spilling to disk under
    /// memory pressure or when the in-memory limit is reached (mirrors
    /// `insert_link`).
    #[cfg(feature = "disk")]
    pub async fn insert_signature(&mut self, new_signature: u64) {
        // mem_load: global memory pressure level; 1/2 trigger disk usage —
        // presumably 1 = elevated, 2 = critical; confirm in `detect_system`.
        let mem_load = crate::utils::detect_system::get_global_memory_state().await;
        let beyond_memory_limits = self.signatures.len() >= *LINKS_VISITED_MEMORY_LIMIT;
        let seed_check = mem_load == 2 || mem_load == 1 || beyond_memory_limits;

        if seed_check {
            // Seed the DB from memory the first time pressure hits.
            let mut seeded = false;
            if let Some(sqlite) = &self.sqlite {
                if !sqlite.ready() {
                    let _ = self.seed().await;
                    seeded = true;
                }
            }
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.set_seeded(seeded);
            }
        }

        if mem_load == 2 || beyond_memory_limits || self.shared_disk_enabled() {
            self.insert_signature_disk(new_signature).await
        } else if mem_load == 1 {
            // Elevated pressure: keep only a small in-memory working set.
            if self.signatures.len() <= 100 {
                self.signatures.insert(new_signature);
            } else {
                self.insert_signature_disk(new_signature).await
            }
        } else {
            self.signatures.insert(new_signature);
        }
    }
789
    /// Insert a new signature if it doesn't exist. In-memory only when the
    /// `disk` feature is disabled.
    #[cfg(not(feature = "disk"))]
    pub async fn insert_signature(&mut self, new_signature: u64) {
        self.signatures.insert(new_signature);
    }
795
    /// Seed the DB from the in-memory link set and clear the Hashset,
    /// keeping only the subset returned by the handler in memory. Only
    /// available with the `disk` flag enabled.
    #[cfg(feature = "disk")]
    pub async fn seed(&mut self) -> Result<(), sqlx::Error> {
        let links = self.get_links();

        if let Some(sqlite) = &self.sqlite {
            // `DatabaseHandler::seed` persists the links and returns the ones
            // to retain in memory.
            if let Ok(links) = sqlite.seed(sqlite.get_db_pool().await, links).await {
                self.links_visited.clear();

                for link in links {
                    self.links_visited.insert(link);
                }

                if let Some(sqlite) = self.sqlite.as_mut() {
                    sqlite.seeded = true;
                }
            }
        }

        Ok(())
    }
817
    /// Return `false` if the crawl should shutdown. Process in between each link.
    ///
    /// Behavior:
    /// * `self.shutdown` set → run the `shutdown` future, return `false`.
    /// * Control `handle` value `1` (paused) → block on `interval.tick()`
    ///   until the value changes.
    /// * Control `handle` value `2` (shutdown) → run the `shutdown` future,
    ///   return `false`. Any other value (or no handle) continues (`true`).
    async fn handle_process<T>(
        &self,
        handle: &Option<Arc<AtomicI8>>,
        interval: &mut Interval,
        shutdown: T,
    ) -> bool
    where
        T: std::future::Future<Output = ()>,
    {
        if self.shutdown {
            (shutdown).await;
            false
        } else {
            match handle.as_ref() {
                Some(handle) => {
                    // Paused: wait one tick at a time until the state changes.
                    while handle.load(Ordering::Relaxed) == 1 {
                        interval.tick().await;
                    }
                    if handle.load(Ordering::Relaxed) == 2 {
                        (shutdown).await;
                        false
                    } else {
                        true
                    }
                }
                _ => true,
            }
        }
    }
848
849    /// return `true` if URL:
850    ///
851    /// - is not already crawled
852    /// - is not over depth
853    /// - is not over crawl budget
854    /// - is optionally whitelisted
855    /// - is not blacklisted
856    /// - is not forbidden in robot.txt file (if parameter is defined)
857    #[inline]
858    #[cfg(not(feature = "regex"))]
859    pub fn is_allowed(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
860        let status = self.is_allowed_budgetless(link);
861
862        if status.eq(&ProcessLinkStatus::Allowed) {
863            if self.is_over_budget(link) {
864                return ProcessLinkStatus::BudgetExceeded;
865            }
866        }
867
868        status
869    }
870
871    /// return `true` if URL:
872    ///
873    /// - is not already crawled
874    /// - is not over depth
875    /// - is not over crawl budget
876    /// - is optionally whitelisted
877    /// - is not blacklisted
878    /// - is not forbidden in robot.txt file (if parameter is defined)
879    #[inline]
880    #[cfg(feature = "regex")]
881    pub fn is_allowed(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
882        let status = self.is_allowed_budgetless(link);
883
884        if status.eq(&ProcessLinkStatus::Allowed) {
885            if self.is_over_budget(link) {
886                return ProcessLinkStatus::BudgetExceeded;
887            }
888        }
889        status
890    }
891
892    /// return `true` if URL:
893    ///
894    /// - is not already crawled
895    /// - is not over depth
896    /// - is optionally whitelisted
897    /// - is not blacklisted
898    /// - is not forbidden in robot.txt file (if parameter is defined)
899    #[inline]
900    #[cfg(not(feature = "regex"))]
901    pub fn is_allowed_budgetless(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
902        if self.links_visited.contains(link) {
903            ProcessLinkStatus::Blocked
904        } else {
905            let status = self.is_allowed_default(link.inner());
906
907            if status.eq(&ProcessLinkStatus::Allowed) {
908                if self.is_over_depth(link) {
909                    return ProcessLinkStatus::Blocked;
910                }
911            }
912
913            status
914        }
915    }
916
917    /// return `true` if URL:
918    ///
919    /// - is not already crawled
920    /// - is not over depth
921    /// - is optionally whitelisted
922    /// - is not blacklisted
923    /// - is not forbidden in robot.txt file (if parameter is defined)
924    #[inline]
925    #[cfg(feature = "regex")]
926    pub fn is_allowed_budgetless(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
927        if self.links_visited.contains(link) {
928            ProcessLinkStatus::Blocked
929        } else {
930            let status = self.is_allowed_default(link);
931            if status.eq(&ProcessLinkStatus::Allowed) {
932                if self.is_over_depth(link) {
933                    return ProcessLinkStatus::Blocked;
934                }
935            }
936            status
937        }
938    }
939
940    /// return `true` if URL:
941    ///
942    /// - is optionally whitelisted
943    /// - is not blacklisted
944    /// - is not forbidden in robot.txt file (if parameter is defined)
945    #[inline]
946    #[cfg(feature = "regex")]
947    pub fn is_allowed_default(&self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
948        let blacklist = self.configuration.get_blacklist_compiled();
949        let whitelist = self.configuration.get_whitelist_compiled();
950
951        let blocked_whitelist = !whitelist.is_empty() && !contains(&whitelist, link.inner());
952        let blocked_blacklist = !blacklist.is_empty() && contains(&blacklist, link.inner());
953
954        if blocked_whitelist || blocked_blacklist || !self.is_allowed_robots(&link.as_ref()) {
955            ProcessLinkStatus::Blocked
956        } else {
957            ProcessLinkStatus::Allowed
958        }
959    }
960
961    /// return `true` if URL:
962    ///
963    /// - is optionally whitelisted
964    /// - is not blacklisted
965    /// - is not forbidden in robot.txt file (if parameter is defined)
966    #[inline]
967    #[cfg(not(feature = "regex"))]
968    pub fn is_allowed_default(&self, link: &CompactString) -> ProcessLinkStatus {
969        let whitelist = self.configuration.get_whitelist_compiled();
970        let blacklist = self.configuration.get_blacklist_compiled();
971
972        let blocked_whitelist = !whitelist.is_empty() && !contains(whitelist, link);
973        let blocked_blacklist = !blacklist.is_empty() && contains(blacklist, link);
974
975        if blocked_whitelist || blocked_blacklist || !self.is_allowed_robots(link) {
976            ProcessLinkStatus::Blocked
977        } else {
978            ProcessLinkStatus::Allowed
979        }
980    }
981
982    /// return `true` if URL:
983    ///
984    /// - is not forbidden in robot.txt file (if parameter is defined)
985    pub fn is_allowed_robots(&self, link: &str) -> bool {
986        if self.configuration.respect_robots_txt {
987            if let Some(r) = &self.robot_file_parser {
988                return r.can_fetch(
989                    match &self.configuration.user_agent {
990                        Some(ua) => ua,
991                        _ => "*",
992                    },
993                    link,
994                );
995            }
996        }
997
998        true
999    }
1000
1001    /// Detect if the inner budget is exceeded
1002    pub(crate) fn is_over_inner_depth_budget(&mut self, link: &CaseInsensitiveString) -> bool {
1003        let mut over = false;
1004
1005        if let Some(segments) = get_path_from_url(link)
1006            .strip_prefix('/')
1007            .map(|remainder| remainder.split('/'))
1008        {
1009            let mut depth: usize = 0;
1010
1011            for _ in segments {
1012                depth = depth.saturating_add(1);
1013                if depth > self.configuration.depth_distance {
1014                    over = true;
1015                    break;
1016                }
1017            }
1018        }
1019
1020        over
1021    }
1022
1023    /// is over the wild card budget.
1024    #[cfg(feature = "sitemap")]
1025    pub(crate) fn is_over_wild_budget(
1026        &self,
1027        budget: &Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
1028    ) -> bool {
1029        let exceeded_wild_budget = if self.configuration.wild_card_budgeting {
1030            match budget {
1031                Some(budget) => match budget.get(&*WILD_CARD_PATH) {
1032                    Some(budget) => {
1033                        if budget.abs_diff(0) == 1 {
1034                            true
1035                        } else {
1036                            false
1037                        }
1038                    }
1039                    _ => false,
1040                },
1041                _ => false,
1042            }
1043        } else {
1044            false
1045        };
1046        exceeded_wild_budget
1047    }
1048
1049    /// Detect if the inner budget is exceeded
1050    pub(crate) fn is_over_inner_budget(&mut self, link: &CaseInsensitiveString) -> bool {
1051        match self.configuration.inner_budget.as_mut() {
1052            Some(budget) => {
1053                let exceeded_wild_budget = if self.configuration.wild_card_budgeting {
1054                    match budget.get_mut(&*WILD_CARD_PATH) {
1055                        Some(budget) => {
1056                            if budget.abs_diff(0) == 1 {
1057                                true
1058                            } else {
1059                                *budget -= 1;
1060                                false
1061                            }
1062                        }
1063                        _ => false,
1064                    }
1065                } else {
1066                    false
1067                };
1068
1069                // set this up prior to crawl to avoid checks per link.
1070                // If only the wild card budget is set we can safely skip all checks.
1071                let skip_paths = self.configuration.wild_card_budgeting && budget.len() == 1;
1072                let has_depth_control = self.configuration.depth_distance > 0;
1073
1074                // check if paths pass
1075                if !skip_paths && !exceeded_wild_budget {
1076                    let path_segments = get_path_from_url(link)
1077                        .strip_prefix('/')
1078                        .map(|remainder| remainder.split('/'));
1079
1080                    match path_segments {
1081                        Some(segments) => {
1082                            let mut joint_segment = CaseInsensitiveString::default();
1083                            let mut over = false;
1084                            let mut depth: usize = 0;
1085
1086                            for seg in segments {
1087                                if has_depth_control {
1088                                    depth = depth.saturating_add(1);
1089                                    if depth > self.configuration.depth_distance {
1090                                        over = true;
1091                                        break;
1092                                    }
1093                                }
1094
1095                                joint_segment.push_str(seg);
1096
1097                                if budget.contains_key(&joint_segment) {
1098                                    if let Some(budget) = budget.get_mut(&joint_segment) {
1099                                        if budget.abs_diff(0) == 0 || *budget == 0 {
1100                                            over = true;
1101                                            break;
1102                                        } else {
1103                                            *budget -= 1;
1104                                            continue;
1105                                        }
1106                                    }
1107                                }
1108                            }
1109
1110                            over
1111                        }
1112                        _ => false,
1113                    }
1114                } else {
1115                    exceeded_wild_budget
1116                }
1117            }
1118            _ => false,
1119        }
1120    }
1121
1122    /// Validate if url exceeds crawl depth and should be ignored.
1123    pub(crate) fn is_over_depth(&mut self, link: &CaseInsensitiveString) -> bool {
1124        self.configuration.depth_distance > 0 && self.is_over_inner_depth_budget(link)
1125    }
1126
    /// Validate if url exceeds crawl budget and should not be handled.
    /// Delegates to `is_over_inner_budget`, which decrements matching budgets
    /// as a side effect.
    pub(crate) fn is_over_budget(&mut self, link: &CaseInsensitiveString) -> bool {
        self.is_over_inner_budget(link)
    }
1131
    /// Amount of pages crawled in memory only (disk-persisted links excluded).
    /// Use `get_size` for the full count between memory and disk.
    pub fn size(&self) -> usize {
        self.links_visited.len()
    }
1136
    /// Get the amount of resources collected. Without the `disk` feature this
    /// is simply the in-memory count.
    #[cfg(not(feature = "disk"))]
    pub async fn get_size(&self) -> usize {
        self.links_visited.len()
    }
1142
1143    /// Get the amount of resources collected.
1144    #[cfg(feature = "disk")]
1145    pub async fn get_size(&self) -> usize {
1146        let disk_count = if let Some(sqlite) = &self.sqlite {
1147            if sqlite.pool_inited() {
1148                let disk_count = DatabaseHandler::count_records(sqlite.get_db_pool().await).await;
1149                let disk_count = disk_count.unwrap_or_default() as usize;
1150                disk_count
1151            } else {
1152                0
1153            }
1154        } else {
1155            0
1156        };
1157
1158        let mut mem_count = self.links_visited.len();
1159
1160        if mem_count >= *LINKS_VISITED_MEMORY_LIMIT {
1161            mem_count -= *LINKS_VISITED_MEMORY_LIMIT;
1162        }
1163
1164        disk_count + mem_count
1165    }
1166
    /// Drain the extra links used for things like the sitemap, emptying the set.
    pub fn drain_extra_links(&mut self) -> hashbrown::hash_set::Drain<'_, CaseInsensitiveString> {
        self.extra_links.drain()
    }
1171
    /// Set the status code of the initial request.
    pub fn set_initial_status_code(&mut self, initial_status_code: StatusCode) {
        self.initial_status_code = initial_status_code;
    }
1176
    /// Get the status code of the initial request.
    pub fn get_initial_status_code(&self) -> &StatusCode {
        &self.initial_status_code
    }
1181
    /// Set the HTML size (in bytes) of the initial request.
    pub fn set_initial_html_length(&mut self, initial_html_length: usize) {
        self.initial_html_length = initial_html_length;
    }
1186
    /// Get the HTML size (in bytes) of the initial request.
    pub fn get_initial_html_length(&self) -> usize {
        self.initial_html_length
    }
1191
    /// Set the anti-bot tech detected for the initial request.
    pub fn set_initial_anti_bot_tech(&mut self, initial_anti_bot_tech: AntiBotTech) {
        self.initial_anti_bot_tech = initial_anti_bot_tech;
    }
1196
    /// Get the anti-bot tech detected for the initial request.
    pub fn get_initial_anti_bot_tech(&self) -> &AntiBotTech {
        &self.initial_anti_bot_tech
    }
1201
    /// Set whether a WAF was detected on the initial request.
    pub fn set_initial_page_waf_check(&mut self, initial_page_waf_check: bool) {
        self.initial_page_waf_check = initial_page_waf_check;
    }
1206
    /// Get whether a WAF was detected on the initial request.
    pub fn get_initial_page_waf_check(&self) -> bool {
        self.initial_page_waf_check
    }
1211
    /// Set the should-retry determination made for the initial request.
    pub fn set_initial_page_should_retry(&mut self, initial_page_should_retry: bool) {
        self.initial_page_should_retry = initial_page_should_retry;
    }
1216
    /// Get the should-retry determination made for the initial request.
    pub fn get_initial_page_should_retry(&self) -> bool {
        self.initial_page_should_retry
    }
1221
    /// Drain the links visited, yielding the interned symbols (string-interner backends).
    #[cfg(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    ))]
    pub fn drain_links(
        &mut self,
    ) -> hashbrown::hash_set::Drain<'_, string_interner::symbol::SymbolUsize> {
        self.links_visited.drain()
    }
1233
    #[cfg(not(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    )))]
    /// Drain the links visited, yielding the raw strings (no interner backend).
    pub fn drain_links(&mut self) -> hashbrown::hash_set::Drain<'_, CaseInsensitiveString> {
        self.links_visited.drain()
    }
1243
    /// Drain the page signatures visited (interner backends; identical to the
    /// non-interner variant — signatures are plain `u64` hashes either way).
    #[cfg(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    ))]
    pub fn drain_signatures(&mut self) -> hashbrown::hash_set::Drain<'_, u64> {
        self.signatures.drain()
    }
1253
    #[cfg(not(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    )))]
    /// Drain the page signatures visited (non-interner build).
    pub fn drain_signatures(&mut self) -> hashbrown::hash_set::Drain<'_, u64> {
        self.signatures.drain()
    }
1263
    /// Add extra links to crawl (extends the current set rather than replacing it).
    /// This could be used in conjunction with 'website.persist_links' to extend the crawl on the next run.
    pub fn set_extra_links(
        &mut self,
        extra_links: HashSet<CaseInsensitiveString>,
    ) -> &HashSet<CaseInsensitiveString> {
        self.extra_links.extend(extra_links);
        &self.extra_links
    }
1272
    /// Get the extra links queued for crawling.
    pub fn get_extra_links(&self) -> &HashSet<CaseInsensitiveString> {
        &self.extra_links
    }
1277
    /// Clear all pages and links stored in memory, and the on-disk store.
    pub async fn clear_all(&mut self) {
        self.clear();
        self.clear_disk().await;
    }
1283
    /// Clear all pages and links stored in memory: visited links, signatures,
    /// collected pages, and extra links. Disk state is untouched.
    pub fn clear(&mut self) {
        self.links_visited.clear();
        self.signatures.clear();
        self.pages.take();
        self.extra_links.clear();
    }
1291
    /// Get the HTTP request client. The client is `None` until a crawl has started.
    pub fn get_client(&self) -> &Option<Client> {
        &self.client
    }
1296
    /// Collected pages getter, when page collection is active.
    pub fn get_pages(&self) -> Option<&Vec<Page>> {
        self.pages.as_ref()
    }
1301
    /// Links visited getter for disk. Always returns an empty set in this build
    /// because the `disk` feature is not enabled.
    #[cfg(not(feature = "disk"))]
    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
        Default::default()
    }
1307
1308    /// Links visited getter for disk. This does nothing with `disk` flag enabled.
1309    #[cfg(feature = "disk")]
1310    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
1311        if let Some(sqlite) = &self.sqlite {
1312            if sqlite.pool_inited() {
1313                if let Ok(links) =
1314                    DatabaseHandler::get_all_resources(sqlite.get_db_pool().await).await
1315                {
1316                    links
1317                } else {
1318                    Default::default()
1319                }
1320            } else {
1321                Default::default()
1322            }
1323        } else {
1324            Default::default()
1325        }
1326    }
1327
1328    /// Links all the links visited between memory and disk.
1329    #[cfg(feature = "disk")]
1330    pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
1331        let mut l = self.get_links_disk().await;
1332        let m = self.links_visited.get_links();
1333
1334        l.extend(m);
1335
1336        l
1337    }
1338
    /// All links visited. Without the `disk` feature this is just the in-memory set.
    #[cfg(not(feature = "disk"))]
    pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
        self.get_links()
    }
1344
    /// Links visited getter for in-memory resources only.
    pub fn get_links(&self) -> HashSet<CaseInsensitiveString> {
        self.links_visited.get_links()
    }
1349
    /// Parsed domain url getter, when the configured url parsed successfully.
    pub fn get_url_parsed(&self) -> &Option<Box<Url>> {
        &self.domain_parsed
    }
1354
    /// Domain name getter (the url the website was configured with).
    pub fn get_url(&self) -> &CaseInsensitiveString {
        &self.url
    }
1359
    /// Crawl delay getter, converting the configured milliseconds to a `Duration`.
    pub fn get_delay(&self) -> Duration {
        Duration::from_millis(self.configuration.delay)
    }
1364
    /// Get the active crawl status.
    pub fn get_status(&self) -> &CrawlStatus {
        &self.status
    }
1369
    /// Set the active crawl status, returning the stored value.
    /// This is helpful when chaining crawls concurrently.
    pub fn set_status(&mut self, status: CrawlStatus) -> &CrawlStatus {
        self.status = status;
        &self.status
    }
1375
    /// Reset the active crawl status to `Start`, e.g. to bypass a blocked state.
    pub fn reset_status(&mut self) -> &CrawlStatus {
        self.status = CrawlStatus::Start;
        &self.status
    }
1381
    /// Set the crawl status to `Active` so visited links persist between runs.
    /// Example crawling a sitemap and all links after - website.crawl_sitemap().await.persist_links().crawl().await
    pub fn persist_links(&mut self) -> &mut Self {
        self.status = CrawlStatus::Active;
        self
    }
1388
1389    /// Absolute base url of crawl.
1390    pub fn get_absolute_path(&self, domain: Option<&str>) -> Option<Url> {
1391        if domain.is_some() {
1392            url::Url::parse(domain.unwrap_or_default())
1393                .ok()
1394                .map(|mut url| {
1395                    if let Ok(mut path) = url.path_segments_mut() {
1396                        path.clear();
1397                    }
1398                    url
1399                })
1400        } else if let Some(mut d) = self.domain_parsed.as_deref().cloned() {
1401            if let Ok(mut path) = d.path_segments_mut() {
1402                path.clear();
1403            }
1404            Some(d)
1405        } else {
1406            None
1407        }
1408    }
1409
    /// Stop all crawls for the website by raising the shutdown flag.
    pub fn stop(&mut self) {
        self.shutdown = true;
    }
1414
    /// Allow crawls to commence again by clearing the shutdown flag.
    pub fn start(&mut self) {
        self.shutdown = false;
    }
1419
    /// Configure the robots parser on the initial crawl attempt and run.
    /// Fetches and parses robots.txt and applies any crawl-delay directive,
    /// capped at 60 seconds. No-op unless `respect_robots_txt` is set.
    pub async fn configure_robots_parser(&mut self, client: &Client) {
        if self.configuration.respect_robots_txt {
            let robot_file_parser = self
                .robot_file_parser
                .get_or_insert_with(RobotFileParser::new);

            // NOTE(review): 4000 appears to act as a freshness sentinel on the
            // parser's mtime so the file is only fetched once — confirm against
            // RobotFileParser::mtime semantics.
            if robot_file_parser.mtime() <= 4000 {
                // Prefer the parsed domain; fall back to the raw configured url.
                let host_str = match &self.domain_parsed {
                    Some(domain) => domain.as_str(),
                    _ => self.url.inner(),
                };

                if !host_str.is_empty() {
                    // The parser is always given a trailing-slash base url.
                    if host_str.ends_with('/') {
                        robot_file_parser.read(&client, host_str).await;
                    } else {
                        robot_file_parser
                            .read(&client, &string_concat!(host_str, "/"))
                            .await;
                    }
                }
                // Honor crawl-delay for our user agent, capped at 60s.
                if let Some(delay) =
                    robot_file_parser.get_crawl_delay(&self.configuration.user_agent)
                {
                    self.configuration.delay = delay.as_millis().min(60000) as u64;
                }
            }
        }
    }
1450
    /// Set up a strict redirect policy for requests. Redirects must stay on the
    /// same host (or same registered domain / subdomain when `tld` / `subdomains`
    /// are enabled), with a small allowance for the first off-host redirect(s).
    pub fn setup_strict_policy(&self) -> Policy {
        use crate::client::redirect::Attempt;
        use crate::page::domain_name;
        use std::sync::atomic::AtomicU8;

        let default_policy = Policy::default();

        match self.domain_parsed.as_deref().cloned() {
            Some(host_s) => {
                // Allow one extra initial hop when robots.txt is fetched first.
                let initial_redirect_limit = if self.configuration.respect_robots_txt {
                    2
                } else {
                    1
                };
                let subdomains = self.configuration.subdomains;
                let tld = self.configuration.tld;
                // Only compute the registered domain when tld matching is on.
                let host_domain_name = if tld {
                    domain_name(&host_s).to_string()
                } else {
                    Default::default()
                };
                let redirect_limit = *self.configuration.redirect_limit;

                let custom_policy = {
                    // Counts initial off-host redirects taken so far; shared
                    // with the closure across redirect attempts.
                    let initial_redirect = Arc::new(AtomicU8::new(0));

                    move |attempt: Attempt| {
                        // Same registered domain, matching subdomain suffix, or
                        // exact host: defer to the default policy.
                        if tld && domain_name(attempt.url()) == host_domain_name
                            || subdomains
                                && attempt
                                    .url()
                                    .host_str()
                                    .unwrap_or_default()
                                    .ends_with(host_s.host_str().unwrap_or_default())
                            || attempt.url().host() == host_s.host()
                        {
                            default_policy.redirect(attempt)
                        } else if attempt.previous().len() > redirect_limit {
                            attempt.error("too many redirects")
                        } else if attempt.status().is_redirection()
                            && (0..initial_redirect_limit)
                                .contains(&initial_redirect.load(Ordering::Relaxed))
                        {
                            // Off-host, but still within the initial allowance.
                            initial_redirect.fetch_add(1, Ordering::Relaxed);
                            default_policy.redirect(attempt)
                        } else {
                            attempt.stop()
                        }
                    }
                };
                Policy::custom(custom_policy)
            }
            _ => default_policy,
        }
    }
1507
1508    /// Setup redirect policy for reqwest.
1509    pub fn setup_redirect_policy(&self) -> Policy {
1510        match self.configuration.redirect_policy {
1511            RedirectPolicy::Loose => Policy::limited(*self.configuration.redirect_limit),
1512            RedirectPolicy::None => Policy::none(),
1513            RedirectPolicy::Strict => self.setup_strict_policy(),
1514        }
1515    }
1516
1517    /// Configure the headers to use.
1518    pub fn configure_headers(&mut self) {
1519        let mut headers: reqwest::header::HeaderMap = reqwest::header::HeaderMap::new();
1520
1521        let user_agent = match &self.configuration.user_agent {
1522            Some(ua) => ua.as_str(),
1523            _ => get_ua(self.configuration.only_chrome_agent()),
1524        };
1525
1526        if self.configuration.modify_headers {
1527            crate::utils::header_utils::extend_headers(
1528                &mut headers,
1529                user_agent,
1530                &self.configuration.headers,
1531                &None,
1532                &self.configuration.viewport,
1533                &self.domain_parsed,
1534            );
1535
1536            if !headers.is_empty() {
1537                // always remove the referer header.
1538                if let Some(referer) = headers.remove(REFERER) {
1539                    if let Ok(v) = referer.to_str() {
1540                        // modify the default referer
1541                        if self.configuration.referer.is_none() && !v.is_empty() {
1542                            self.configuration.referer = Some(v.into())
1543                        }
1544                    }
1545                }
1546                self.configuration
1547                    .headers
1548                    .replace(Box::new(SerializableHeaderMap::from(headers)));
1549            }
1550        }
1551    }
1552
    #[cfg(all(not(feature = "wreq"), not(feature = "decentralized")))]
    /// Base client configuration.
    ///
    /// Builds the reqwest `ClientBuilder` with the redirect policy, lenient
    /// HTTP/1 parsing, connect/read timeouts (doubled when proxies are set),
    /// and optional network interface, local address, proxy keepalive,
    /// user-agent, and HTTP/2 prior-knowledge settings.
    pub fn configure_base_client(&self) -> ClientBuilder {
        let policy = self.setup_redirect_policy();

        // Explicit user agent wins; otherwise derive a default.
        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        // let missing_host =
        //     !headers.contains_key(crate::client::header::HOST) && !headers.contains_key("Host");
        // Only set a user-agent on the builder when no custom header provides
        // one (checked under both common casings).
        let missing_agent = match &self.configuration.headers {
            Some(headers) => {
                !headers.contains_key(crate::client::header::USER_AGENT)
                    && !headers.contains_key("User-Agent")
            }
            _ => true,
        };

        // Double the default timeouts when routing through proxies.
        let timeout_mult = if self.configuration.proxies.is_some() {
            2
        } else {
            1
        };

        let client = reqwest::Client::builder()
            .redirect(policy)
            .http09_responses()
            .http1_ignore_invalid_headers_in_responses(true)
            // Auto-referer only when no explicit referer is configured.
            .referer(self.configuration.referer.is_none())
            .connect_timeout(
                self.configuration
                    .default_http_connect_timeout
                    .unwrap_or(Duration::from_secs(24 * timeout_mult)),
            )
            .read_timeout(
                self.configuration
                    .default_http_read_timeout
                    .unwrap_or(Duration::from_secs(42 * timeout_mult)),
            )
            .http1_title_case_headers()
            .http1_allow_obsolete_multiline_headers_in_responses(true)
            .http1_allow_spaces_after_header_name_in_responses(true)
            // .http1_preserve_header_order()
            // .http1_preserve_header_case()
            .danger_accept_invalid_certs(self.configuration.accept_invalid_certs);

        let client = if let Some(network_interface) = &self.configuration.network_interface {
            set_interface(client, &network_interface)
        } else {
            client
        };

        let client = if let Some(local_address) = &self.configuration.local_address {
            client.local_address(*local_address)
        } else {
            client
        };

        // Keep proxied connections warm.
        let client = if self.configuration.proxies.is_none() {
            client
        } else {
            client.tcp_keepalive(Duration::from_secs(30))
        };

        // check both casing for user-agent
        let client = if missing_agent {
            client.user_agent(user_agent)
        } else {
            client
        };

        let client = if self.configuration.http2_prior_knowledge {
            client.http2_prior_knowledge()
        } else {
            client
        };

        crate::utils::header_utils::setup_default_headers(client, &self.configuration)
    }
1634
    #[cfg(all(feature = "wreq", not(feature = "decentralized")))]
    /// Base client configuration.
    ///
    /// wreq-backed variant: builds the client with the redirect policy,
    /// connect/read timeouts (doubled when proxies are set), and optional
    /// local address, proxy keepalive, user-agent, and browser emulation.
    pub fn configure_base_client(&self) -> ClientBuilder {
        let policy = self.setup_redirect_policy();

        // Explicit user agent wins; otherwise derive a default.
        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        // Only set a user-agent on the builder when no custom header provides
        // one (checked under both common casings).
        let missing_agent = match &self.configuration.headers {
            Some(headers) => {
                !headers.contains_key(crate::client::header::USER_AGENT)
                    && !headers.contains_key("User-Agent")
            }
            _ => true,
        };

        // Double the default timeouts when routing through proxies.
        let timeout_mult = if self.configuration.proxies.is_some() {
            2
        } else {
            1
        };

        let client = Client::builder()
            .redirect(policy)
            // Auto-referer only when no explicit referer is configured.
            .referer(self.configuration.referer.is_none())
            .connect_timeout(
                self.configuration
                    .default_http_connect_timeout
                    .unwrap_or(Duration::from_secs(24 * timeout_mult)),
            )
            .read_timeout(
                self.configuration
                    .default_http_read_timeout
                    .unwrap_or(Duration::from_secs(42 * timeout_mult)),
            );

        let client = if let Some(local_address) = &self.configuration.local_address {
            client.local_address(*local_address)
        } else {
            client
        };

        // Keep proxied connections warm.
        let client = if self.configuration.proxies.is_none() {
            client
        } else {
            client.tcp_keepalive(Duration::from_secs(30))
        };

        let client = if missing_agent {
            client.user_agent(user_agent)
        } else {
            client
        };

        let client = if let Some(emulation) = self.configuration.emulation {
            client.emulation(emulation)
        } else {
            client
        };

        crate::utils::header_utils::setup_default_headers(client, &self.configuration)
    }
1699
    /// Build the HTTP client.
    ///
    /// Extends the base client with the request timeout, the proxy list
    /// (with platform-specific socks handling), optional Spider Cloud
    /// proxy injection, background connect threading, a concurrency-limit
    /// layer, and finally cookie support.
    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
    pub fn configure_http_client_builder(&self) -> ClientBuilder {
        let client = self.configure_base_client();

        let mut client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let client = match &self.configuration.proxies {
            Some(proxies) => {
                let linux = cfg!(target_os = "linux");
                // With multiple proxies on linux, plain socks entries are
                // skipped; with exactly one, it is rewritten to http instead.
                let ignore_plain_socks = proxies.len() >= 2 && linux;
                let replace_plain_socks = proxies.len() == 1 && linux;

                for proxie in proxies.iter() {
                    if proxie.ignore == crate::configuration::ProxyIgnore::Http {
                        continue;
                    }

                    let proxie = &proxie.addr;
                    let socks = proxie.starts_with("socks://");

                    // we can skip it and use another proxy from the list.
                    if ignore_plain_socks && socks {
                        continue;
                    }

                    // use HTTP instead as reqwest does not support the protocol on linux.
                    if replace_plain_socks && socks {
                        if let Ok(proxy) =
                            crate::client::Proxy::all(&proxie.replacen("socks://", "http://", 1))
                        {
                            client = client.proxy(proxy);
                        }
                    } else {
                        if let Ok(proxy) = crate::client::Proxy::all(proxie) {
                            client = client.proxy(proxy);
                        }
                    }
                }

                client
            }
            _ => client,
        };

        // Spider Cloud proxy injection (modes that use proxy transport)
        #[cfg(feature = "spider_cloud")]
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                // Both the proxy URL and the bearer header must parse; otherwise skip.
                match (crate::client::Proxy::all(&sc.proxy_url), reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key))) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        // Offload connection establishment to the background runtime when enabled.
        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        // Cap concurrent connections via a tower layer when configured.
        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        self.configure_http_client_cookies(client)
    }
1778
    /// Build the HTTP client with caching enabled.
    ///
    /// Mirrors the non-cache builder (timeouts, proxies, Spider Cloud,
    /// background connect, concurrency limit, cookies) and then wraps the
    /// built client in a `reqwest_middleware` builder, adding an HTTP cache
    /// layer when caching is turned on.
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    pub fn configure_http_client_builder(&self) -> reqwest_middleware::ClientBuilder {
        use crate::utils::create_cache_key;
        let client = self.configure_base_client();

        let mut client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let client = match &self.configuration.proxies {
            Some(proxies) => {
                let linux = cfg!(target_os = "linux");
                // With multiple proxies on linux, plain socks entries are
                // skipped; with exactly one, it is rewritten to http instead.
                let ignore_plain_socks = proxies.len() >= 2 && linux;
                let replace_plain_socks = proxies.len() == 1 && linux;

                for proxie in proxies.iter() {
                    if proxie.ignore == crate::configuration::ProxyIgnore::Http {
                        continue;
                    }
                    let proxie = &proxie.addr;

                    let socks = proxie.starts_with("socks://");

                    // we can skip it and use another proxy from the list.
                    if ignore_plain_socks && socks {
                        continue;
                    }

                    // use HTTP instead as reqwest does not support the protocol on linux.
                    if replace_plain_socks && socks {
                        if let Ok(proxy) =
                            crate::client::Proxy::all(&proxie.replacen("socks://", "http://", 1))
                        {
                            client = client.proxy(proxy);
                        }
                    } else {
                        if let Ok(proxy) = crate::client::Proxy::all(proxie) {
                            client = client.proxy(proxy);
                        }
                    }
                }

                client
            }
            _ => client,
        };

        // Spider Cloud proxy injection (modes that use proxy transport)
        #[cfg(feature = "spider_cloud")]
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                // Both the proxy URL and the bearer header must parse; otherwise skip.
                match (crate::client::Proxy::all(&sc.proxy_url), reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key))) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        // NOTE(review): cookies are applied before the connector layers here
        // but after them in the non-cache variant — confirm intentional.
        let client = self.configure_http_client_cookies(client);

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        // NOTE(review): unwrap_unchecked is UB if `build` ever fails — assumed
        // infallible with the active TLS backend; confirm this invariant.
        let client =
            reqwest_middleware::ClientBuilder::new(unsafe { client.build().unwrap_unchecked() });

        if self.configuration.cache {
            let mut cache_options = HttpCacheOptions::default();

            // Key cache entries on the request, its method, and any non-empty
            // authorization header value so authorized responses stay separate.
            cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
                let mut auth_token = None;
                if let Some(auth) = req.headers.get("authorization") {
                    if let Ok(token) = auth.to_str() {
                        if !token.is_empty() {
                            auth_token = Some(token);
                        }
                    }
                }
                create_cache_key(req, Some(req.method.as_str()), auth_token)
            }));
            client.with(Cache(HttpCache {
                mode: CacheMode::Default,
                manager: CACACHE_MANAGER.clone(),
                options: cache_options,
            }))
        } else {
            client
        }
    }
1884
1885    /// Build the HTTP client with cookie configurations.
1886    #[cfg(all(not(feature = "decentralized"), feature = "cookies"))]
1887    pub fn configure_http_client_cookies(
1888        &self,
1889        client: crate::client::ClientBuilder,
1890    ) -> crate::client::ClientBuilder {
1891        let client = client.cookie_provider(self.cookie_jar.clone());
1892
1893        if !self.configuration.cookie_str.is_empty() {
1894            if let Some(url) = self.domain_parsed.as_ref() {
1895                self.cookie_jar
1896                    .add_cookie_str(&self.configuration.cookie_str, url);
1897            }
1898        }
1899
1900        client
1901    }
1902
    /// Build the client with cookie configurations. This does nothing without the [cookies] flag enabled.
    ///
    /// No-op passthrough so callers can invoke the same method unconditionally
    /// when the `cookies` feature is compiled out.
    #[cfg(all(not(feature = "decentralized"), not(feature = "cookies")))]
    pub fn configure_http_client_cookies(
        &self,
        client: crate::client::ClientBuilder,
    ) -> crate::client::ClientBuilder {
        client
    }
1911
1912    /// Set the HTTP client to use directly. This is helpful if you manually call 'website.configure_http_client' before the crawl.
1913    pub fn set_http_client(&mut self, client: Client) -> &Option<Client> {
1914        self.client = Some(client);
1915        &self.client
1916    }
1917
    /// Build a client configured with a single proxy for use in rotation.
    ///
    /// Returns `None` when the proxy opts out of HTTP or its address fails
    /// to parse as a proxy URL.
    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
    fn build_single_proxy_client(
        &self,
        proxy: &crate::configuration::RequestProxy,
    ) -> Option<Client> {
        if proxy.ignore == crate::configuration::ProxyIgnore::Http {
            return None;
        }

        let client = self.configure_base_client();

        let client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let addr = &proxy.addr;
        let linux = cfg!(target_os = "linux");
        let socks = addr.starts_with("socks://");

        // Rewrite socks to http on linux, mirroring the pooled builder's
        // handling of reqwest's socks limitation there.
        let client = if socks && linux {
            match crate::client::Proxy::all(&addr.replacen("socks://", "http://", 1)) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        } else {
            match crate::client::Proxy::all(addr) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        };

        // Spider Cloud proxy injection when the mode routes over a proxy.
        #[cfg(feature = "spider_cloud")]
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => {
                        client.proxy(proxy.custom_http_auth(auth_value))
                    }
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => client
                .connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit)),
            _ => client,
        };

        let client = self.configure_http_client_cookies(client);
        // NOTE(review): unwrap_unchecked is UB if `build` ever fails — assumed
        // infallible with the active TLS backend; confirm this invariant.
        unsafe { Some(client.build().unwrap_unchecked()) }
    }
1985
    /// Build a client configured with a single proxy for use in rotation (cache_request variant).
    ///
    /// Same as the non-cache variant but finalizes through the
    /// `reqwest_middleware` builder, adding the HTTP cache layer when
    /// caching is enabled.
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    fn build_single_proxy_client(
        &self,
        proxy: &crate::configuration::RequestProxy,
    ) -> Option<Client> {
        use crate::utils::create_cache_key;

        if proxy.ignore == crate::configuration::ProxyIgnore::Http {
            return None;
        }

        let client = self.configure_base_client();

        let client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let addr = &proxy.addr;
        let linux = cfg!(target_os = "linux");
        let socks = addr.starts_with("socks://");

        // Rewrite socks to http on linux, mirroring the pooled builder's
        // handling of reqwest's socks limitation there.
        let client = if socks && linux {
            match crate::client::Proxy::all(&addr.replacen("socks://", "http://", 1)) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        } else {
            match crate::client::Proxy::all(addr) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        };

        // Spider Cloud proxy injection when the mode routes over a proxy.
        #[cfg(feature = "spider_cloud")]
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => {
                        client.proxy(proxy.custom_http_auth(auth_value))
                    }
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = self.configure_http_client_cookies(client);

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => client
                .connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit)),
            _ => client,
        };

        // NOTE(review): unwrap_unchecked is UB if `build` ever fails — assumed
        // infallible with the active TLS backend; confirm this invariant.
        let client =
            reqwest_middleware::ClientBuilder::new(unsafe { client.build().unwrap_unchecked() });

        if self.configuration.cache {
            let mut cache_options = HttpCacheOptions::default();

            // Key cache entries on the request, its method, and any non-empty
            // authorization header value so authorized responses stay separate.
            cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
                let mut auth_token = None;
                if let Some(auth) = req.headers.get("authorization") {
                    if let Ok(token) = auth.to_str() {
                        if !token.is_empty() {
                            auth_token = Some(token);
                        }
                    }
                }
                create_cache_key(req, Some(req.method.as_str()), auth_token)
            }));

            Some(
                client
                    .with(Cache(HttpCache {
                        mode: CacheMode::Default,
                        manager: CACACHE_MANAGER.clone(),
                        options: cache_options,
                    }))
                    .build(),
            )
        } else {
            Some(client.build())
        }
    }
2085
2086    /// Build rotated clients from the proxy list. Returns None if fewer than 2 proxies.
2087    #[cfg(not(feature = "decentralized"))]
2088    fn build_rotated_clients(&self) -> Option<Arc<ClientRotator>> {
2089        let proxies = self.configuration.proxies.as_ref()?;
2090        if proxies.len() < 2 {
2091            return None;
2092        }
2093        let clients: Vec<Client> = proxies
2094            .iter()
2095            .filter_map(|proxy| self.build_single_proxy_client(proxy))
2096            .collect();
2097        if clients.len() < 2 {
2098            return None;
2099        }
2100        Some(Arc::new(ClientRotator::new(clients)))
2101    }
2102
2103    /// Configure http client.
2104    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
2105    pub fn configure_http_client(&self) -> Client {
2106        let client = self.configure_http_client_builder();
2107        // should unwrap using native-tls-alpn
2108        unsafe { client.build().unwrap_unchecked() }
2109    }
2110
2111    /// Configure http client.
2112    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
2113    pub fn configure_http_client(&self) -> Client {
2114        let client = self.configure_http_client_builder();
2115        client.build()
2116    }
2117
    /// Configure http client for decentralization.
    ///
    /// Builds a client that routes every request through the registered
    /// `WORKERS` proxies, forwarding the configured headers plus the
    /// referer and host hints the workers expect.
    #[cfg(all(feature = "decentralized", not(feature = "cache_request")))]
    pub fn configure_http_client(&self) -> Client {
        use reqwest::header::{HeaderMap, HeaderValue};

        let mut headers = HeaderMap::new();

        let policy = self.setup_redirect_policy();

        let mut client = Client::builder()
            .user_agent(match &self.configuration.user_agent {
                Some(ua) => ua.as_str(),
                _ => &get_ua(self.configuration.only_chrome_agent()),
            })
            .redirect(policy)
            .tcp_keepalive(Duration::from_millis(500));

        // Encode the domain scope (tld/subdomains) as a numeric referer hint.
        // NOTE(review): the first two branches both yield 2, making the
        // `tld && subdomains` arm redundant — confirm 2 is intended there.
        let referer = if self.configuration.tld && self.configuration.subdomains {
            2
        } else if self.configuration.tld {
            2
        } else if self.configuration.subdomains {
            1
        } else {
            0
        };

        if referer > 0 {
            // use expected http headers for providers that drop invalid headers
            headers.insert(reqwest::header::REFERER, HeaderValue::from(referer));
        }

        if let Some(h) = &self.configuration.headers {
            headers.extend(h.inner().clone());
        }

        // Advertise the crawl root (without its trailing slash) as the HOST header.
        if let Some(domain_url) = self.get_absolute_path(None) {
            let domain_url = domain_url.as_str();
            let domain_host = if domain_url.ends_with("/") {
                &domain_url[0..domain_url.len() - 1]
            } else {
                domain_url
            };
            if let Ok(value) = HeaderValue::from_str(domain_host) {
                headers.insert(reqwest::header::HOST, value);
            }
        }

        // Route through every registered worker proxy; unparsable ones are skipped.
        for worker in WORKERS.iter() {
            if let Ok(worker) = crate::client::Proxy::all(worker) {
                client = client.proxy(worker);
            }
        }

        if !self.configuration.modify_headers && self.configuration.modify_http_client_headers {
            if let Some(ua) = &self.configuration.user_agent {
                crate::utils::header_utils::extend_headers(
                    &mut headers,
                    ua,
                    &self.configuration.headers,
                    &None,
                    &self.configuration.viewport,
                    &self.domain_parsed,
                );
            }
        }

        // should unwrap using native-tls-alpn
        unsafe {
            match &self.configuration.request_timeout {
                Some(t) => client.timeout(**t),
                _ => client,
            }
            .default_headers(headers)
            .build()
            .unwrap_unchecked()
        }
    }
2196
    /// Configure http client for decentralization.
    ///
    /// Same worker-proxied setup as the non-cache variant, finalized through
    /// `reqwest_middleware` with the HTTP cache layer always attached.
    #[cfg(all(feature = "decentralized", feature = "cache_request"))]
    pub fn configure_http_client(&mut self) -> Client {
        use crate::utils::create_cache_key;
        use reqwest::header::{HeaderMap, HeaderValue};
        use reqwest_middleware::ClientBuilder;

        let mut headers = HeaderMap::new();

        let policy = self.setup_redirect_policy();

        let mut client = reqwest::Client::builder()
            .user_agent(match &self.configuration.user_agent {
                Some(ua) => ua.as_str(),
                _ => &get_ua(self.configuration.only_chrome_agent()),
            })
            .redirect(policy)
            .tcp_keepalive(Duration::from_millis(500));

        // Encode the domain scope (tld/subdomains) as a numeric referer hint.
        // NOTE(review): the first two branches both yield 2, making the
        // `tld && subdomains` arm redundant — confirm 2 is intended there.
        let referer = if self.configuration.tld && self.configuration.subdomains {
            2
        } else if self.configuration.tld {
            2
        } else if self.configuration.subdomains {
            1
        } else {
            0
        };

        if referer > 0 {
            // use expected http headers for providers that drop invalid headers
            headers.insert(reqwest::header::REFERER, HeaderValue::from(referer));
        }

        if let Some(h) = &self.configuration.headers {
            headers.extend(h.inner().clone());
        }

        // Advertise the crawl root (without its trailing slash) as the HOST header.
        if let Some(domain_url) = self.get_absolute_path(None) {
            let domain_url = domain_url.as_str();
            let domain_host = if domain_url.ends_with("/") {
                &domain_url[0..domain_url.len() - 1]
            } else {
                domain_url
            };
            if let Ok(value) = HeaderValue::from_str(domain_host) {
                headers.insert(reqwest::header::HOST, value);
            }
        }

        // Route through every registered worker proxy; unparsable ones are skipped.
        for worker in WORKERS.iter() {
            if let Ok(worker) = crate::client::Proxy::all(worker) {
                client = client.proxy(worker);
            }
        }

        let mut cache_options = HttpCacheOptions::default();

        // Key cache entries on the request, its method, and any non-empty
        // authorization header value so authorized responses stay separate.
        cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
            let mut auth_token = None;
            if let Some(auth) = req.headers.get("authorization") {
                if let Ok(token) = auth.to_str() {
                    if !token.is_empty() {
                        auth_token = Some(token);
                    }
                }
            }
            create_cache_key(req, Some(req.method.as_str()), auth_token)
        }));

        if !self.configuration.modify_headers && self.configuration.modify_http_client_headers {
            if let Some(ua) = &self.configuration.user_agent {
                crate::utils::header_utils::extend_headers(
                    &mut headers,
                    ua,
                    &self.configuration.headers,
                    &None,
                    &self.configuration.viewport,
                    &self.domain_parsed,
                );
            }
        }

        // NOTE(review): unwrap_unchecked is UB if `build` ever fails — assumed
        // infallible with the active TLS backend; confirm this invariant.
        let client = ClientBuilder::new(unsafe {
            match &self.configuration.request_timeout {
                Some(t) => client.timeout(**t),
                _ => client,
            }
            .default_headers(headers)
            .build()
            .unwrap_unchecked()
        })
        .with(Cache(HttpCache {
            mode: CacheMode::Default,
            manager: CACACHE_MANAGER.clone(),
            options: cache_options,
        }));

        client.build()
    }
2297
    /// Setup atomic controller. This does nothing without the 'control' feature flag enabled.
    ///
    /// Spawns a task subscribed to the global `CONTROLLER` watch channel and
    /// returns the shared state flag (0 = run, 1 = pause, 2 = shutdown)
    /// together with the task handle, unless control threads are disabled.
    #[cfg(feature = "control")]
    pub fn configure_handler(&self) -> Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)> {
        use crate::utils::{Handler, CONTROLLER};

        if self.configuration.no_control_thread {
            None
        } else {
            let c: Arc<AtomicI8> = Arc::new(AtomicI8::new(0));
            let handle = c.clone();
            let target_id = self.target_id();

            let join_handle = crate::utils::spawn_task("control_handler", async move {
                let mut l = CONTROLLER.read().await.1.to_owned();

                // React to each broadcast; only messages addressed to this
                // website's target id (case-insensitive) change the flag.
                while l.changed().await.is_ok() {
                    let n = &*l.borrow();
                    let (target, rest) = n;

                    if target_id.eq_ignore_ascii_case(&target) {
                        if rest == &Handler::Resume {
                            c.store(0, Ordering::Relaxed);
                        }
                        if rest == &Handler::Pause {
                            c.store(1, Ordering::Relaxed);
                        }
                        if rest == &Handler::Shutdown {
                            c.store(2, Ordering::Relaxed);
                        }
                    }
                }
            });

            Some((handle, join_handle))
        }
    }
2334
    #[cfg(not(feature = "control"))]
    /// Setup atomic controller. This does nothing without the 'control' feature flag enabled.
    ///
    /// Stub returning `None` when the 'control' feature is compiled out.
    pub fn configure_handler(&self) -> Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)> {
        None
    }
2340
    /// Setup interception for chrome request.
    ///
    /// Delegates to the shared interception setup with this website's
    /// intercept settings, auth challenge response, and root url.
    #[cfg(all(feature = "chrome", feature = "chrome_intercept"))]
    pub async fn setup_chrome_interception(
        &self,
        page: &chromiumoxide::Page,
    ) -> Option<tokio::task::JoinHandle<()>> {
        crate::features::chrome::setup_chrome_interception_base(
            page,
            self.configuration.chrome_intercept.enabled,
            &self.configuration.auth_challenge_response,
            self.configuration.chrome_intercept.block_visuals,
            self.url.inner(),
        )
        .await
    }
2356
    /// Setup interception for chrome request
    ///
    /// Stub returning `None` when the 'chrome_intercept' feature is compiled out.
    #[cfg(all(feature = "chrome", not(feature = "chrome_intercept")))]
    pub async fn setup_chrome_interception(
        &self,
        _chrome_page: &chromiumoxide::Page,
    ) -> Option<tokio::task::JoinHandle<()>> {
        None
    }
2365
2366    /// Setup selectors for handling link targets.
2367    pub fn setup_selectors(&self) -> RelativeSelectors {
2368        setup_website_selectors(
2369            self.get_url().inner(),
2370            AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
2371        )
2372    }
2373
2374    /// Base configuration setup.
2375    pub fn setup_base(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
2376        self.determine_limits();
2377        self.setup_disk();
2378        self.configure_headers();
2379
2380        crate::utils::connect::init_background_runtime();
2381
2382        let client = match self.client.take() {
2383            Some(client) => client,
2384            _ => self.configure_http_client(),
2385        };
2386
2387        #[cfg(not(feature = "decentralized"))]
2388        {
2389            self.client_rotator = self.build_rotated_clients();
2390        }
2391
2392        (client, self.configure_handler())
2393    }
2394
2395    /// Setup config for crawl.
2396    pub async fn setup(
2397        &mut self,
2398    ) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
2399        let setup = self.setup_base();
2400        if self.status != CrawlStatus::Active {
2401            self.clear_all().await;
2402        } else {
2403            self.skip_initial = !self.extra_links.is_empty();
2404        }
2405        self.configure_robots_parser(&setup.0).await;
2406        setup
2407    }
2408
2409    /// Setup shared concurrent configs.
2410    pub fn setup_crawl(
2411        &self,
2412    ) -> (
2413        std::pin::Pin<Box<tokio::time::Interval>>,
2414        std::pin::Pin<Box<Duration>>,
2415    ) {
2416        let interval = Box::pin(tokio::time::interval(Duration::from_millis(10)));
2417        let throttle = Box::pin(self.get_delay());
2418
2419        (interval, throttle)
2420    }
2421
2422    /// Get all the expanded links.
2423    #[cfg(feature = "glob")]
2424    pub fn get_expanded_links(&self, domain_name: &str) -> Vec<CaseInsensitiveString> {
2425        let mut expanded = crate::features::glob::expand_url(&domain_name);
2426
2427        if expanded.len() == 0 {
2428            if let Some(u) = self.get_absolute_path(Some(domain_name)) {
2429                expanded.push(u.as_str().into());
2430            }
2431        };
2432
2433        expanded
2434    }
2435
2436    /// Set the initial crawl status by page output.
2437    pub fn set_crawl_initial_status(
2438        &mut self,
2439        page: &crate::page::Page,
2440        links: &HashSet<CaseInsensitiveString>,
2441    ) {
2442        use crate::utils::{detect_open_resty_forbidden, APACHE_FORBIDDEN};
2443
2444        if page.status_code == reqwest::StatusCode::FORBIDDEN && links.is_empty() {
2445            if is_safe_javascript_challenge(&page) {
2446                self.website_meta_info = WebsiteMetaInfo::RequiresJavascript;
2447            } else if page.get_html_bytes_u8() == *APACHE_FORBIDDEN {
2448                self.website_meta_info = WebsiteMetaInfo::Apache403;
2449            } else if detect_open_resty_forbidden(page.get_html_bytes_u8()) {
2450                self.website_meta_info = WebsiteMetaInfo::OpenResty403;
2451            }
2452            self.status = CrawlStatus::Blocked;
2453        } else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
2454            self.status = CrawlStatus::RateLimited;
2455        } else if page.status_code.is_server_error() {
2456            self.status = CrawlStatus::ServerError;
2457        } else if page.is_empty() {
2458            if page.status_code == *UNKNOWN_STATUS_ERROR
2459                || page.status_code == *CHROME_UNKNOWN_STATUS_ERROR
2460            {
2461                self.status = CrawlStatus::ConnectError;
2462            } else {
2463                self.status = CrawlStatus::Empty;
2464            }
2465        }
2466    }
2467
2468    /// Expand links for crawl base establish using a **command-based fetch**.
2469    #[cfg(feature = "cmd")]
2470    pub async fn _crawl_establish_cmd(
2471        &mut self,
2472        cmd: std::path::PathBuf,
2473        cmd_args: Vec<String>,
2474        base: &mut RelativeSelectors,
2475        _ssg_build: bool,
2476    ) -> HashSet<CaseInsensitiveString> {
2477        if self.skip_initial {
2478            return Default::default();
2479        }
2480
2481        if !self
2482            .is_allowed_default(self.get_base_link())
2483            .eq(&ProcessLinkStatus::Allowed)
2484        {
2485            return HashSet::new();
2486        }
2487
2488        let url = self.url.inner();
2489
2490        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
2491        let mut links_ssg = HashSet::new();
2492        let mut links_pages = if self.configuration.return_page_links {
2493            Some(HashSet::new())
2494        } else {
2495            None
2496        };
2497
2498        let mut page_links_settings =
2499            PageLinkBuildSettings::new(true, self.configuration.full_resources);
2500        page_links_settings.subdomains = self.configuration.subdomains;
2501        page_links_settings.tld = self.configuration.tld;
2502        page_links_settings.normalize = self.configuration.normalize;
2503
2504        let mut domain_parsed = self.domain_parsed.take();
2505
2506        let mut retry_count = self.configuration.retry;
2507        let mut last_err: Option<std::io::Error> = None;
2508
2509        let build_error_page = |status: StatusCode, _err: std::io::Error| {
2510            let mut p = Page::default();
2511            p.url = url.to_string();
2512            p.status_code = status;
2513            #[cfg(feature = "page_error_status_details")]
2514            {
2515                p.error_for_status = Some(Err(_err));
2516            }
2517            p
2518        };
2519
2520        let mut page: Page = loop {
2521            let bytes = match Self::run_via_cmd(&cmd, &cmd_args, url).await {
2522                Ok(b) => {
2523                    if b.is_empty() {
2524                        last_err = Some(std::io::Error::new(
2525                            std::io::ErrorKind::UnexpectedEof,
2526                            "cmd returned empty stdout",
2527                        ));
2528                        None
2529                    } else {
2530                        Some(b)
2531                    }
2532                }
2533                Err(e) => {
2534                    last_err = Some(e);
2535                    None
2536                }
2537            };
2538
2539            if let Some(bytes) = bytes.as_deref() {
2540                let mut domain_parsed_out = None;
2541
2542                let page = Page::new_page_streaming_from_bytes(
2543                    url,
2544                    bytes,
2545                    base,
2546                    &self.configuration.external_domains_caseless,
2547                    &page_links_settings,
2548                    &mut links,
2549                    Some(&mut links_ssg),
2550                    &mut domain_parsed,
2551                    &mut domain_parsed_out,
2552                    &mut links_pages,
2553                )
2554                .await;
2555
2556                if self.domain_parsed.is_none() {
2557                    if let Some(mut dp) = domain_parsed.take() {
2558                        convert_abs_url(&mut dp);
2559                        self.domain_parsed.replace(dp);
2560                    } else if let Some(mut dp) = domain_parsed_out.take() {
2561                        convert_abs_url(&mut dp);
2562                        self.domain_parsed.replace(dp);
2563                    }
2564                } else if self.domain_parsed.is_none() {
2565                    self.domain_parsed = domain_parsed_out;
2566                }
2567
2568                if page.should_retry && retry_count > 0 {
2569                    retry_count -= 1;
2570                    if let Some(timeout) = page.get_timeout() {
2571                        tokio::time::sleep(timeout).await;
2572                    } else {
2573                        tokio::time::sleep(std::time::Duration::from_millis(250)).await;
2574                    }
2575                    continue;
2576                }
2577
2578                break page;
2579            }
2580
2581            if retry_count == 0 {
2582                let err = last_err.take().unwrap_or_else(|| {
2583                    std::io::Error::new(
2584                        std::io::ErrorKind::Other,
2585                        "cmd fetch failed (unknown error)",
2586                    )
2587                });
2588                break build_error_page(StatusCode::BAD_GATEWAY, err);
2589            }
2590
2591            retry_count -= 1;
2592            tokio::time::sleep(std::time::Duration::from_millis(250)).await;
2593        };
2594
2595        if page.get_html_bytes_u8().starts_with(b"<?xml") {
2596            page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
2597                .await;
2598        }
2599
2600        emit_log(url);
2601
2602        if let Some(signature) = page.signature {
2603            if !self.is_signature_allowed(signature).await {
2604                return Default::default();
2605            }
2606            self.insert_signature(signature).await;
2607        }
2608
2609        let url_ci = match &self.on_link_find_callback {
2610            Some(cb) => cb(*self.url.clone(), None).0,
2611            _ => *self.url.clone(),
2612        };
2613        self.insert_link(url_ci).await;
2614
2615        if self.configuration.return_page_links {
2616            page.page_links = links_pages
2617                .filter(|pages: &HashSet<CaseInsensitiveString>| !pages.is_empty())
2618                .map(Box::new);
2619        }
2620
2621        links.extend(links_ssg);
2622
2623        self.initial_status_code = page.status_code;
2624        self.initial_html_length = page.get_html_bytes_u8().len();
2625        self.initial_anti_bot_tech = page.anti_bot_tech;
2626        self.initial_page_should_retry = page.should_retry;
2627        self.initial_page_waf_check = page.waf_check;
2628
2629        self.set_crawl_initial_status(&page, &links);
2630
2631        if let Some(ref cb) = self.on_should_crawl_callback {
2632            if !cb.call(&page) {
2633                page.blocked_crawl = true;
2634                channel_send_page(&self.channel, page, &self.channel_guard);
2635                return Default::default();
2636            }
2637        }
2638
2639        channel_send_page(&self.channel, page, &self.channel_guard);
2640
2641        links
2642    }
2643
    /// Expand links for crawl base establish.
    ///
    /// Fetches the base URL with the HTTP `client` (or uses seeded HTML when
    /// provided), extracts the initial link set, retries transient failures up
    /// to the configured limit, records initial-crawl metadata, and broadcasts
    /// the page on the subscription channel. Returns the links found, or an
    /// empty set when the base URL is not allowed or the initial crawl is
    /// skipped.
    #[cfg(not(feature = "glob"))]
    pub async fn _crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }

        if self
            .is_allowed_default(self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            let url = self.url.inner();

            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
            let mut links_ssg = HashSet::new();
            // Only collect per-page links when the caller asked for them.
            let mut links_pages = if self.configuration.return_page_links {
                Some(HashSet::new())
            } else {
                None
            };
            let mut page_links_settings =
                PageLinkBuildSettings::new(true, self.configuration.full_resources);

            page_links_settings.subdomains = self.configuration.subdomains;
            page_links_settings.tld = self.configuration.tld;
            page_links_settings.normalize = self.configuration.normalize;

            // Temporarily take the parsed domain so the streaming call can use
            // it as the original-domain reference; restored further below.
            let mut domain_parsed = self.domain_parsed.take();

            let mut page = if let Some(mut seeded_page) = self.build_seed_page() {
                // Extract links and metadata from seeded HTML content if not binary
                #[cfg(not(feature = "decentralized"))]
                {
                    let html_bytes = seeded_page.get_html_bytes_u8();
                    if !html_bytes.is_empty() && !auto_encoder::is_binary_file(html_bytes) {
                        let html = seeded_page.get_html();
                        let extracted_links: HashSet<CaseInsensitiveString> = seeded_page
                            .links_stream_base_ssg(base, &html, client, &self.domain_parsed)
                            .await;
                        links.extend(extracted_links);
                    }
                }
                seeded_page
            } else {
                // No seed: fetch and stream-parse the base URL directly.
                Page::new_page_streaming(
                    url,
                    client,
                    false,
                    base,
                    &self.configuration.external_domains_caseless,
                    &page_links_settings,
                    &mut links,
                    Some(&mut links_ssg),
                    &mut domain_parsed, // original domain
                    &mut self.domain_parsed,
                    &mut links_pages,
                )
                .await
            };

            // XML output (e.g. a sitemap): also stream links from the XML body.
            if page.get_html_bytes_u8().starts_with(b"<?xml") {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            // Restore the parsed base domain if the streaming call did not set it.
            if self.domain_parsed.is_none() {
                if let Some(mut domain_parsed) = domain_parsed.take() {
                    convert_abs_url(&mut domain_parsed);
                    self.domain_parsed.replace(domain_parsed);
                }
            }

            let mut retry_count = self.configuration.retry;
            let domains_caseless = &self.configuration.external_domains_caseless;

            // Retry transient failures up to the configured limit.
            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                // Honor a page-provided delay (if any) before retrying.
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }

                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Gateway timeout: bound the retry with BACKOFF_MAX_DURATION
                    // so a hanging upstream cannot stall establishment.
                    let mut domain_parsed_clone = self.domain_parsed.clone();

                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        page.clone_from(
                            &Page::new_page_streaming(
                                url,
                                client,
                                false,
                                base,
                                domains_caseless,
                                &page_links_settings,
                                &mut links,
                                Some(&mut links_ssg),
                                &mut domain_parsed,
                                &mut domain_parsed_clone,
                                &mut links_pages,
                            )
                            .await,
                        );
                    })
                    .await
                    {
                        log::info!("backoff gateway timeout exceeded {elasped}");
                    }

                    self.domain_parsed = domain_parsed_clone;
                } else {
                    // Ordinary retry: refetch without the backoff bound.
                    page.clone_from(
                        &Page::new_page_streaming(
                            url,
                            client,
                            false,
                            base,
                            &self.configuration.external_domains_caseless,
                            &page_links_settings,
                            &mut links,
                            Some(&mut links_ssg),
                            &mut domain_parsed,
                            &mut self.domain_parsed,
                            &mut links_pages,
                        )
                        .await,
                    );
                }
            }

            emit_log(url);

            if let Some(signature) = page.signature {
                if !self.is_signature_allowed(signature).await {
                    return Default::default();
                }
                self.insert_signature(signature).await;
            }

            // Mark the base URL visited, letting the link-find callback rewrite it.
            let url = match &self.on_link_find_callback {
                Some(cb) => cb(*self.url.clone(), None).0,
                _ => *self.url.clone(),
            };

            self.insert_link(url).await;

            if self.configuration.return_page_links {
                page.page_links = links_pages
                    .filter(|pages: &HashSet<CaseInsensitiveString>| !pages.is_empty())
                    .map(Box::new);
            }

            links.extend(links_ssg);

            // Record initial-crawl metadata used by status reporting.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
2824
2825    /// Run `cmd` and return stdout bytes.
2826    #[cfg(feature = "cmd")]
2827    pub async fn run_via_cmd(
2828        cmd: &std::path::Path,
2829        fixed_args: &[String],
2830        url: &str,
2831    ) -> std::io::Result<Vec<u8>> {
2832        use tokio::process::Command;
2833        let mut args: Vec<String> = Vec::with_capacity(fixed_args.len() + 1);
2834        let mut used_placeholder = false;
2835
2836        for a in fixed_args {
2837            if a.contains("{url}") {
2838                used_placeholder = true;
2839                args.push(a.replace("{url}", url));
2840            } else {
2841                args.push(a.clone());
2842            }
2843        }
2844
2845        if !used_placeholder {
2846            args.push(url.to_string());
2847        }
2848
2849        let out = Command::new(cmd)
2850            .args(&args)
2851            .kill_on_drop(true)
2852            .output()
2853            .await?;
2854
2855        if !out.status.success() {
2856            let code = out.status.code().unwrap_or(-1);
2857            let stderr = String::from_utf8_lossy(&out.stderr);
2858
2859            return Err(std::io::Error::new(
2860                std::io::ErrorKind::Other,
2861                format!("cmd exit={code} stderr={stderr}"),
2862            ));
2863        }
2864
2865        Ok(out.stdout)
2866    }
2867
    /// Start to crawl website concurrently using a cmd executable.
    /// - `cmd` is the executable (absolute preferred)
    /// - `cmd_args` are fixed args; can include "{url}" placeholder, otherwise url is appended.
    ///
    /// Establishes the base page via `_crawl_establish_cmd`, then fans out
    /// per-link fetch tasks on a `JoinSet` bounded by a semaphore, honoring
    /// throttle, budget, crawl-timeout, and shutdown signals. Discovered but
    /// uncrawled links are preserved in `extra_links`.
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    #[cfg(feature = "cmd")]
    pub async fn crawl_concurrent_cmd(
        &mut self,
        cmd: std::path::PathBuf,
        cmd_args: Vec<String>,
        handle: &Option<Arc<AtomicI8>>,
    ) {
        self.start();
        self.status = CrawlStatus::Active;

        // Relative selectors used to resolve/scope links for this site.
        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        // Single-page mode: fetch just the target URL via the command and
        // publish the resulting page without crawling further.
        if self.single_page() {
            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
            let mut links_pages: Option<HashSet<CaseInsensitiveString>> =
                if self.configuration.return_page_links {
                    Some(HashSet::new())
                } else {
                    None
                };

            let mut relative_selectors = selector;
            let mut domain_parsed = None;

            // Prefer the parsed domain form of the URL when available.
            let target = self
                .domain_parsed
                .as_ref()
                .map(|u| u.as_str())
                .unwrap_or(self.get_url());

            let bytes = match Self::run_via_cmd(&cmd, &cmd_args, target).await {
                Ok(b) => b,
                Err(e) => {
                    // Command failed: publish a 502 page and stop.
                    let mut page = Page::default();
                    page.url = target.to_string();
                    page.status_code = StatusCode::BAD_GATEWAY;
                    #[cfg(feature = "page_error_status_details")]
                    {
                        page.error_for_status = Some(Err(e));
                    }
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return;
                }
            };

            let page = Page::new_page_streaming_from_bytes(
                target,
                &bytes,
                &mut relative_selectors,
                &self.configuration.external_domains_caseless,
                &PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                &mut links,
                None,
                &self.domain_parsed,
                &mut domain_parsed,
                &mut links_pages,
            )
            .await;

            channel_send_page(&self.channel, page, &self.channel_guard);
            return;
        }

        let on_should_crawl_callback = self.on_should_crawl_callback.clone();
        let return_page_links = self.configuration.return_page_links;
        let full_resources = self.configuration.full_resources;
        // Subscribe to the external queue channel when one is configured.
        let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

        let (mut interval, throttle) = self.setup_crawl();
        let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

        // Seed the frontier from the initial (base) page.
        links.extend(
            self._crawl_establish_cmd(cmd.clone(), cmd_args.clone(), &mut selector, false)
                .await,
        );

        self.configuration.configure_allowlist();
        let semaphore = self.setup_semaphore();

        // Shared, read-only state for spawned fetch tasks (by tuple index):
        // 0: cmd path, 1: cmd args, 2: selectors, 3: page channel,
        // 4: external caseless domains, 5: channel guard, 6: retry count,
        // 7: return_page_links, 8: link-build settings,
        // 9: parsed base domain, 10: on_link_find_callback.
        let shared = Arc::new((
            cmd,
            cmd_args,
            selector,
            self.channel.clone(),
            self.configuration.external_domains_caseless.clone(),
            self.channel_guard.clone(),
            self.configuration.retry,
            return_page_links,
            PageLinkBuildSettings::new_full(
                false,
                full_resources,
                self.configuration.subdomains,
                self.configuration.tld,
                self.configuration.normalize,
            ),
            self.domain_parsed.clone(),
            self.on_link_find_callback.clone(),
        ));

        // Each task yields the links it found plus an optional page signature.
        let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

        let mut exceeded_budget = false;
        // Zero throttle means fully concurrent dispatch (no inter-task sleep).
        let concurrency = throttle.is_zero();

        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

        if !concurrency && !links.is_empty() {
            tokio::time::sleep(*throttle).await;
        }

        // Start the crawl-timeout clock only when a timeout is configured.
        let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
            Some(Instant::now())
        } else {
            None
        };

        'outer: loop {
            // Drain the current frontier into a stream for this round.
            let mut stream =
                tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

            loop {
                if !concurrency {
                    tokio::time::sleep(*throttle).await;
                }

                let semaphore = get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                tokio::select! {
                    biased;

                    // Dispatch the next link only while permits remain and the
                    // crawl timeout has not elapsed.
                    Some(link) = stream.next(),
                    if semaphore.available_permits() > 0
                        && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) =>
                    {
                        // Shutdown/pause handling: on shutdown, return the
                        // permits held by in-flight tasks and stash the
                        // remaining frontier.
                        if !self.handle_process(handle, &mut interval, async {
                            emit_log_shutdown(link.inner());
                            let permits = set.len();
                            set.shutdown().await;
                            semaphore.add_permits(permits);
                        }).await {
                            while let Some(links) = stream.next().await {
                                self.extra_links.insert(links);
                            }
                            break 'outer;
                        }

                        let allowed = self.is_allowed(&link);
                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                            exceeded_budget = true;
                            break;
                        }
                        if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                            continue;
                        }

                        emit_log(link.inner());
                        self.insert_link(link.clone()).await;

                        if let Ok(permit) = semaphore.clone().acquire_owned().await {
                            let shared = shared.clone();
                            let on_should_crawl_callback = on_should_crawl_callback.clone();
                            spawn_set("page_fetch_cmd", &mut set, async move {
                                // Let the link-find callback rewrite the link.
                                let link_result = match &shared.10 {
                                    Some(cb) => cb(link, None),
                                    _ => (link, None),
                                };

                                let mut out_links: HashSet<CaseInsensitiveString> = HashSet::new();
                                let mut links_pages = if shared.7 { Some(HashSet::new()) } else { None };

                                let mut relative_selectors = shared.2.clone();
                                let mut r_settings = shared.8;
                                r_settings.ssg_build = true;

                                let target_url = link_result.0.as_ref();

                                // Run cmd -> bytes with retry
                                let mut retry_count = shared.6;
                                let mut last_err: Option<std::io::Error> = None;

                                let bytes = loop {
                                    match Self::run_via_cmd(&shared.0, &shared.1, target_url).await {
                                        Ok(b) if !b.is_empty() => break Some(b),
                                        Ok(_) => {
                                            // Empty stdout counts as a failure.
                                            last_err = Some(std::io::Error::new(
                                                std::io::ErrorKind::UnexpectedEof,
                                                "cmd returned empty stdout",
                                            ));
                                        }
                                        Err(e) => {
                                            last_err = Some(e);
                                        }
                                    }

                                    if retry_count == 0 { break None; }
                                    retry_count -= 1;

                                    tokio::time::sleep(std::time::Duration::from_millis(250)).await;
                                };

                                let mut domain_parsed = None;

                                let mut page = if let Some(bytes) = bytes {
                                    Page::new_page_streaming_from_bytes(
                                        target_url,
                                        &bytes,
                                        &mut relative_selectors,
                                        &shared.4,
                                        &r_settings,
                                        &mut out_links,
                                        None,
                                        &shared.9,
                                        &mut domain_parsed,
                                        &mut links_pages,
                                    ).await
                                } else {
                                    // Build an error page
                                    let mut p = Page::default();
                                    p.url = target_url.to_string();
                                    p.status_code = StatusCode::BAD_GATEWAY;
                                    if let Some(e) = last_err {
                                        #[cfg(feature = "page_error_status_details")]
                                        {
                                            p.error_for_status = Some(Err(e));
                                        }
                                    }
                                    p
                                };

                                if shared.7 {
                                    page.page_links = links_pages
                                        .filter(|pages| !pages.is_empty())
                                        .map(Box::new);
                                }

                                // Still publish rejected pages, flagged as blocked,
                                // but contribute no links to the frontier.
                                if let Some(ref cb) = on_should_crawl_callback {
                                    if !cb.call(&page) {
                                        page.blocked_crawl = true;
                                        channel_send_page(&shared.3, page, &shared.5);
                                        drop(permit);
                                        return Default::default();
                                    }
                                }

                                let signature = page.signature;
                                channel_send_page(&shared.3, page, &shared.5);
                                drop(permit);

                                (out_links, signature)
                            });
                        }

                        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                    },

                    // Fold finished tasks back into the frontier, honoring
                    // signature-based deduplication when a signature is present.
                    Some(result) = set.join_next(), if !set.is_empty() => {
                        if let Ok(res) = result {
                            match res.1 {
                                Some(signature) => {
                                    if self.is_signature_allowed(signature).await {
                                        self.insert_signature(signature).await;
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                                _ => {
                                    self.links_visited.extend_links(&mut links, res.0);
                                }
                            }
                        } else {
                            break;
                        }
                    }

                    else => break,
                }

                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                // Round is done when the frontier and task set are empty, or
                // the crawl budget is exhausted (then preserve leftovers).
                if (links.is_empty() && set.is_empty()) || exceeded_budget {
                    if exceeded_budget {
                        while let Some(links) = stream.next().await {
                            self.extra_links.insert(links);
                        }
                        while let Some(links) = set.join_next().await {
                            if let Ok(links) = links {
                                self.extra_links.extend(links.0);
                            }
                        }
                    }
                    break 'outer;
                }
            }

            // Wait for subscribers and pick up queued links before the next round.
            self.subscription_guard().await;
            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if links.is_empty() && set.is_empty() {
                break;
            }
        }

        // Preserve any links never dispatched.
        if !links.is_empty() {
            self.extra_links.extend(links);
        }
    }
3187
3188    /// Build a page from a seed.
3189    #[allow(dead_code)]
3190    fn build_seed_page(&self) -> Option<Page> {
3191        if let Some(seeded_html) = self.get_seeded_html() {
3192            let mut page_response = PageResponse::default();
3193            page_response.content = Some(Box::new(seeded_html.as_bytes().to_vec()));
3194            Some(build(&self.url.inner(), page_response))
3195        } else {
3196            None
3197        }
3198    }
3199
    /// Expand links for crawl.
    ///
    /// Chrome establish for the initial crawl page: renders the base url
    /// (or seeded HTML when provided), applies the configured retry policy,
    /// re-seeds the relative selectors when the final navigation lands on a
    /// different domain, records the initial crawl state, emits the page to
    /// subscribers, and returns the links discovered on the page.
    #[cfg(all(
        not(feature = "decentralized"),
        feature = "chrome",
        not(feature = "glob")
    ))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
        chrome_page: &chromiumoxide::Page,
    ) -> HashSet<CaseInsensitiveString> {
        // Allow skipping the initial page entirely (e.g. resumed crawls).
        if self.skip_initial {
            return Default::default();
        }

        if self
            .is_allowed_default(&self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            // Configure browser events and request interception concurrently.
            let (_, intercept_handle) = tokio::join!(
                crate::features::chrome::setup_chrome_events(chrome_page, &self.configuration),
                self.setup_chrome_interception(&chrome_page)
            );

            // Prefer pre-seeded HTML when available to avoid a network fetch.
            let mut page = if let Some(seeded_html) = self.get_seeded_html() {
                Page::new_seeded(
                    &self.url.inner(),
                    &client,
                    &chrome_page,
                    &self.configuration.wait_for,
                    &self.configuration.screenshot,
                    false, // we use the initial about:blank page.
                    &self.configuration.openai_config,
                    &self.configuration.execution_scripts,
                    &self.configuration.automation_scripts,
                    &self.configuration.viewport,
                    &self.configuration.request_timeout,
                    &self.configuration.track_events,
                    self.configuration.referer.clone(),
                    self.configuration.max_page_bytes,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                    Some(seeded_html.clone()),
                    Some(&self.cookie_jar),
                    &self.configuration.remote_multimodal,
                )
                .await
            } else {
                Page::new(
                    &self.url.inner(),
                    &client,
                    &chrome_page,
                    &self.configuration.wait_for,
                    &self.configuration.screenshot,
                    false, // we use the initial about:blank page.
                    &self.configuration.openai_config,
                    &self.configuration.execution_scripts,
                    &self.configuration.automation_scripts,
                    &self.configuration.viewport,
                    &self.configuration.request_timeout,
                    &self.configuration.track_events,
                    self.configuration.referer.clone(),
                    self.configuration.max_page_bytes,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                    &self.configuration.remote_multimodal,
                )
                .await
            };

            let mut retry_count = self.configuration.retry;

            // A chrome error destination with an empty "successful" body while
            // proxies are configured indicates the proxy could not connect —
            // flag the page for retry with an unknown status code.
            if let Some(final_redirect_destination) = &page.final_redirect_destination {
                if final_redirect_destination == "chrome-error://chromewebdata/"
                    && page.status_code.is_success()
                    && page.is_empty()
                    && self.configuration.proxies.is_some()
                {
                    page.error_status = Some("Invalid proxy configuration.".into());
                    page.should_retry = true;
                    page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                }
            }

            // Retry the initial page until it succeeds or the budget is spent.
            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }
                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Bound gateway-timeout retries so a dead upstream cannot
                    // stall the crawl past BACKOFF_MAX_DURATION.
                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        let next_page = Page::new(
                            &self.url.inner(),
                            &client,
                            &chrome_page,
                            &self.configuration.wait_for,
                            &self.configuration.screenshot,
                            false, // we use the initial about:blank page.
                            &self.configuration.openai_config,
                            &self.configuration.execution_scripts,
                            &self.configuration.automation_scripts,
                            &self.configuration.viewport,
                            &self.configuration.request_timeout,
                            &self.configuration.track_events,
                            self.configuration.referer.clone(),
                            self.configuration.max_page_bytes,
                            self.configuration.get_cache_options(),
                            &self.configuration.cache_policy,
                            &self.configuration.remote_multimodal,
                        )
                        .await;
                        page.clone_from(&next_page);
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elasped}");
                    }
                } else {
                    let next_page = Page::new(
                        &self.url.inner(),
                        &client,
                        &chrome_page,
                        &self.configuration.wait_for,
                        &self.configuration.screenshot,
                        false, // we use the initial about:blank page.
                        &self.configuration.openai_config,
                        &self.configuration.execution_scripts,
                        &self.configuration.automation_scripts,
                        &self.configuration.viewport,
                        &self.configuration.request_timeout,
                        &self.configuration.track_events,
                        self.configuration.referer.clone(),
                        self.configuration.max_page_bytes,
                        self.configuration.get_cache_options(),
                        &self.configuration.cache_policy,
                        &self.configuration.remote_multimodal,
                    )
                    .await;
                    page.clone_from(&next_page);
                }

                // check the page again for final.
                if let Some(final_redirect_destination) = &page.final_redirect_destination {
                    if final_redirect_destination == "chrome-error://chromewebdata/"
                        && page.status_code.is_success()
                        && page.is_empty()
                        && self.configuration.proxies.is_some()
                    {
                        page.error_status = Some("Invalid proxy configuration.".into());
                        page.should_retry = true;
                        page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                    }
                }
            }

            // Wait (bounded to 10s) for the interception task; abort it if it
            // does not finish in time so it cannot leak past this crawl.
            if let Some(h) = intercept_handle {
                let abort_handle = h.abort_handle();
                if let Err(elasped) =
                    tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                {
                    log::warn!("Handler timeout exceeded {elasped}");
                    abort_handle.abort();
                }
            }

            // If navigation ended elsewhere, adopt the destination as the new
            // crawl base, rebuild the selectors, and keep the prior host in
            // base.2 so links to it remain in scope.
            if let Some(domain) = &page.final_redirect_destination {
                let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
                let prior_domain = self.domain_parsed.take();
                self.domain_parsed = parse_absolute_url(&domain);
                self.url = domain;

                let s = self.setup_selectors();
                base.0 = s.0;
                base.1 = s.1;

                if let Some(pdname) = prior_domain {
                    if let Some(dname) = pdname.host_str() {
                        base.2 = dname.into();
                    }
                }
            }

            emit_log(&self.url.inner());

            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            // Surface the initial url through the link-find callback when set.
            let url = match &self.on_link_find_callback {
                Some(cb) => cb(*self.url.clone(), None).0,
                _ => *self.url.clone(),
            };

            self.insert_link(url).await;

            // setup link tracking.
            if self.configuration.return_page_links && page.page_links.is_none() {
                page.page_links = Some(Box::new(Default::default()));
            }

            // XML documents (e.g. sitemaps) take the dedicated XML link stream.
            let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");

            let mut links = if !page.is_empty() && !xml_file {
                page.links_ssg(&base, &client, &self.domain_parsed).await
            } else {
                Default::default()
            };

            if xml_file {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            // Record first-response state for diagnostics/telemetry.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            // Give the caller a chance to veto crawling from this page.
            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
3438
3439    /// Expand links for crawl.
3440    #[cfg(all(not(feature = "decentralized"), feature = "chrome",))]
3441    pub async fn crawl_establish_chrome_one(
3442        &self,
3443        client: &Client,
3444        base: &mut RelativeSelectors,
3445        url: &Option<&str>,
3446        chrome_page: &chromiumoxide::Page,
3447    ) -> HashSet<CaseInsensitiveString> {
3448        if self
3449            .is_allowed_default(&self.get_base_link())
3450            .eq(&ProcessLinkStatus::Allowed)
3451        {
3452            let (_, intercept_handle) = tokio::join!(
3453                crate::features::chrome::setup_chrome_events(chrome_page, &self.configuration),
3454                self.setup_chrome_interception(&chrome_page)
3455            );
3456
3457            let mut page = Page::new(
3458                url.unwrap_or(&self.url.inner()),
3459                &client,
3460                &chrome_page,
3461                &self.configuration.wait_for,
3462                &self.configuration.screenshot,
3463                false, // we use the initial about:blank page.
3464                &self.configuration.openai_config,
3465                &self.configuration.execution_scripts,
3466                &self.configuration.automation_scripts,
3467                &self.configuration.viewport,
3468                &self.configuration.request_timeout,
3469                &self.configuration.track_events,
3470                self.configuration.referer.clone(),
3471                self.configuration.max_page_bytes,
3472                self.configuration.get_cache_options(),
3473                &self.configuration.cache_policy,
3474                &self.configuration.remote_multimodal,
3475            )
3476            .await;
3477
3478            let mut retry_count = self.configuration.retry;
3479
3480            if let Some(final_redirect_destination) = &page.final_redirect_destination {
3481                if final_redirect_destination == "chrome-error://chromewebdata/"
3482                    && page.status_code.is_success()
3483                    && page.is_empty()
3484                    && self.configuration.proxies.is_some()
3485                {
3486                    page.error_status = Some("Invalid proxy configuration.".into());
3487                    page.should_retry = true;
3488                    page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
3489                }
3490            }
3491
3492            while page.should_retry && retry_count > 0 {
3493                retry_count -= 1;
3494                if let Some(timeout) = page.get_timeout() {
3495                    tokio::time::sleep(timeout).await;
3496                }
3497                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
3498                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
3499                        let next_page = Page::new(
3500                            &self.url.inner(),
3501                            &client,
3502                            &chrome_page,
3503                            &self.configuration.wait_for,
3504                            &self.configuration.screenshot,
3505                            false, // we use the initial about:blank page.
3506                            &self.configuration.openai_config,
3507                            &self.configuration.execution_scripts,
3508                            &self.configuration.automation_scripts,
3509                            &self.configuration.viewport,
3510                            &self.configuration.request_timeout,
3511                            &self.configuration.track_events,
3512                            self.configuration.referer.clone(),
3513                            self.configuration.max_page_bytes,
3514                            self.configuration.get_cache_options(),
3515                            &self.configuration.cache_policy,
3516                            &self.configuration.remote_multimodal,
3517                        )
3518                        .await;
3519                        page.clone_from(&next_page);
3520                    })
3521                    .await
3522                    {
3523                        log::warn!("backoff timeout {elasped}");
3524                    }
3525                } else {
3526                    let next_page = Page::new(
3527                        &self.url.inner(),
3528                        &client,
3529                        &chrome_page,
3530                        &self.configuration.wait_for,
3531                        &self.configuration.screenshot,
3532                        false, // we use the initial about:blank page.
3533                        &self.configuration.openai_config,
3534                        &self.configuration.execution_scripts,
3535                        &self.configuration.automation_scripts,
3536                        &self.configuration.viewport,
3537                        &self.configuration.request_timeout,
3538                        &self.configuration.track_events,
3539                        self.configuration.referer.clone(),
3540                        self.configuration.max_page_bytes,
3541                        self.configuration.get_cache_options(),
3542                        &self.configuration.cache_policy,
3543                        &self.configuration.remote_multimodal,
3544                    )
3545                    .await;
3546                    page.clone_from(&next_page);
3547                }
3548
3549                // check the page again for final.
3550                if let Some(final_redirect_destination) = &page.final_redirect_destination {
3551                    if final_redirect_destination == "chrome-error://chromewebdata/"
3552                        && page.status_code.is_success()
3553                        && page.is_empty()
3554                        && self.configuration.proxies.is_some()
3555                    {
3556                        page.error_status = Some("Invalid proxy configuration.".into());
3557                        page.should_retry = true;
3558                        page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
3559                    }
3560                }
3561            }
3562
3563            if let Some(h) = intercept_handle {
3564                let abort_handle = h.abort_handle();
3565                if let Err(elasped) =
3566                    tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
3567                {
3568                    log::warn!("Handler timeout exceeded {elasped}");
3569                    abort_handle.abort();
3570                }
3571            }
3572
3573            if let Some(domain) = &page.final_redirect_destination {
3574                let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
3575                let s = self.setup_selectors();
3576
3577                base.0 = s.0;
3578                base.1 = s.1;
3579
3580                if let Some(pdname) = parse_absolute_url(&domain) {
3581                    if let Some(dname) = pdname.host_str() {
3582                        base.2 = dname.into();
3583                    }
3584                }
3585            }
3586
3587            emit_log(&self.url.inner());
3588
3589            if self.configuration.return_page_links && page.page_links.is_none() {
3590                page.page_links = Some(Box::new(Default::default()));
3591            }
3592
3593            let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");
3594
3595            let mut links = if !page.is_empty() && !xml_file {
3596                page.links_ssg(&base, &client, &self.domain_parsed).await
3597            } else {
3598                Default::default()
3599            };
3600
3601            if xml_file {
3602                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
3603                    .await;
3604            }
3605
3606            if let Some(ref cb) = self.on_should_crawl_callback {
3607                if !cb.call(&page) {
3608                    page.blocked_crawl = true;
3609                    channel_send_page(&self.channel, page, &self.channel_guard);
3610                    return Default::default();
3611                }
3612            }
3613
3614            channel_send_page(&self.channel, page, &self.channel_guard);
3615
3616            links
3617        } else {
3618            HashSet::new()
3619        }
3620    }
3621
3622    /// Expand links for crawl using WebDriver.
3623    #[cfg(all(feature = "webdriver", not(feature = "decentralized"), not(feature = "chrome")))]
3624    pub async fn crawl_establish_webdriver_one(
3625        &self,
3626        client: &Client,
3627        base: &mut RelativeSelectors,
3628        url: &Option<&str>,
3629        driver: &std::sync::Arc<thirtyfour::WebDriver>,
3630    ) -> HashSet<CaseInsensitiveString> {
3631        if self
3632            .is_allowed_default(&self.get_base_link())
3633            .eq(&ProcessLinkStatus::Allowed)
3634        {
3635            let timeout = self
3636                .configuration
3637                .webdriver_config
3638                .as_ref()
3639                .and_then(|c| c.timeout);
3640
3641            // Setup stealth events
3642            crate::features::webdriver::setup_driver_events(driver, &self.configuration).await;
3643
3644            let mut page = Page::new_page_webdriver(
3645                url.unwrap_or(&self.url.inner()),
3646                driver,
3647                timeout,
3648            )
3649            .await;
3650
3651            let mut retry_count = self.configuration.retry;
3652
3653            while page.should_retry && retry_count > 0 {
3654                retry_count -= 1;
3655                if let Some(timeout_duration) = page.get_timeout() {
3656                    tokio::time::sleep(timeout_duration).await;
3657                }
3658                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
3659                    if let Err(elapsed) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
3660                        let next_page = Page::new_page_webdriver(
3661                            &self.url.inner(),
3662                            driver,
3663                            timeout,
3664                        )
3665                        .await;
3666                        page.clone_from(&next_page);
3667                    })
3668                    .await
3669                    {
3670                        log::warn!("backoff timeout {elapsed}");
3671                    }
3672                } else {
3673                    let next_page = Page::new_page_webdriver(
3674                        &self.url.inner(),
3675                        driver,
3676                        timeout,
3677                    )
3678                    .await;
3679                    page.clone_from(&next_page);
3680                }
3681            }
3682
3683            if let Some(domain) = &page.final_redirect_destination {
3684                let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
3685                let s = self.setup_selectors();
3686
3687                base.0 = s.0;
3688                base.1 = s.1;
3689
3690                if let Some(pdname) = parse_absolute_url(&domain) {
3691                    if let Some(dname) = pdname.host_str() {
3692                        base.2 = dname.into();
3693                    }
3694                }
3695            }
3696
3697            emit_log(&self.url.inner());
3698
3699            if self.configuration.return_page_links && page.page_links.is_none() {
3700                page.page_links = Some(Box::new(Default::default()));
3701            }
3702
3703            let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");
3704
3705            let mut links = if !page.is_empty() && !xml_file {
3706                page.links_ssg(&base, &client, &self.domain_parsed).await
3707            } else {
3708                Default::default()
3709            };
3710
3711            if xml_file {
3712                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
3713                    .await;
3714            }
3715
3716            if let Some(ref cb) = self.on_should_crawl_callback {
3717                if !cb.call(&page) {
3718                    page.blocked_crawl = true;
3719                    channel_send_page(&self.channel, page, &self.channel_guard);
3720                    return Default::default();
3721                }
3722            }
3723
3724            channel_send_page(&self.channel, page, &self.channel_guard);
3725
3726            links
3727        } else {
3728            HashSet::new()
3729        }
3730    }
3731
3732    /// Expand links for crawl.
3733    #[cfg(all(not(feature = "glob"), feature = "decentralized"))]
3734    pub async fn crawl_establish(
3735        &mut self,
3736        client: &Client,
3737        _: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
3738        http_worker: bool,
3739    ) -> HashSet<CaseInsensitiveString> {
3740        // base_domain name passed here is for primary url determination and not subdomain.tld placement
3741        let links: HashSet<CaseInsensitiveString> = if self
3742            .is_allowed_default(&self.get_base_link())
3743            .eq(&ProcessLinkStatus::Allowed)
3744        {
3745            let link = self.url.inner();
3746
3747            let mut page = Page::new(
3748                &if http_worker && link.starts_with("https") {
3749                    link.replacen("https", "http", 1)
3750                } else {
3751                    link.to_string()
3752                },
3753                &client,
3754            )
3755            .await;
3756
3757            if let Some(sid) = page.signature {
3758                self.insert_signature(sid).await;
3759            }
3760
3761            self.insert_link(match &self.on_link_find_callback {
3762                Some(cb) => cb(*self.url.to_owned(), None).0,
3763                _ => *self.url.to_owned(),
3764            })
3765            .await;
3766
3767            self.initial_status_code = page.status_code;
3768            self.initial_html_length = page.get_html_bytes_u8().len();
3769            self.initial_anti_bot_tech = page.anti_bot_tech;
3770            self.initial_page_should_retry = page.should_retry;
3771            self.initial_page_waf_check = page.waf_check;
3772
3773            // todo: pass full links to the worker to return.
3774            if self.configuration.return_page_links {
3775                page.page_links = Some(page.links.clone().into());
3776            }
3777
3778            let links = HashSet::from(page.links.clone());
3779
3780            self.set_crawl_initial_status(&page, &links);
3781
3782            channel_send_page(&self.channel, page, &self.channel_guard);
3783
3784            links
3785        } else {
3786            HashSet::new()
3787        };
3788
3789        links
3790    }
3791
    /// Expand links for crawl.
    ///
    /// Glob + decentralized variant: expands the base url into its glob
    /// permutations, fetches each one through a worker, publishes every page,
    /// and returns the union of links the workers extracted.
    #[cfg(all(feature = "glob", feature = "decentralized"))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        _: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
        http_worker: bool,
    ) -> HashSet<CaseInsensitiveString> {
        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
        let expanded = self.get_expanded_links(&self.url.inner().as_str());
        self.configuration.configure_allowlist();

        for link in expanded {
            let allowed = self.is_allowed(&link);

            // Budget exhausted: stop expanding entirely.
            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                break;
            }
            // Blocked or already persisted to disk: skip this permutation.
            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                continue;
            }

            // Http workers cannot speak tls: swap the scheme before dispatch.
            let mut page = Page::new(
                &if http_worker && link.as_ref().starts_with("https") {
                    link.inner().replacen("https", "http", 1).to_string()
                } else {
                    link.inner().to_string()
                },
                &client,
            )
            .await;

            // Prefer the url the page reports (e.g. after redirects); fall
            // back to the requested link when the page has none.
            let u = page.get_url();
            let u = if u.is_empty() { link } else { u.into() };

            let link_result = match &self.on_link_find_callback {
                Some(cb) => cb(u, None),
                _ => (u, None),
            };

            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            self.insert_link(link_result.0).await;

            if self.configuration.return_page_links {
                page.page_links = Some(Default::default());
            }

            channel_send_page(&self.channel, page.clone(), &self.channel_guard);

            let page_links = HashSet::from(page.links);

            links.extend(page_links);
        }

        links
    }
3851
    /// Expand links for crawl.
    ///
    /// Glob + Chrome variant: expands the base url into its glob
    /// permutations, renders each one in the provided Chrome page, publishes
    /// every page, and returns the union of links discovered.
    #[cfg(all(feature = "glob", feature = "chrome", not(feature = "decentralized")))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
        page: &chromiumoxide::Page,
    ) -> HashSet<CaseInsensitiveString> {
        // Allow skipping the initial page entirely (e.g. resumed crawls).
        if self.skip_initial {
            return Default::default();
        }
        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
        let expanded = self.get_expanded_links(&self.url.inner().as_str());
        self.configuration.configure_allowlist();

        for link in expanded {
            let allowed = self.is_allowed(&link);

            // Budget exhausted: stop expanding entirely.
            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                break;
            }
            // Blocked or already persisted to disk: skip this permutation.
            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                continue;
            }

            let mut page = Page::new(
                &link.inner().as_str(),
                &client,
                &page,
                &self.configuration.wait_for,
                &self.configuration.screenshot,
                false, // we use the initial about:blank page.
                &self.configuration.openai_config,
                &self.configuration.execution_scripts,
                &self.configuration.automation_scripts,
                &self.configuration.viewport,
                &self.configuration.request_timeout,
                &self.configuration.track_events,
                self.configuration.referer.clone(),
                self.configuration.max_page_bytes,
                self.configuration.get_cache_options(),
                &self.configuration.cache_policy,
                &self.configuration.remote_multimodal,
            )
            .await;

            // Prefer the url the page reports (e.g. after redirects); fall
            // back to the requested link when the page has none.
            let u = page.get_url();
            let u = if u.is_empty() { link } else { u.into() };

            let link_result = match &self.on_link_find_callback {
                Some(cb) => cb(u, None),
                _ => (u, None),
            };

            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            self.insert_link(link_result.0).await;

            // NOTE(review): the two branches below deliberately differ in
            // ordering — when tracking page links, extraction happens before
            // the page is published; otherwise the page is published first.
            if self.configuration.return_page_links {
                page.page_links = Some(Default::default());
                let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);

                channel_send_page(&self.channel, page.clone(), &self.channel_guard);

                links.extend(next_links);
            } else {
                channel_send_page(&self.channel, page.clone(), &self.channel_guard);
                let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);

                links.extend(next_links);
            }
        }

        links
    }
3930
3931    /// Expand links for crawl.
3932    #[cfg(feature = "glob")]
3933    async fn _crawl_establish(
3934        &mut self,
3935        client: &Client,
3936        base: &mut RelativeSelectors,
3937        _: bool,
3938    ) -> HashSet<CaseInsensitiveString> {
3939        if self.skip_initial {
3940            return Default::default();
3941        }
3942        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
3943        let domain_name = self.url.inner();
3944        let expanded = self.get_expanded_links(&domain_name.as_str());
3945
3946        self.configuration.configure_allowlist();
3947
3948        for url in expanded {
3949            #[cfg(feature = "regex")]
3950            let url_ref: &CaseInsensitiveString = &url;
3951            #[cfg(not(feature = "regex"))]
3952            let url_ref: &CompactString = url.inner();
3953            if self
3954                .is_allowed_default(url_ref)
3955                .eq(&ProcessLinkStatus::Allowed)
3956            {
3957                let mut links_ssg = HashSet::new();
3958                let mut links_pages = if self.configuration.return_page_links {
3959                    Some(HashSet::new())
3960                } else {
3961                    None
3962                };
3963                let mut page_links_settings =
3964                    PageLinkBuildSettings::new(true, self.configuration.full_resources);
3965
3966                page_links_settings.subdomains = self.configuration.subdomains;
3967                page_links_settings.tld = self.configuration.tld;
3968                page_links_settings.normalize = self.configuration.normalize;
3969
3970                let mut domain_parsed = self.domain_parsed.take();
3971
3972                let mut page = Page::new_page_streaming(
3973                    &url,
3974                    client,
3975                    false,
3976                    base,
3977                    &self.configuration.external_domains_caseless,
3978                    &page_links_settings,
3979                    &mut links,
3980                    Some(&mut links_ssg),
3981                    &mut domain_parsed, // original domain
3982                    &mut self.domain_parsed,
3983                    &mut links_pages,
3984                )
3985                .await;
3986
3987                if self.domain_parsed.is_none() {
3988                    if let Some(mut domain_parsed) = domain_parsed.take() {
3989                        convert_abs_url(&mut domain_parsed);
3990                        self.domain_parsed.replace(domain_parsed);
3991                    }
3992                }
3993
3994                let mut retry_count = self.configuration.retry;
3995                let domains_caseless = &self.configuration.external_domains_caseless;
3996
3997                while page.should_retry && retry_count > 0 {
3998                    retry_count -= 1;
3999                    if let Some(timeout) = page.get_timeout() {
4000                        tokio::time::sleep(timeout).await;
4001                    }
4002
4003                    if page.status_code == StatusCode::GATEWAY_TIMEOUT {
4004                        let mut domain_parsed_clone = self.domain_parsed.clone();
4005
4006                        if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
4007                            page.clone_from(
4008                                &Page::new_page_streaming(
4009                                    &url,
4010                                    client,
4011                                    false,
4012                                    base,
4013                                    domains_caseless,
4014                                    &page_links_settings,
4015                                    &mut links,
4016                                    Some(&mut links_ssg),
4017                                    &mut domain_parsed,
4018                                    &mut domain_parsed_clone,
4019                                    &mut links_pages,
4020                                )
4021                                .await,
4022                            );
4023                        })
4024                        .await
4025                        {
4026                            log::info!("backoff gateway timeout exceeded {elasped}");
4027                        }
4028
4029                        self.domain_parsed = domain_parsed_clone;
4030                    } else {
4031                        page.clone_from(
4032                            &Page::new_page_streaming(
4033                                &url,
4034                                client,
4035                                false,
4036                                base,
4037                                &self.configuration.external_domains_caseless,
4038                                &page_links_settings,
4039                                &mut links,
4040                                Some(&mut links_ssg),
4041                                &mut domain_parsed,
4042                                &mut self.domain_parsed,
4043                                &mut links_pages,
4044                            )
4045                            .await,
4046                        );
4047                    }
4048                }
4049
4050                emit_log(&url);
4051
4052                if let Some(signature) = page.signature {
4053                    if !self.is_signature_allowed(signature).await {
4054                        return Default::default();
4055                    }
4056                    self.insert_signature(signature).await;
4057                }
4058
4059                self.insert_link(
4060                    self.on_link_find_callback
4061                        .as_ref()
4062                        .map(|cb| cb(*self.url.clone(), None).0)
4063                        .unwrap_or_else(|| *self.url.clone()),
4064                )
4065                .await;
4066
4067                if self.configuration.return_page_links {
4068                    page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
4069                }
4070
4071                links.extend(links_ssg);
4072
4073                self.initial_status_code = page.status_code;
4074                self.initial_html_length = page.get_html_bytes_u8().len();
4075                self.initial_anti_bot_tech = page.anti_bot_tech;
4076                self.initial_page_should_retry = page.should_retry;
4077                self.initial_page_waf_check = page.waf_check;
4078
4079                self.set_crawl_initial_status(&page, &links);
4080
4081                if let Some(ref cb) = self.on_should_crawl_callback {
4082                    if !cb.call(&page) {
4083                        page.blocked_crawl = true;
4084                        channel_send_page(&self.channel, page, &self.channel_guard);
4085                        return Default::default();
4086                    }
4087                }
4088
4089                channel_send_page(&self.channel, page, &self.channel_guard);
4090            }
4091        }
4092
4093        links
4094    }
4095
    /// Expand links for crawl.
    ///
    /// Performs the initial "smart" fetch of the base url: a plain HTTP request
    /// first (with retry/backoff, escalating to chrome rendering on selected
    /// retries), then extracts links via `smart_links`. Returns the links
    /// discovered on the initial page, or an empty set when the page is
    /// skipped, disallowed, or vetoed by the should-crawl callback.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    pub async fn crawl_establish_smart(
        &mut self,
        client: &Client,
        mut base: &mut RelativeSelectors,
        browser: &crate::features::chrome::OnceBrowser,
    ) -> HashSet<CaseInsensitiveString> {
        // Respect a pre-seeded crawl that wants to skip the initial page.
        if self.skip_initial {
            return Default::default();
        }

        let links: HashSet<CaseInsensitiveString> = if self
            .is_allowed_default(&self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            let url = self.url.inner();

            // Use a seeded page when one was provided, otherwise fetch over HTTP.
            let mut page = if let Some(seeded_page) = self.build_seed_page() {
                seeded_page
            } else {
                Page::new_page(&url, &client).await
            };

            let mut retry_count = self.configuration.retry;

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                // Honor any page-provided retry delay before attempting again.
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }
                let client_error = page.status_code.is_client_error();

                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Cap the backoff attempt so a hanging retry cannot stall the crawl.
                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        // Power-of-two retry counts escalate to chrome rendering;
                        // the rest retry over plain HTTP.
                        if retry_count.is_power_of_two() {
                            Website::render_chrome_page(
                                &self.configuration,
                                client,
                                &mut page,
                                url,
                                &self.domain_parsed,
                                browser,
                            )
                            .await;
                        } else {
                            let next_page = Page::new_page(url, &client).await;
                            page.clone_from(&next_page);
                        };
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elasped}");
                    }
                } else {
                    // Client errors (4xx) escalate straight to chrome rendering.
                    if retry_count.is_power_of_two() || client_error {
                        Website::render_chrome_page(
                            &self.configuration,
                            client,
                            &mut page,
                            url,
                            &self.domain_parsed,
                            browser,
                        )
                        .await
                    } else {
                        page.clone_from(&Page::new_page(url, &client).await);
                    }
                }
            }

            // Extract links from the page (smart mode may consult the shared browser).
            let (page_links, bytes_transferred): (HashSet<CaseInsensitiveString>, Option<f64>) =
                page.smart_links(
                    &base,
                    &self.configuration,
                    &self.domain_parsed,
                    &browser,
                    Some(&self.cookie_jar),
                )
                .await;

            // If the initial request redirected, rebase the selectors and crawl url.
            if let Some(domain) = &page.final_redirect_destination {
                let prior_domain = self.domain_parsed.take();
                crate::utils::modify_selectors(
                    &prior_domain,
                    domain,
                    &mut self.domain_parsed,
                    &mut self.url,
                    &mut base,
                    AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
                );
            }

            emit_log(&self.url.inner());

            // Track the page signature (used for duplicate detection elsewhere).
            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            // Record the base url as visited, letting the callback rewrite it first.
            self.insert_link(
                self.on_link_find_callback
                    .as_ref()
                    .map(|cb| cb(*self.url.clone(), None).0)
                    .unwrap_or_else(|| *self.url.clone()),
            )
            .await;

            let links = if !page_links.is_empty() {
                page_links
            } else {
                Default::default()
            };

            page.bytes_transferred = bytes_transferred;

            // Capture initial-response stats used for crawl diagnostics.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            if self.configuration.return_page_links {
                page.page_links = if links.is_empty() {
                    None
                } else {
                    Some(Box::new(links.clone()))
                };
            }

            // Allow the user callback to veto crawling; the blocked page is still emitted.
            if let Some(cb) = &mut self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        };

        links
    }
4244
    /// fetch the page with chrome
    ///
    /// Lazily initializes the shared browser, opens a fresh `about:blank` tab,
    /// wires up page events and request interception, renders `url`, and copies
    /// the rendered result into `page`. If the browser cannot be initialized or
    /// navigation to the blank tab fails, `page` is left untouched.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    pub async fn render_chrome_page(
        config: &Configuration,
        client: &Client,
        page: &mut Page,
        url: &str,
        base: &Option<Box<Url>>,
        browser: &crate::features::chrome::OnceBrowser,
    ) {
        // First caller pays the browser startup cost; later calls reuse it.
        if let Some(browser_controller) = browser
            .get_or_init(|| crate::website::Website::setup_browser_base(&config, &base, None))
            .await
        {
            if let Ok(chrome_page) = crate::features::chrome::attempt_navigation(
                "about:blank",
                &browser_controller.browser.0,
                &config.request_timeout,
                &browser_controller.browser.2,
                &config.viewport,
            )
            .await
            {
                // Configure page events and request interception concurrently.
                let (_, intercept_handle) = tokio::join!(
                    crate::features::chrome::setup_chrome_events(&chrome_page, &config),
                    crate::features::chrome::setup_chrome_interception_base(
                        &chrome_page,
                        config.chrome_intercept.enabled,
                        &config.auth_challenge_response,
                        config.chrome_intercept.block_visuals,
                        &url,
                    )
                );

                // Render the target url inside the pre-opened blank tab.
                let next_page = Page::new(
                    &url,
                    &client,
                    &chrome_page,
                    &config.wait_for,
                    &config.screenshot,
                    false, // we use the initial about:blank page.
                    &config.openai_config,
                    &config.execution_scripts,
                    &config.automation_scripts,
                    &config.viewport,
                    &config.request_timeout,
                    &config.track_events,
                    config.referer.clone(),
                    config.max_page_bytes,
                    config.get_cache_options(),
                    &config.cache_policy,
                    &config.remote_multimodal,
                )
                .await;

                page.clone_from(&next_page);

                // Wait up to 10s for the interception task, aborting it on timeout.
                if let Some(h) = intercept_handle {
                    let abort_handle = h.abort_handle();
                    if let Err(elasped) =
                        tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                    {
                        log::warn!("Handler timeout exceeded {elasped}");
                        abort_handle.abort();
                    }
                }
            }
        }
    }
4314
4315    /// Set the crawl status depending on crawl state. The crawl that only changes if the state is Start or Active.
4316    pub fn set_crawl_status(&mut self) {
4317        if self.status == CrawlStatus::Start || self.status == CrawlStatus::Active {
4318            self.status = if self.domain_parsed.is_none() {
4319                CrawlStatus::Invalid
4320            } else {
4321                CrawlStatus::Idle
4322            };
4323        }
4324    }
4325
4326    /// Setup the Semaphore for the crawl.
4327    pub fn setup_semaphore(&self) -> Arc<Semaphore> {
4328        if self.configuration.shared_queue {
4329            SEM_SHARED.clone()
4330        } else {
4331            Arc::new(Semaphore::const_new(
4332                self.configuration
4333                    .concurrency_limit
4334                    .unwrap_or(*DEFAULT_PERMITS),
4335            ))
4336        }
4337    }
4338
4339    /// Start to crawl website with async concurrency.
4340    pub async fn crawl(&mut self) {
4341        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4342            self.start();
4343            let (client, handle) = self.setup().await;
4344            let (handle, join_handle) = match handle {
4345                Some(h) => (Some(h.0), Some(h.1)),
4346                _ => (None, None),
4347            };
4348            self.crawl_concurrent(&client, &handle).await;
4349            self.sitemap_crawl_chain(&client, &handle, false).await;
4350            self.set_crawl_status();
4351            if let Some(h) = join_handle {
4352                h.abort()
4353            }
4354            self.client.replace(client);
4355        }
4356    }
4357
4358    /// Start to crawl website with async concurrency using the sitemap. This does not page forward into the request. This does nothing without the `sitemap` flag enabled.
4359    pub async fn crawl_sitemap(&mut self) {
4360        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4361            self.start();
4362            let (client, handle) = self.setup().await;
4363            let (handle, join_handle) = match handle {
4364                Some(h) => (Some(h.0), Some(h.1)),
4365                _ => (None, None),
4366            };
4367            self.sitemap_crawl(&client, &handle, false).await;
4368            self.set_crawl_status();
4369            if let Some(h) = join_handle {
4370                h.abort()
4371            }
4372            self.client.replace(client);
4373        }
4374    }
4375
4376    /// Start to crawl website with async concurrency using the sitemap. This does not page forward into the request. This does nothing without the `sitemap` and the `chrome` flag enabled.
4377    #[cfg(all(
4378        feature = "sitemap",
4379        feature = "chrome",
4380        not(feature = "decentralized")
4381    ))]
4382    pub async fn crawl_sitemap_chrome(&mut self) {
4383        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4384            self.start();
4385            let (client, handle) = self.setup().await;
4386            let (handle, join_handle) = match handle {
4387                Some(h) => (Some(h.0), Some(h.1)),
4388                _ => (None, None),
4389            };
4390            self.sitemap_crawl_chrome(&client, &handle, false).await;
4391            self.set_crawl_status();
4392            if let Some(h) = join_handle {
4393                h.abort()
4394            }
4395            self.client.replace(client);
4396        }
4397    }
4398
    /// Configures the website crawling process for concurrent execution with the ability to send it across threads for subscriptions.
    pub async fn configure_setup(&mut self) {
        // Mark the crawl active up front so subscribers observe a live state.
        self.status = CrawlStatus::Active;
        self.start();
        // NOTE(review): the (client, handle) pair returned by `setup` is dropped
        // here; presumably the `*_send` crawl methods rebuild or reuse
        // `self.client` afterwards — confirm against those callers.
        self.setup().await;
        self.configuration.configure_allowlist();
        // Flag that the shared configuration pass has completed.
        self.send_configured = true;
    }
4407
    /// Configures the website crawling process for concurrent execution with the ability to send it across threads for subscriptions without robot protection.
    /// You can manually call `website.configure_robots_parser` after.
    pub fn configure_setup_norobots(&mut self) {
        // Mark the crawl active up front so subscribers observe a live state.
        self.status = CrawlStatus::Active;
        self.start();
        // Base setup only — robots handling is deliberately skipped here.
        self.setup_base();
        self.configuration.configure_allowlist();
        // Flag that the shared configuration pass has completed.
        self.send_configured = true;
    }
4417
4418    #[cfg(not(feature = "decentralized"))]
4419    /// Initiates the website crawling http process concurrently with the ability to send it across threads for subscriptions.
4420    /// Ensure that `website.configure_setup()` has been called before executing this function.
4421    /// It checks the status to ensure it is not firewall-blocked before proceeding with concurrent crawling.
4422    /// You can pass in a manual url in order to setup a new crawl directly with pre-configurations ready.
4423    pub async fn crawl_raw_send(&self, url: Option<&str>) {
4424        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4425            let (client, handle) = (
4426                match &self.client {
4427                    Some(c) => c.to_owned(),
4428                    _ => self.configure_http_client(),
4429                },
4430                self.configure_handler(),
4431            );
4432            let (handle, join_handle) = match handle {
4433                Some(h) => (Some(h.0), Some(h.1)),
4434                _ => (None, None),
4435            };
4436            self.crawl_concurrent_raw_send(&client, &handle, &url).await;
4437            if let Some(h) = join_handle {
4438                h.abort()
4439            }
4440        }
4441    }
4442
4443    #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4444    /// Initiates the website crawling process concurrently with the ability to send it across threads for subscriptions.
4445    /// Use `website.configure_setup().await` before executing this function to re-use the initial setup.
4446    /// You can pass in a manual url in order to setup a new crawl directly with pre-configurations ready.
4447    pub async fn crawl_chrome_send(&self, url: Option<&str>) {
4448        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4449            let (client, handle) = (
4450                match &self.client {
4451                    Some(c) => c.to_owned(),
4452                    _ => self.configure_http_client(),
4453                },
4454                self.configure_handler(),
4455            );
4456            let (handle, join_handle) = match handle {
4457                Some(h) => (Some(h.0), Some(h.1)),
4458                _ => (None, None),
4459            };
4460            self.crawl_concurrent_send(&client, &handle, &url).await;
4461            if let Some(h) = join_handle {
4462                h.abort()
4463            }
4464        }
4465    }
4466
4467    #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4468    /// Initiates a single fetch with chrome for one page with the ability to send it across threads for subscriptions.
4469    pub async fn fetch_chrome(&self, url: Option<&str>) {
4470        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4471            let (client, handle) = (
4472                match &self.client {
4473                    Some(c) => c.to_owned(),
4474                    _ => self.configure_http_client(),
4475                },
4476                self.configure_handler(),
4477            );
4478            let (_handle, join_handle) = match handle {
4479                Some(h) => (Some(h.0), Some(h.1)),
4480                _ => (None, None),
4481            };
4482            self._fetch_chrome(&client, &url).await;
4483            if let Some(h) = join_handle {
4484                h.abort()
4485            }
4486        }
4487    }
4488
4489    #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4490    /// Initiates a single fetch with chrome without closing the browser for one page with the ability to send it across threads for subscriptions.
4491    pub async fn fetch_chrome_persisted(
4492        &self,
4493        url: Option<&str>,
4494        browser: &crate::features::chrome::BrowserController,
4495    ) {
4496        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4497            let (client, handle) = (
4498                match &self.client {
4499                    Some(c) => c.to_owned(),
4500                    _ => self.configure_http_client(),
4501                },
4502                self.configure_handler(),
4503            );
4504            let (_handle, join_handle) = match handle {
4505                Some(h) => (Some(h.0), Some(h.1)),
4506                _ => (None, None),
4507            };
4508            self._fetch_chrome_persisted(&client, &url, &browser).await;
4509            if let Some(h) = join_handle {
4510                h.abort()
4511            }
4512        }
4513    }
4514
4515    #[cfg(all(feature = "decentralized", feature = "smart"))]
4516    /// Start to crawl website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. This has no effect without the `smart` flag enabled.
4517    pub async fn crawl_smart(&mut self) {
4518        self.crawl().await;
4519    }
4520
4521    #[cfg(all(feature = "decentralized", not(feature = "smart")))]
4522    /// Start to crawl website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. This has no effect without the `smart` flag enabled.
4523    pub async fn crawl_smart(&mut self) {
4524        self.crawl().await;
4525    }
4526
4527    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
4528    /// Start to crawl website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. This has no effect without the `smart` flag enabled.
4529    pub async fn crawl_smart(&mut self) {
4530        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4531            self.start();
4532            let (client, handle) = self.setup().await;
4533            let (handle, join_handle) = match handle {
4534                Some(h) => (Some(h.0), Some(h.1)),
4535                _ => (None, None),
4536            };
4537            self.crawl_concurrent_smart(&client, &handle).await;
4538            self.set_crawl_status();
4539            if let Some(h) = join_handle {
4540                h.abort()
4541            }
4542            self.client.replace(client);
4543        }
4544    }
4545
    #[cfg(all(not(feature = "decentralized"), not(feature = "smart"))]
    /// Start to crawl website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. This has no effect without the `smart` flag enabled.
    pub async fn crawl_smart(&mut self) {
        // Without the `smart` feature this is simply the plain HTTP crawl.
        self.crawl().await
    }
4551
4552    /// Start to crawl website with async concurrency using the base raw functionality. Useful when using the `chrome` feature and defaulting to the basic implementation.
4553    pub async fn crawl_raw(&mut self) {
4554        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4555            self.start();
4556            let (client, handle) = self.setup().await;
4557            let (handle, join_handle) = match handle {
4558                Some(h) => (Some(h.0), Some(h.1)),
4559                _ => (None, None),
4560            };
4561            self.crawl_concurrent_raw(&client, &handle).await;
4562            self.sitemap_crawl_chain(&client, &handle, false).await;
4563            self.set_crawl_status();
4564            if let Some(h) = join_handle {
4565                h.abort()
4566            }
4567            self.client.replace(client);
4568        }
4569    }
4570
4571    /// Start to scrape/download website with async concurrency.
4572    pub async fn scrape(&mut self) {
4573        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4574            let mut w = self.clone();
4575            let mut rx2 = w.subscribe(0).expect("receiver enabled");
4576
4577            if self.pages.is_none() {
4578                self.pages = Some(Vec::new());
4579            }
4580
4581            // Signal channel to notify when crawl is done
4582            let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4583
4584            let crawl = async move {
4585                w.crawl().await;
4586                w.unsubscribe();
4587                // Signal that crawl is complete
4588                let _ = done_tx.send(());
4589            };
4590
4591            let sub = async {
4592                loop {
4593                    tokio::select! {
4594                        biased;
4595                        // Check if crawl is done first
4596                        _ = &mut done_rx => {
4597                            break;
4598                        }
4599                        result = rx2.recv() => {
4600                            if let Ok(page) = result {
4601                                if let Some(sid) = page.signature {
4602                                    self.insert_signature(sid).await;
4603                                }
4604                                self.insert_link(page.get_url().into()).await;
4605                                if let Some(p) = self.pages.as_mut() {
4606                                    p.push(page);
4607                                }
4608                            } else {
4609                                break;
4610                            }
4611                        }
4612                    }
4613                }
4614            };
4615
4616            tokio::join!(sub, crawl);
4617            // Unsubscribe from self to close the original channel for any external subscribers
4618            self.unsubscribe();
4619        }
4620    }
4621
4622    /// Start to crawl website with async concurrency using the base raw functionality. Useful when using the "chrome" feature and defaulting to the basic implementation.
4623    pub async fn scrape_raw(&mut self) {
4624        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4625            let mut w = self.clone();
4626            let mut rx2 = w.subscribe(0).expect("receiver enabled");
4627
4628            if self.pages.is_none() {
4629                self.pages = Some(Vec::new());
4630            }
4631
4632            let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4633
4634            let crawl = async move {
4635                w.crawl_raw().await;
4636                w.unsubscribe();
4637                let _ = done_tx.send(());
4638            };
4639
4640            let sub = async {
4641                loop {
4642                    tokio::select! {
4643                        biased;
4644                        _ = &mut done_rx => break,
4645                        result = rx2.recv() => {
4646                            if let Ok(page) = result {
4647                                if let Some(sid) = page.signature {
4648                                    self.insert_signature(sid).await;
4649                                }
4650                                self.insert_link(page.get_url().into()).await;
4651                                if let Some(p) = self.pages.as_mut() {
4652                                    p.push(page);
4653                                }
4654                            } else {
4655                                break;
4656                            }
4657                        }
4658                    }
4659                }
4660            };
4661
4662            tokio::join!(sub, crawl);
4663            self.unsubscribe();
4664        }
4665    }
4666
4667    /// Start to scrape website with async concurrency smart. Use HTTP first and JavaScript Rendering as needed. This has no effect without the `smart` flag enabled.
4668    pub async fn scrape_smart(&mut self) {
4669        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4670            let mut w = self.clone();
4671            let mut rx2 = w.subscribe(0).expect("receiver enabled");
4672
4673            if self.pages.is_none() {
4674                self.pages = Some(Vec::new());
4675            }
4676
4677            let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4678
4679            let crawl = async move {
4680                w.crawl_smart().await;
4681                w.unsubscribe();
4682                let _ = done_tx.send(());
4683            };
4684
4685            let sub = async {
4686                loop {
4687                    tokio::select! {
4688                        biased;
4689                        _ = &mut done_rx => break,
4690                        result = rx2.recv() => {
4691                            if let Ok(page) = result {
4692                                if let Some(sid) = page.signature {
4693                                    self.insert_signature(sid).await;
4694                                }
4695                                self.insert_link(page.get_url().into()).await;
4696                                if let Some(p) = self.pages.as_mut() {
4697                                    p.push(page);
4698                                }
4699                            } else {
4700                                break;
4701                            }
4702                        }
4703                    }
4704                }
4705            };
4706
4707            tokio::join!(sub, crawl);
4708            self.unsubscribe();
4709        }
4710    }
4711
    /// Start to scrape website sitemap with async concurrency. Use HTTP first and JavaScript Rendering as needed. This has no effect without the `sitemap` flag enabled.
    pub async fn scrape_sitemap(&mut self) {
        // Skip entirely when a prior crawl was blocked by a firewall.
        if !self.status.eq(&CrawlStatus::FirewallBlocked) {
            // Clone the website so the crawl can run concurrently while
            // `self` collects the pages broadcast by the clone.
            let mut w = self.clone();
            let mut rx2 = w.subscribe(0).expect("receiver enabled");

            // Ensure the page buffer exists before collection starts.
            if self.pages.is_none() {
                self.pages = Some(Vec::new());
            }

            // Oneshot used by the crawl task to tell the collector it finished.
            let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();

            // Drives the sitemap crawl on the clone, then signals completion.
            let crawl = async move {
                w.crawl_sitemap().await;
                w.unsubscribe();
                let _ = done_tx.send(());
            };

            // Collects pages broadcast by the crawl into `self`.
            let sub = async {
                loop {
                    tokio::select! {
                        // `biased` polls the done signal before the page channel, so
                        // collection stops as soon as the crawl reports completion.
                        // NOTE(review): pages still buffered in the broadcast channel
                        // at that point are not drained — confirm this is intended.
                        biased;
                        _ = &mut done_rx => break,
                        result = rx2.recv() => {
                            if let Ok(page) = result {
                                // Track the page signature for duplicate detection.
                                if let Some(sid) = page.signature {
                                    self.insert_signature(sid).await;
                                }
                                // Record the url as visited and store the page.
                                self.insert_link(page.get_url().into()).await;
                                if let Some(p) = self.pages.as_mut() {
                                    p.push(page);
                                }
                            } else {
                                // Channel closed (or receiver lagged) — stop collecting.
                                break;
                            }
                        }
                    }
                }
            };

            // Run collector and crawl concurrently to completion.
            tokio::join!(sub, crawl);
            self.unsubscribe();
        }
    }
4756
4757    /// Dequeue the links to a set
4758    async fn dequeue(
4759        &mut self,
4760        q: &mut Option<tokio::sync::broadcast::Receiver<String>>,
4761        links: &mut HashSet<CaseInsensitiveString>,
4762        exceeded_budget: &mut bool,
4763    ) {
4764        if let Some(q) = q {
4765            while let Ok(link) = q.try_recv() {
4766                let s = link.into();
4767                let allowed = self.is_allowed_budgetless(&s);
4768
4769                if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
4770                    *exceeded_budget = true;
4771                    break;
4772                }
4773
4774                if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&s).await {
4775                    continue;
4776                }
4777
4778                self.links_visited.extend_with_new_links(links, s);
4779            }
4780        }
4781    }
4782
    /// Start to crawl website concurrently - used mainly for chrome instances to connect to default raw HTTP.
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent_raw(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        // Mark the crawl started and the status active before any fetch work begins.
        self.start();
        self.status = CrawlStatus::Active;
        // Optional client pool; when present each spawned fetch picks the next client.
        let client_rotator = self.client_rotator.clone();
        // Relative selectors used to extract and normalize links from fetched pages.
        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();
        if self.single_page() {
            // Single-page mode: fetch only the root url and return.
            self._crawl_establish(client, &mut selector, false).await;
        } else {
            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
            let full_resources = self.configuration.full_resources;
            let return_page_links = self.configuration.return_page_links;
            let only_html = self.configuration.only_html && !full_resources;
            // Subscribe to the external link queue, if one is attached.
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

            let (mut interval, throttle) = self.setup_crawl();

            // Seed the frontier with leftover links plus the initial page's links.
            let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

            links.extend(self._crawl_establish(client, &mut selector, false).await);

            self.configuration.configure_allowlist();

            let semaphore = self.setup_semaphore();

            // Read-only state shared with spawned fetch tasks, by tuple index:
            // 0 client, 1 selectors, 2 page channel, 3 external domains,
            // 4 channel guard, 5 retry count, 6 full_resources,
            // 7 link-build settings, 8 parsed root domain,
            // 9 on_link_find callback, 10 remote multimodal config.
            let shared = Arc::new((
                client.to_owned(),
                selector,
                self.channel.clone(),
                self.configuration.external_domains_caseless.clone(),
                self.channel_guard.clone(),
                self.configuration.retry,
                self.configuration.full_resources,
                PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                self.domain_parsed.clone(),
                self.on_link_find_callback.clone(),
                self.configuration.remote_multimodal.clone(),
            ));

            // Each fetch task yields its discovered links and an optional page signature.
            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

            // track budgeting one time.
            let mut exceeded_budget = false;
            // Zero throttle means fully concurrent; otherwise sleep between dispatches.
            let concurrency = throttle.is_zero();

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if !concurrency && !links.is_empty() {
                tokio::time::sleep(*throttle).await;
            }

            // Start the clock only when a crawl timeout is configured.
            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                Some(Instant::now())
            } else {
                None
            };

            // Outer loop: drain the frontier in waves until no links and no tasks remain.
            'outer: loop {
                let mut stream =
                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

                loop {
                    if !concurrency {
                        tokio::time::sleep(*throttle).await;
                    }

                    let semaphore =
                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                    tokio::select! {
                        biased;
                        // Dispatch the next link only when a permit is free and the
                        // crawl timeout (if any) has not expired.
                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            // handle_process returns false when shutdown was requested:
                            // abort in-flight tasks and restore their permits.
                            if !self.handle_process(handle, &mut interval, async {
                                emit_log_shutdown(link.inner());
                                let permits = set.len();
                                set.shutdown().await;
                                semaphore.add_permits(permits);
                            }).await {
                                // Stash the undrained frontier before exiting.
                                while let Some(links) = stream.next().await {
                                    self.extra_links.insert(links);
                                }
                                break 'outer;
                            }
                            let allowed = self.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                exceeded_budget = true;
                                break;
                            }

                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                continue;
                            }

                            emit_log(link.inner());

                            // Mark visited before fetching so concurrent tasks skip it.
                            self.insert_link(link.clone()).await;

                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                let shared = shared.clone();
                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                let rotator = client_rotator.clone();
                                spawn_set("page_fetch", &mut set, async move {
                                    // Optionally rewrite the link via the find callback (shared.9).
                                    let link_result = match &shared.9 {
                                        Some(cb) => cb(link, None),
                                        _ => (link, None),
                                    };

                                    let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                    let mut links_pages = if return_page_links {
                                        Some(links.clone())
                                    } else {
                                        None
                                    };
                                    let mut relative_selectors = shared.1.clone();
                                    let mut r_settings = shared.7;
                                    r_settings.ssg_build = true;
                                    let target_url = link_result.0.as_ref();
                                    let external_domains_caseless = &shared.3;
                                    // Rotate clients when a pool is configured.
                                    let client = match &rotator {
                                        Some(r) => r.next(),
                                        None => &shared.0,
                                    };

                                    let mut domain_parsed = None;

                                    let mut page = Page::new_page_streaming(
                                        target_url,
                                        client, only_html,
                                        &mut relative_selectors,
                                        external_domains_caseless,
                                        &r_settings,
                                        &mut links,
                                        None,
                                        &shared.8,
                                        &mut domain_parsed,
                                        &mut links_pages).await;

                                    let mut retry_count = shared.5;

                                    // Retry failed fetches up to the configured count; gateway
                                    // timeouts additionally capped by BACKOFF_MAX_DURATION.
                                    while page.should_retry && retry_count > 0 {
                                        retry_count -= 1;

                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }

                                        let retry_client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };

                                        // NOTE(review): `elasped` is a typo for `elapsed`
                                        // (local binding name only; no behavior impact).
                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                let mut domain_parsed = None;
                                                let next_page = Page::new_page_streaming(
                                                    target_url,
                                                    retry_client, only_html,
                                                    &mut relative_selectors.clone(),
                                                    external_domains_caseless,
                                                    &r_settings,
                                                    &mut links,
                                                    None,
                                                    &shared.8,
                                                    &mut domain_parsed,
                                                    &mut links_pages).await;

                                                page.clone_from(&next_page);

                                            }).await
                                        {
                                            log::warn!("Handler timeout exceeded {elasped}");
                                        }

                                        } else {
                                            page.clone_from(&Page::new_page_streaming(
                                                target_url,
                                                retry_client,
                                                only_html,
                                                &mut relative_selectors.clone(),
                                                external_domains_caseless,
                                                &r_settings,
                                                &mut links,
                                                None,
                                                &shared.8,
                                                &mut domain_parsed,
                                                &mut links_pages).await);
                                        }
                                    }

                                    if return_page_links {
                                        // Only attach page links when some were collected.
                                        page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
                                    }

                                    // Run remote multimodal extraction if configured (HTTP-only path)
                                    #[cfg(all(feature = "agent", feature = "serde"))]
                                    if shared.10.is_some() {
                                        let html = page.get_html();
                                        if !html.is_empty() {
                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
                                                &shared.10,
                                                &html,
                                                target_url,
                                                title,
                                            ).await {
                                                // Store usage on page
                                                match page.remote_multimodal_usage.as_mut() {
                                                    Some(v) => v.push(result.usage.clone()),
                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
                                                }
                                                // Store extracted data if available
                                                if result.extracted.is_some() || result.screenshot.is_some() {
                                                    let automation_result = result.to_automation_results();
                                                    match page.extra_remote_multimodal_data.as_mut() {
                                                        Some(v) => v.push(automation_result),
                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    // Allow the caller to veto the page post-fetch; the page is
                                    // still sent on the channel, flagged as blocked.
                                    if let Some(ref cb) = on_should_crawl_callback {
                                        if !cb.call(&page) {
                                            page.blocked_crawl = true;
                                            channel_send_page(&shared.2, page, &shared.4);
                                            drop(permit);
                                            return Default::default()
                                        }
                                    }

                                    let signature = page.signature;

                                    channel_send_page(&shared.2, page, &shared.4);

                                    drop(permit);

                                    (links, signature)
                                });
                            }

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        },
                        // A fetch task finished: merge its discovered links, gated by
                        // the page signature to avoid revisiting duplicate content.
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res.1 {
                                    Some(signature) => {
                                        if self.is_signature_allowed(signature).await {
                                            self.insert_signature(signature).await;
                                            self.links_visited.extend_links(&mut links, res.0);
                                        }
                                    }
                                    _ => {
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                            } else {
                                break;
                            }
                        }
                        // Both branches disabled: nothing left to fetch or join.
                        else => break,
                    }

                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                    // Stop when everything is drained, or flush remaining work
                    // into `extra_links` when the budget was exceeded.
                    if links.is_empty() && set.is_empty() || exceeded_budget {
                        // await for all tasks to complete.
                        if exceeded_budget {
                            while let Some(links) = stream.next().await {
                                self.extra_links.insert(links);
                            }
                            while let Some(links) = set.join_next().await {
                                if let Ok(links) = links {
                                    self.extra_links.extend(links.0);
                                }
                            }
                        }
                        break 'outer;
                    }
                }

                self.subscription_guard().await;
                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                if links.is_empty() && set.is_empty() {
                    break;
                }
            }

            // store the extra links.
            if !links.is_empty() {
                self.extra_links.extend(links);
            }
        }
    }
5091
5092    /// Start to crawl website concurrently.
5093    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
5094    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
5095    async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
5096        use crate::features::chrome::attempt_navigation;
5097        self.start();
5098
5099        match self.setup_browser().await {
5100            Some(mut b) => {
5101                match attempt_navigation(
5102                    "about:blank",
5103                    &b.browser.0,
5104                    &self.configuration.request_timeout,
5105                    &b.browser.2,
5106                    &self.configuration.viewport,
5107                )
5108                .await
5109                {
5110                    Ok(new_page) => {
5111                        let mut selectors = self.setup_selectors();
5112                        self.status = CrawlStatus::Active;
5113
5114                        if self.single_page() {
5115                            self.crawl_establish(&client, &mut selectors, false, &new_page)
5116                                .await;
5117                            drop(new_page);
5118                            self.subscription_guard().await;
5119                            b.dispose();
5120                        } else {
5121                            let semaphore: Arc<Semaphore> = self.setup_semaphore();
5122                            let (mut interval, throttle) = self.setup_crawl();
5123
5124                            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
5125
5126                            let base_links = self
5127                                .crawl_establish(&client, &mut selectors, false, &new_page)
5128                                .await;
5129
5130                            drop(new_page);
5131
5132                            let mut links: HashSet<CaseInsensitiveString> =
5133                                self.drain_extra_links().collect();
5134
5135                            links.extend(base_links);
5136
5137                            self.configuration.configure_allowlist();
5138
5139                            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
5140                                JoinSet::new();
5141
5142                            let shared = Arc::new((
5143                                client.to_owned(),
5144                                selectors,
5145                                self.channel.clone(),
5146                                self.configuration.external_domains_caseless.clone(),
5147                                self.channel_guard.clone(),
5148                                b.browser.0.clone(),
5149                                self.configuration.clone(),
5150                                self.url.inner().to_string(),
5151                                b.browser.2.clone(),
5152                                self.domain_parsed.clone(),
5153                                self.on_link_find_callback.clone(),
5154                            ));
5155
5156                            let add_external = shared.3.len() > 0;
5157                            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
5158                            let full_resources = self.configuration.full_resources;
5159                            let return_page_links = self.configuration.return_page_links;
5160                            let mut exceeded_budget = false;
5161                            let concurrency = throttle.is_zero();
5162
5163                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
5164
5165                            if !concurrency && !links.is_empty() {
5166                                tokio::time::sleep(*throttle).await;
5167                            }
5168
5169                            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
5170                                Some(Instant::now())
5171                            } else {
5172                                None
5173                            };
5174
5175                            'outer: loop {
5176                                let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
5177                                    links.drain().collect(),
5178                                );
5179
5180                                loop {
5181                                    if !concurrency {
5182                                        tokio::time::sleep(*throttle).await;
5183                                    }
5184
5185                                    let semaphore =
5186                                        get_semaphore(&semaphore, !self.configuration.shared_queue)
5187                                            .await;
5188
5189                                    tokio::select! {
5190                                        biased;
5191                                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker)  => {
5192                                            if !self
5193                                                .handle_process(
5194                                                    handle,
5195                                                    &mut interval,
5196                                                    async {
5197                                                        emit_log_shutdown(&link.inner());
5198                                                        let permits = set.len();
5199                                                        set.shutdown().await;
5200                                                        semaphore.add_permits(permits);
5201                                                    },
5202                                                )
5203                                                .await
5204                                            {
5205                                                break 'outer;
5206                                            }
5207
5208                                            let allowed = self.is_allowed(&link);
5209
5210                                            if allowed
5211                                                .eq(&ProcessLinkStatus::BudgetExceeded)
5212                                            {
5213                                                exceeded_budget = true;
5214                                                break;
5215                                            }
5216                                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
5217                                                continue;
5218                                            }
5219
5220                                            emit_log(&link.inner());
5221
5222                                            self.insert_link(link.clone()).await;
5223
5224                                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
5225                                                let shared = shared.clone();
5226                                                let on_should_crawl_callback = on_should_crawl_callback.clone();
5227                                                spawn_set("page_fetch", &mut set, async move {
5228                                                    let results = match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8, &shared.6.viewport).await {
5229                                                        Ok(new_page) => {
5230                                                            let (_, intercept_handle) = tokio::join!(
5231                                                                crate::features::chrome::setup_chrome_events(&new_page, &shared.6),
5232                                                                crate::features::chrome::setup_chrome_interception_base(
5233                                                                    &new_page,
5234                                                                    shared.6.chrome_intercept.enabled,
5235                                                                    &shared.6.auth_challenge_response,
5236                                                                    shared.6.chrome_intercept.block_visuals,
5237                                                                    &shared.7,
5238                                                                )
5239                                                            );
5240
5241                                                            let link_result =
5242                                                                match  &shared.10 {
5243                                                                    Some(cb) => cb(link, None),
5244                                                                    _ => (link, None),
5245                                                                };
5246
5247                                                            let target_url = link_result.0.as_ref();
5248
5249                                                            let mut page = Page::new(
5250                                                                &target_url,
5251                                                                &shared.0,
5252                                                                &new_page,
5253                                                                &shared.6.wait_for,
5254                                                                &shared.6.screenshot,
5255                                                                false,
5256                                                                &shared.6.openai_config,
5257                                                                &shared.6.execution_scripts,
5258                                                                &shared.6.automation_scripts,
5259                                                                &shared.6.viewport,
5260                                                                &shared.6.request_timeout,
5261                                                                &shared.6.track_events,
5262                                                                shared.6.referer.clone(),
5263                                                                shared.6.max_page_bytes,
5264                                                                shared.6.get_cache_options(),
5265                                                                &shared.6.cache_policy,
5266                                                                &shared.6.remote_multimodal,
5267                                                            )
5268                                                            .await;
5269
5270                                                            let mut retry_count = shared.6.retry;
5271
5272                                                            while page.should_retry && retry_count > 0 {
5273                                                                retry_count -= 1;
5274                                                                if let Some(timeout) = page.get_timeout() {
5275                                                                    tokio::time::sleep(timeout).await;
5276                                                                }
5277                                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
5278                                                                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
5279                                                                        let p = Page::new(
5280                                                                            &target_url,
5281                                                                            &shared.0,
5282                                                                            &new_page,
5283                                                                            &shared.6.wait_for,
5284                                                                            &shared.6.screenshot,
5285                                                                            false,
5286                                                                            &shared.6.openai_config,
5287                                                                            &shared.6.execution_scripts,
5288                                                                            &shared.6.automation_scripts,
5289                                                                            &shared.6.viewport,
5290                                                                            &shared.6.request_timeout,
5291                                                                            &shared.6.track_events,
5292                                                                            shared.6.referer.clone(),
5293                                                                            shared.6.max_page_bytes,
5294                                                                            shared.6.get_cache_options(),
5295                                                                            &shared.6.cache_policy,
5296                                                                            &shared.6.remote_multimodal,
5297                                                                        ).await;
5298                                                                        page.clone_from(&p);
5299
5300                                                                    }).await {
5301                                                                        log::info!("{target_url} backoff gateway timeout exceeded {elasped}");
5302                                                                    }
5303                                                                } else {
5304                                                                    page.clone_from(
5305                                                                        &Page::new(
5306                                                                            &target_url,
5307                                                                            &shared.0,
5308                                                                            &new_page,
5309                                                                            &shared.6.wait_for,
5310                                                                            &shared.6.screenshot,
5311                                                                            false,
5312                                                                            &shared.6.openai_config,
5313                                                                            &shared.6.execution_scripts,
5314                                                                            &shared.6.automation_scripts,
5315                                                                            &shared.6.viewport,
5316                                                                            &shared.6.request_timeout,
5317                                                                            &shared.6.track_events,
5318                                                                            shared.6.referer.clone(),
5319                                                                            shared.6.max_page_bytes,
5320                                                                            shared.6.get_cache_options(),
5321                                                                            &shared.6.cache_policy,
5322                                                                            &shared.6.remote_multimodal,
5323                                                                        )
5324                                                                        .await,
5325                                                                    );
5326                                                                }
5327                                                            }
5328
5329                                                            if let Some(h) = intercept_handle {
5330                                                                let abort_handle = h.abort_handle();
5331                                                                if let Err(elasped) = tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await {
5332                                                                    log::warn!("Handler timeout exceeded {elasped}");
5333                                                                    abort_handle.abort();
5334                                                                }
5335                                                            }
5336
5337                                                            if add_external {
5338                                                                page.set_external(shared.3.clone());
5339                                                            }
5340
5341                                                            let prev_domain = page.base;
5342
5343                                                            page.base = shared.9.as_deref().cloned();
5344
5345                                                            if return_page_links {
5346                                                                page.page_links = Some(Default::default());
5347                                                            }
5348
5349                                                            let links = if full_resources {
5350                                                                page.links_full(&shared.1, &shared.9).await
5351                                                            } else {
5352                                                                page.links(&shared.1, &shared.9).await
5353                                                            };
5354
5355                                                            page.base = prev_domain;
5356
5357                                                            if shared.6.normalize {
5358                                                                page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
5359                                                            }
5360
5361                                                            if let Some(ref cb) = on_should_crawl_callback {
5362                                                                if !cb.call(&page) {
5363                                                                    page.blocked_crawl = true;
5364                                                                    channel_send_page(&shared.2, page, &shared.4);
5365                                                                    drop(permit);
5366                                                                    return Default::default()
5367                                                                }
5368                                                            }
5369
5370                                                            let signature = page.signature;
5371
5372                                                            channel_send_page(
5373                                                                &shared.2, page, &shared.4,
5374                                                            );
5375
5376                                                            (links, signature)
5377                                                        }
5378                                                        _ => Default::default(),
5379                                                    };
5380
5381
5382                                                    drop(permit);
5383
5384                                                    results
5385                                                });
5386                                            }
5387
5388                                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
5389                                        }
5390                                        Some(result) = set.join_next(), if !set.is_empty() => {
5391                                            if let Ok(res) = result {
5392                                                match res.1 {
5393                                                    Some(signature) => {
5394                                                        if self.is_signature_allowed(signature).await {
5395                                                            self.insert_signature(signature).await;
5396                                                            self.links_visited.extend_links(&mut links, res.0);
5397                                                        }
5398                                                    }
5399                                                    _ => {
5400                                                        self.links_visited.extend_links(&mut links, res.0);
5401                                                    }
5402                                                }
5403                                            } else{
5404                                                break
5405                                            }
5406                                        }
5407                                        else => break,
5408                                    };
5409
5410                                    if links.is_empty() && set.is_empty() || exceeded_budget {
5411                                        if exceeded_budget {
5412                                            while set.join_next().await.is_some() {}
5413                                        }
5414                                        break 'outer;
5415                                    }
5416                                }
5417
5418                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
5419
5420                                if links.is_empty() && set.is_empty() {
5421                                    break;
5422                                }
5423                            }
5424
5425                            self.subscription_guard().await;
5426                            b.dispose();
5427                            // store the extra links.
5428                            if !links.is_empty() {
5429                                self.extra_links.extend(links);
5430                            }
5431                        }
5432                    }
5433                    Err(err) => {
5434                        b.dispose();
5435                        log::error!("{}", err)
5436                    }
5437                }
5438            }
5439            _ => log::error!("Chrome initialization failed."),
5440        }
5441    }
5442
5443    /// Start to crawl the website concurrently over raw HTTP (streaming page fetches, no
        /// browser), with the ability to send pages across threads for subscriptions.
        ///
        /// Clones `self` into a working `Website`, optionally re-targets it to `url`,
        /// performs the establishing crawl, then drives a budgeted, optionally-throttled
        /// crawl loop: links are streamed through a biased `tokio::select!`, fetched in
        /// spawned tasks gated by a semaphore, and newly discovered links are fed back in.
        /// Returns the crawled `Website` clone (visited links, signatures, etc.).
        ///
        /// NOTE(review): despite any resemblance to the chrome variant below, this path
        /// never touches a browser — pages come from `Page::new_page_streaming`.
5444    #[cfg_attr(
5445        all(feature = "tracing", not(feature = "decentralized")),
5446        tracing::instrument(skip_all)
5447    )]
5448    async fn crawl_concurrent_raw_send(
5449        &self,
5450        client: &Client,
5451        handle: &Option<Arc<AtomicI8>>,
5452        url: &Option<&str>,
5453    ) -> Website {
        // (base selector, extra selectors, path selector) used for relative-link resolution.
5454        let mut selector: (
5455            CompactString,
5456            smallvec::SmallVec<[CompactString; 2]>,
5457            CompactString,
5458        ) = self.setup_selectors();
5459
        // Work on a clone so the caller's `Website` stays untouched; the clone is returned.
5460        let mut website = self.clone();
5461
        // Optional re-target: if the requested URL is a prefix match of the configured
        // domain, only swap the URL (keep parsed-domain state); otherwise do a full reset.
5462        if let Some(u) = url {
5463            match &website.domain_parsed {
5464                Some(domain_url) => {
5465                    if domain_url.as_str().starts_with(u) {
5466                        website.set_url_only(u);
5467                    } else {
5468                        website.set_url(u);
5469                    }
5470                }
5471                _ => {
5472                    website.set_url(u);
5473                }
5474            }
5475        }
5476
5477        if !website.send_configured {
5478            website.configure_setup().await;
5479        }
5480
        // Single-page mode: just the establishing fetch, no crawl loop.
5481        if self.single_page() {
5482            website._crawl_establish(client, &mut selector, false).await;
5483            website
5484        } else {
5485            let client_rotator = self.client_rotator.clone();
5486            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
5487            let full_resources = self.configuration.full_resources;
5488            let return_page_links = self.configuration.return_page_links;
5489            let only_html = self.configuration.only_html && !full_resources;
5490            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
5491
5492            let (mut interval, throttle) = self.setup_crawl();
5493
            // Seed the frontier with leftover extra links plus the establishing crawl output.
5494            let mut links: HashSet<CaseInsensitiveString> = website.drain_extra_links().collect();
5495
5496            links.extend(website._crawl_establish(client, &mut selector, false).await);
5497
5498            let semaphore = self.setup_semaphore();
5499
            // Shared read-only state for spawned fetch tasks, addressed positionally:
            //   .0 client          .1 selectors        .2 subscription channel
            //   .3 external domains .4 channel guard   .5 retry count
            //   .6 full_resources  .7 link-build settings .8 parsed domain
            //   .9 on_link_find callback               .10 remote multimodal config
5500            let shared = Arc::new((
5501                client.to_owned(),
5502                selector,
5503                self.channel.clone(),
5504                self.configuration.external_domains_caseless.clone(),
5505                self.channel_guard.clone(),
5506                self.configuration.retry,
5507                self.configuration.full_resources,
5508                PageLinkBuildSettings::new_full(
5509                    false,
5510                    self.configuration.full_resources,
5511                    self.configuration.subdomains,
5512                    self.configuration.tld,
5513                    self.configuration.normalize,
5514                ),
5515                self.domain_parsed.clone(),
5516                self.on_link_find_callback.clone(),
5517                self.configuration.remote_multimodal.clone(),
5518            ));
5519
            // Each task yields (discovered links, optional page signature).
5520            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();
5521
5522            // track budgeting one time.
5523            let mut exceeded_budget = false;
5524            let concurrency = throttle.is_zero();
5525
5526            website
5527                .dequeue(&mut q, &mut links, &mut exceeded_budget)
5528                .await;
5529
5530            if !concurrency && !links.is_empty() {
5531                tokio::time::sleep(*throttle).await;
5532            }
5533
            // Only start the crawl-timeout clock when a timeout is configured.
5534            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
5535                Some(Instant::now())
5536            } else {
5537                None
5538            };
5539
5540            'outer: loop {
                // Drain the current frontier into a stream; new links accumulate in `links`
                // for the next 'outer iteration.
5541                let mut stream =
5542                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());
5543
5544                loop {
5545                    if !concurrency {
5546                        tokio::time::sleep(*throttle).await;
5547                    }
5548
5549                    let semaphore =
5550                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;
5551
                    // Biased select: prefer dispatching new links (gated on free permits and
                    // crawl timeout) over harvesting finished tasks.
5552                    tokio::select! {
5553                        biased;
5554                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker)   => {
                            // Shutdown path: abort in-flight tasks and give their permits back
                            // so the semaphore count stays balanced.
5555                            if !self.handle_process(handle, &mut interval, async {
5556                                emit_log_shutdown(link.inner());
5557                                let permits = set.len();
5558                                set.shutdown().await;
5559                                semaphore.add_permits(permits);
5560                            }).await {
5561                                break 'outer;
5562                            }
5563                            let allowed = website.is_allowed(&link);
5564
5565                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
5566                                exceeded_budget = true;
5567                                break;
5568                            }
5569
5570                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
5571                                continue;
5572                            }
5573
5574                            emit_log(link.inner());
5575
                            // Mark visited before spawning to avoid duplicate fetches.
5576                            website.insert_link(link.clone()).await;
5577
                            // Owned permit moves into the task; dropped there when done.
5578                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
5579                                let shared = shared.clone();
5580                                let on_should_crawl_callback = on_should_crawl_callback.clone();
5581                                let rotator = client_rotator.clone();
5582                                spawn_set("page_fetch", &mut set, async move {
                                    // Allow the on-link-find callback to rewrite the link.
5583                                    let link_result = match &shared.9 {
5584                                        Some(cb) => cb(link, None),
5585                                        _ => (link, None),
5586                                    };
5587
5588                                    let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
5589                                    let mut links_pages = if return_page_links {
5590                                        Some(links.clone())
5591                                    } else {
5592                                        None
5593                                    };
5594                                    let mut relative_selectors = shared.1.clone();
5595                                    let mut r_settings = shared.7;
                                    // NOTE(review): ssg_build is forced on for this streaming
                                    // path — presumably to also collect SSG build output links;
                                    // confirm against PageLinkBuildSettings semantics.
5596                                    r_settings.ssg_build = true;
5597                                    let target_url = link_result.0.as_ref();
5598                                    let external_domains_caseless = &shared.3;
                                    // Rotate clients per request when a rotator is configured.
5599                                    let client = match &rotator {
5600                                        Some(r) => r.next(),
5601                                        None => &shared.0,
5602                                    };
5603
5604                                    let mut domain_parsed = None;
5605
5606                                    let mut page = Page::new_page_streaming(
5607                                        target_url,
5608                                        client, only_html,
5609                                        &mut relative_selectors,
5610                                        external_domains_caseless,
5611                                        &r_settings,
5612                                        &mut links,
5613                                        None,
5614                                        &shared.8,
5615                                        &mut domain_parsed,
5616                                        &mut links_pages).await;
5617
5618                                    let mut retry_count = shared.5;
5619
                                    // Retry loop: honor the page's suggested timeout, then
                                    // refetch. Gateway timeouts additionally get a hard
                                    // BACKOFF_MAX_DURATION cap around the refetch.
5620                                    while page.should_retry && retry_count > 0 {
5621                                        retry_count -= 1;
5622
5623                                        if let Some(timeout) = page.get_timeout() {
5624                                            tokio::time::sleep(timeout).await;
5625                                        }
5626
5627                                        let retry_client = match &rotator {
5628                                            Some(r) => r.next(),
5629                                            None => &shared.0,
5630                                        };
5631
5632                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                            // NOTE(review): "elasped" is a typo for "elapsed"
                                            // (local binding only; no behavior impact).
5633                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
5634                                                let mut domain_parsed = None;
5635                                                let next_page = Page::new_page_streaming(
5636                                                    target_url,
5637                                                    retry_client, only_html,
5638                                                    &mut relative_selectors.clone(),
5639                                                    external_domains_caseless,
5640                                                    &r_settings,
5641                                                    &mut links,
5642                                                    None,
5643                                                    &shared.8,
5644                                                    &mut domain_parsed,
5645                                                    &mut links_pages).await;
5646
5647                                                page.clone_from(&next_page);
5648
5649                                            }).await
5650                                        {
5651                                            log::warn!("Handler timeout exceeded {elasped}");
5652                                        }
5653
5654                                        } else {
5655                                            page.clone_from(&Page::new_page_streaming(
5656                                                target_url,
5657                                                retry_client,
5658                                                only_html,
5659                                                &mut relative_selectors.clone(),
5660                                                external_domains_caseless,
5661                                                &r_settings,
5662                                                &mut links,
5663                                                None,
5664                                                &shared.8,
5665                                                &mut domain_parsed,
5666                                                &mut links_pages).await);
5667                                        }
5668                                    }
5669
5670                                    if return_page_links {
5671                                        page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
5672                                    }
5673
5674                                    // Run remote multimodal extraction if configured (HTTP-only path)
5675                                    #[cfg(all(feature = "agent", feature = "serde"))]
5676                                    if shared.10.is_some() {
5677                                        let html = page.get_html();
5678                                        if !html.is_empty() {
5679                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
5680                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
5681                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
5682                                                &shared.10,
5683                                                &html,
5684                                                target_url,
5685                                                title,
5686                                            ).await {
5687                                                // Store usage on page
5688                                                match page.remote_multimodal_usage.as_mut() {
5689                                                    Some(v) => v.push(result.usage.clone()),
5690                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
5691                                                }
5692                                                // Store extracted data if available
5693                                                if result.extracted.is_some() || result.screenshot.is_some() {
5694                                                    let automation_result = result.to_automation_results();
5695                                                    match page.extra_remote_multimodal_data.as_mut() {
5696                                                        Some(v) => v.push(automation_result),
5697                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
5698                                                    }
5699                                                }
5700                                            }
5701                                        }
5702                                    }
5703
                                    // Veto hook: page is still sent to subscribers (flagged as
                                    // blocked) but contributes no links back to the crawl.
5704                                    if let Some(ref cb) = on_should_crawl_callback {
5705                                        if !cb.call(&page) {
5706                                            page.blocked_crawl = true;
5707                                            channel_send_page(&shared.2, page, &shared.4);
5708                                            drop(permit);
5709                                            return Default::default()
5710                                        }
5711                                    }
5712
5713                                    let signature = page.signature;
5714
5715                                    channel_send_page(&shared.2, page, &shared.4);
5716
5717                                    drop(permit);
5718
5719                                    (links, signature)
5720                                });
5721                            }
5722
5723                            website.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
5724                        },
                        // Harvest a finished fetch: dedupe by content signature (when present)
                        // before folding its discovered links back into the frontier.
5725                        Some(result) = set.join_next(), if !set.is_empty() => {
5726                            if let Ok(res) = result {
5727                                match res.1 {
5728                                    Some(signature) => {
5729                                        if website.is_signature_allowed(signature).await {
5730                                            website.insert_signature(signature).await;
5731                                            website.links_visited.extend_links(&mut links, res.0);
5732                                        }
5733                                    }
5734                                    _ => {
5735                                        website.links_visited.extend_links(&mut links, res.0);
5736                                    }
5737                                }
5738                            } else {
5739                                break;
5740                            }
5741                        }
5742                        else => break,
5743                    }
5744
5745                    website
5746                        .dequeue(&mut q, &mut links, &mut exceeded_budget)
5747                        .await;
5748
5749                    if links.is_empty() && set.is_empty() || exceeded_budget {
5750                        // await for all tasks to complete.
5751                        if exceeded_budget {
5752                            while set.join_next().await.is_some() {}
5753                        }
5754                        break 'outer;
5755                    }
5756                }
5757
5758                website.subscription_guard().await;
5759                website
5760                    .dequeue(&mut q, &mut links, &mut exceeded_budget)
5761                    .await;
5762
5763                if links.is_empty() && set.is_empty() {
5764                    break;
5765                }
5766            }
5767            website
5768        }
5769    }
5770
5771    /// Start to crawl website concurrently with the ability to send it across threads for subscriptions.
5772    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
5773    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
5774    async fn crawl_concurrent_send(
5775        &self,
5776        client: &Client,
5777        handle: &Option<Arc<AtomicI8>>,
5778        url: &Option<&str>,
5779    ) -> Website {
5780        use crate::features::chrome::attempt_navigation;
5781
5782        match self.setup_browser().await {
5783            Some(mut b) => {
5784                match attempt_navigation(
5785                    "about:blank",
5786                    &b.browser.0,
5787                    &self.configuration.request_timeout,
5788                    &b.browser.2,
5789                    &self.configuration.viewport,
5790                )
5791                .await
5792                {
5793                    Ok(new_page) => {
5794                        let mut selectors = self.setup_selectors();
5795                        let mut website = self.to_owned();
5796
5797                        if let Some(u) = url {
5798                            match &website.domain_parsed {
5799                                Some(domain_url) => {
5800                                    if domain_url.as_str().starts_with(u) {
5801                                        website.set_url_only(u);
5802                                    } else {
5803                                        website.set_url(u);
5804                                    }
5805                                }
5806                                _ => {
5807                                    website.set_url(u);
5808                                }
5809                            }
5810                        }
5811
5812                        if !website.send_configured {
5813                            website.configure_setup().await;
5814                        }
5815
5816                        let base_links = website
5817                            .crawl_establish(&client, &mut selectors, false, &new_page)
5818                            .await;
5819
5820                        drop(new_page);
5821
5822                        if self.single_page() {
5823                            website.subscription_guard().await;
5824                            b.dispose();
5825                            website
5826                        } else {
5827                            let semaphore: Arc<Semaphore> = self.setup_semaphore();
5828                            let (mut interval, throttle) = self.setup_crawl();
5829
5830                            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
5831
5832                            let mut links: HashSet<CaseInsensitiveString> =
5833                                *self.extra_links.clone();
5834
5835                            links.extend(base_links);
5836
5837                            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
5838                                JoinSet::new();
5839
5840                            let shared = Arc::new((
5841                                client.to_owned(),
5842                                selectors,
5843                                self.channel.clone(),
5844                                self.configuration.external_domains_caseless.clone(),
5845                                self.channel_guard.clone(),
5846                                b.browser.0.clone(),
5847                                self.configuration.clone(),
5848                                self.url.inner().to_string(),
5849                                b.browser.2.clone(),
5850                                self.domain_parsed.clone(),
5851                                self.on_link_find_callback.clone(),
5852                            ));
5853
5854                            let add_external = shared.3.len() > 0;
5855                            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
5856                            let full_resources = self.configuration.full_resources;
5857                            let return_page_links = self.configuration.return_page_links;
5858                            let mut exceeded_budget = false;
5859                            let concurrency = throttle.is_zero();
5860
5861                            website
5862                                .dequeue(&mut q, &mut links, &mut exceeded_budget)
5863                                .await;
5864
5865                            if !concurrency && !links.is_empty() {
5866                                tokio::time::sleep(*throttle).await;
5867                            }
5868
5869                            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
5870                                Some(Instant::now())
5871                            } else {
5872                                None
5873                            };
5874
5875                            'outer: loop {
5876                                let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
5877                                    links.drain().collect(),
5878                                );
5879
5880                                loop {
5881                                    if !concurrency {
5882                                        tokio::time::sleep(*throttle).await;
5883                                    }
5884
5885                                    let semaphore =
5886                                        get_semaphore(&semaphore, !self.configuration.shared_queue)
5887                                            .await;
5888
5889                                    tokio::select! {
5890                                        biased;
5891                                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker)  => {
5892                                            if !self
5893                                                .handle_process(
5894                                                    handle,
5895                                                    &mut interval,
5896                                                    async {
5897                                                        emit_log_shutdown(&link.inner());
5898                                                        let permits = set.len();
5899                                                        set.shutdown().await;
5900                                                        semaphore.add_permits(permits);
5901                                                    },
5902                                                )
5903                                                .await
5904                                            {
5905                                                break 'outer;
5906                                            }
5907
5908                                            let allowed = website.is_allowed(&link);
5909
5910                                            if allowed
5911                                                .eq(&ProcessLinkStatus::BudgetExceeded)
5912                                            {
5913                                                exceeded_budget = true;
5914                                                break;
5915                                            }
5916                                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
5917                                                continue;
5918                                            }
5919
5920                                            emit_log(&link.inner());
5921
5922                                            website.insert_link(link.clone()).await;
5923
5924                                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
5925                                                let shared = shared.clone();
5926                                                let on_should_crawl_callback = on_should_crawl_callback.clone();
5927                                                spawn_set("page_fetch", &mut set, async move {
5928                                                    let results = match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8, &shared.6.viewport).await {
5929                                                        Ok(new_page) => {
5930                                                            let (_, intercept_handle) = tokio::join!(
5931                                                                crate::features::chrome::setup_chrome_events(&new_page, &shared.6),
5932                                                                crate::features::chrome::setup_chrome_interception_base(
5933                                                                    &new_page,
5934                                                                    shared.6.chrome_intercept.enabled,
5935                                                                    &shared.6.auth_challenge_response,
5936                                                                    shared.6.chrome_intercept.block_visuals,
5937                                                                    &shared.7,
5938                                                                )
5939                                                            );
5940
5941                                                            let link_result =
5942                                                                match &shared.10 {
5943                                                                    Some(cb) => cb(link, None),
5944                                                                    _ => (link, None),
5945                                                                };
5946
5947                                                            let target_url = link_result.0.as_ref();
5948
5949                                                            let mut page = Page::new(
5950                                                                &target_url,
5951                                                                &shared.0,
5952                                                                &new_page,
5953                                                                &shared.6.wait_for,
5954                                                                &shared.6.screenshot,
5955                                                                false,
5956                                                                &shared.6.openai_config,
5957                                                                &shared.6.execution_scripts,
5958                                                                &shared.6.automation_scripts,
5959                                                                &shared.6.viewport,
5960                                                                &shared.6.request_timeout,
5961                                                                &shared.6.track_events,
5962                                                                shared.6.referer.clone(),
5963                                                                shared.6.max_page_bytes,
5964                                                                shared.6.get_cache_options(),
5965                                                                &shared.6.cache_policy,
5966                                                                &shared.6.remote_multimodal,
5967                                                            )
5968                                                            .await;
5969
5970                                                            let mut retry_count = shared.6.retry;
5971
5972                                                            while page.should_retry && retry_count > 0 {
5973                                                                retry_count -= 1;
5974                                                                if let Some(timeout) = page.get_timeout() {
5975                                                                    tokio::time::sleep(timeout).await;
5976                                                                }
5977                                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
5978                                                                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
5979                                                                        let p = Page::new(
5980                                                                            &target_url,
5981                                                                            &shared.0,
5982                                                                            &new_page,
5983                                                                            &shared.6.wait_for,
5984                                                                            &shared.6.screenshot,
5985                                                                            false,
5986                                                                            &shared.6.openai_config,
5987                                                                            &shared.6.execution_scripts,
5988                                                                            &shared.6.automation_scripts,
5989                                                                            &shared.6.viewport,
5990                                                                            &shared.6.request_timeout,
5991                                                                            &shared.6.track_events,
5992                                                                            shared.6.referer.clone(),
5993                                                                            shared.6.max_page_bytes,
5994                                                                            shared.6.get_cache_options(),
5995                                                                            &shared.6.cache_policy,
5996                                                                            &shared.6.remote_multimodal,
5997                                                                        ).await;
5998                                                                        page.clone_from(&p);
5999
6000                                                                    }).await {
6001                                                                        log::info!("{target_url} backoff gateway timeout exceeded {elasped}");
6002                                                                    }
6003                                                                } else {
6004                                                                    page.clone_from(
6005                                                                        &Page::new(
6006                                                                            &target_url,
6007                                                                            &shared.0,
6008                                                                            &new_page,
6009                                                                            &shared.6.wait_for,
6010                                                                            &shared.6.screenshot,
6011                                                                            false,
6012                                                                            &shared.6.openai_config,
6013                                                                            &shared.6.execution_scripts,
6014                                                                            &shared.6.automation_scripts,
6015                                                                            &shared.6.viewport,
6016                                                                            &shared.6.request_timeout,
6017                                                                            &shared.6.track_events,
6018                                                                            shared.6.referer.clone(),
6019                                                                            shared.6.max_page_bytes,
6020                                                                            shared.6.get_cache_options(),
6021                                                                            &shared.6.cache_policy,
6022                                                                            &shared.6.remote_multimodal,
6023                                                                        )
6024                                                                        .await,
6025                                                                    );
6026                                                                }
6027                                                            }
6028
6029                                                            if let Some(h) = intercept_handle {
6030                                                                let abort_handle = h.abort_handle();
6031                                                                if let Err(elasped) = tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await {
6032                                                                    log::warn!("Handler timeout exceeded {elasped}");
6033                                                                    abort_handle.abort();
6034                                                                }
6035                                                            }
6036
6037                                                            if add_external {
6038                                                                page.set_external(shared.3.clone());
6039                                                            }
6040
6041                                                            let prev_domain = page.base;
6042
6043                                                            page.base = shared.9.as_deref().cloned();
6044
6045                                                            if return_page_links {
6046                                                                page.page_links = Some(Default::default());
6047                                                            }
6048
6049                                                            let links = if full_resources {
6050                                                                page.links_full(&shared.1, &shared.9).await
6051                                                            } else {
6052                                                                page.links(&shared.1, &shared.9).await
6053                                                            };
6054
6055                                                            page.base = prev_domain;
6056
6057                                                            if shared.6.normalize {
6058                                                                page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
6059                                                            }
6060
6061                                                            if let Some(ref cb) = on_should_crawl_callback {
6062                                                                if !cb.call(&page) {
6063                                                                    page.blocked_crawl = true;
6064                                                                    channel_send_page(&shared.2, page, &shared.4);
6065                                                                    drop(permit);
6066                                                                    return Default::default()
6067                                                                }
6068                                                            }
6069
6070                                                            let signature = page.signature;
6071
6072                                                            channel_send_page(
6073                                                                &shared.2, page, &shared.4,
6074                                                            );
6075
6076                                                            (links, signature)
6077                                                        }
6078                                                        _ => Default::default(),
6079                                                    };
6080
6081
6082                                                    drop(permit);
6083
6084                                                    results
6085                                                });
6086                                            }
6087
6088                                            website.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
6089                                        }
6090                                        Some(result) = set.join_next(), if !set.is_empty() => {
6091                                            if let Ok(res) = result {
6092                                                match res.1 {
6093                                                    Some(signature) => {
6094                                                        if website.is_signature_allowed(signature).await {
6095                                                            website.insert_signature(signature).await;
6096                                                            website.links_visited.extend_links(&mut links, res.0);
6097                                                        }
6098                                                    }
6099                                                    _ => {
6100                                                        website.links_visited.extend_links(&mut links, res.0);
6101                                                    }
6102                                                }
6103                                            } else{
6104                                                break
6105                                            }
6106                                        }
6107                                        else => break,
6108                                    };
6109
6110                                    if links.is_empty() && set.is_empty() || exceeded_budget {
6111                                        if exceeded_budget {
6112                                            while set.join_next().await.is_some() {}
6113                                        }
6114                                        break 'outer;
6115                                    }
6116                                }
6117
6118                                website
6119                                    .dequeue(&mut q, &mut links, &mut exceeded_budget)
6120                                    .await;
6121
6122                                if links.is_empty() && set.is_empty() {
6123                                    break;
6124                                }
6125                            }
6126
6127                            website.subscription_guard().await;
6128                            b.dispose();
6129
6130                            website
6131                        }
6132                    }
6133                    Err(err) => {
6134                        b.dispose();
6135                        log::error!("{}", err);
6136                        self.clone()
6137                    }
6138                }
6139            }
6140            _ => {
6141                log::error!("Chrome initialization failed.");
6142                self.clone()
6143            }
6144        }
6145    }
6146
6147    /// Start to crawl website concurrently with the ability to send it across threads for subscriptions for one page.
6148    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
6149    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6150    async fn _fetch_chrome(&self, client: &Client, url: &Option<&str>) {
6151        use crate::features::chrome::attempt_navigation;
6152
6153        match self.setup_browser().await {
6154            Some(mut b) => {
6155                match attempt_navigation(
6156                    "about:blank",
6157                    &b.browser.0,
6158                    &self.configuration.request_timeout,
6159                    &b.browser.2,
6160                    &self.configuration.viewport,
6161                )
6162                .await
6163                {
6164                    Ok(new_page) => {
6165                        let mut selectors = self.setup_selectors();
6166                        self.crawl_establish_chrome_one(&client, &mut selectors, url, &new_page)
6167                            .await;
6168                        self.subscription_guard().await;
6169                        b.dispose();
6170                    }
6171                    Err(err) => {
6172                        b.dispose();
6173                        log::error!("{}", err);
6174                    }
6175                }
6176            }
6177            _ => {
6178                log::error!("Chrome initialization failed.");
6179            }
6180        }
6181    }
6182
6183    /// Start to crawl website concurrently with the ability to send it across threads for subscriptions for one page.
6184    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
6185    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6186    async fn _fetch_chrome_persisted(
6187        &self,
6188        client: &Client,
6189        url: &Option<&str>,
6190        b: &crate::features::chrome::BrowserController,
6191    ) {
6192        use crate::features::chrome::attempt_navigation;
6193        match attempt_navigation(
6194            "about:blank",
6195            &b.browser.0,
6196            &self.configuration.request_timeout,
6197            &b.browser.2,
6198            &self.configuration.viewport,
6199        )
6200        .await
6201        {
6202            Ok(new_page) => {
6203                let mut selectors = self.setup_selectors();
6204                self.crawl_establish_chrome_one(&client, &mut selectors, url, &new_page)
6205                    .await;
6206                self.subscription_guard().await;
6207            }
6208            Err(err) => {
6209                log::error!("{}", err);
6210            }
6211        }
6212    }
6213
6214    /// Start to crawl website concurrently using WebDriver.
6215    #[cfg(all(not(feature = "decentralized"), not(feature = "chrome"), feature = "webdriver"))]
6216    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6217    async fn crawl_concurrent_webdriver(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
6218        self.start();
6219
6220        match self.setup_webdriver().await {
6221            Some(mut controller) => {
6222                let driver = controller.driver();
6223                let mut selectors = self.setup_selectors();
6224                self.status = CrawlStatus::Active;
6225
6226                if self.single_page() {
6227                    self.crawl_establish_webdriver_one(&client, &mut selectors, &None, driver)
6228                        .await;
6229                    self.subscription_guard().await;
6230                    controller.dispose();
6231                } else {
6232                    let semaphore: Arc<Semaphore> = self.setup_semaphore();
6233                    let (mut interval, throttle) = self.setup_crawl();
6234
6235                    let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
6236
6237                    let base_links = self
6238                        .crawl_establish_webdriver_one(&client, &mut selectors, &None, driver)
6239                        .await;
6240
6241                    let mut links: HashSet<CaseInsensitiveString> =
6242                        self.drain_extra_links().collect();
6243
6244                    links.extend(base_links);
6245
6246                    self.configuration.configure_allowlist();
6247
6248                    let timeout = self
6249                        .configuration
6250                        .webdriver_config
6251                        .as_ref()
6252                        .and_then(|c| c.timeout);
6253
6254                    let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
6255                        JoinSet::new();
6256
6257                    let shared = Arc::new((
6258                        client.to_owned(),
6259                        selectors,
6260                        self.channel.clone(),
6261                        self.configuration.external_domains_caseless.clone(),
6262                        self.channel_guard.clone(),
6263                        driver.clone(),
6264                        self.configuration.clone(),
6265                        self.url.inner().to_string(),
6266                        self.domain_parsed.clone(),
6267                        self.on_link_find_callback.clone(),
6268                        timeout,
6269                    ));
6270
6271                    let add_external = shared.3.len() > 0;
6272                    let on_should_crawl_callback = self.on_should_crawl_callback.clone();
6273                    let full_resources = self.configuration.full_resources;
6274                    let return_page_links = self.configuration.return_page_links;
6275                    let mut exceeded_budget = false;
6276                    let concurrency = throttle.is_zero();
6277
6278                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
6279
6280                    if !concurrency && !links.is_empty() {
6281                        tokio::time::sleep(*throttle).await;
6282                    }
6283
6284                    let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
6285                        Some(Instant::now())
6286                    } else {
6287                        None
6288                    };
6289
6290                    'outer: loop {
6291                        let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
6292                            links.drain().collect(),
6293                        );
6294
6295                        loop {
6296                            if !concurrency {
6297                                tokio::time::sleep(*throttle).await;
6298                            }
6299
6300                            let semaphore =
6301                                get_semaphore(&semaphore, !self.configuration.shared_queue)
6302                                    .await;
6303
6304                            tokio::select! {
6305                                biased;
6306                                Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
6307                                    if !self
6308                                        .handle_process(
6309                                            handle,
6310                                            &mut interval,
6311                                            async {
6312                                                emit_log_shutdown(&link.inner());
6313                                                let permits = set.len();
6314                                                set.shutdown().await;
6315                                                semaphore.add_permits(permits);
6316                                            },
6317                                        )
6318                                        .await
6319                                    {
6320                                        break 'outer;
6321                                    }
6322
6323                                    let allowed = self.is_allowed(&link);
6324
6325                                    if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
6326                                        exceeded_budget = true;
6327                                        break;
6328                                    }
6329                                    if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
6330                                        continue;
6331                                    }
6332
6333                                    emit_log(&link.inner());
6334
6335                                    self.insert_link(link.clone()).await;
6336
6337                                    if let Ok(permit) = semaphore.clone().acquire_owned().await {
6338                                        let shared = shared.clone();
6339                                        let on_should_crawl_callback = on_should_crawl_callback.clone();
6340
6341                                        spawn_set("page_fetch_webdriver", &mut set, async move {
6342                                            let link_result = match &shared.9 {
6343                                                Some(cb) => cb(link, None),
6344                                                _ => (link, None),
6345                                            };
6346
6347                                            let target_url = link_result.0.as_ref();
6348
6349                                            // Setup stealth events before navigation
6350                                            crate::features::webdriver::setup_driver_events(&shared.5, &shared.6).await;
6351
6352                                            let mut page = Page::new_page_webdriver(
6353                                                target_url,
6354                                                &shared.5,
6355                                                shared.10,
6356                                            )
6357                                            .await;
6358
6359                                            let mut retry_count = shared.6.retry;
6360
6361                                            while page.should_retry && retry_count > 0 {
6362                                                retry_count -= 1;
6363                                                if let Some(timeout_duration) = page.get_timeout() {
6364                                                    tokio::time::sleep(timeout_duration).await;
6365                                                }
6366                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
6367                                                    if let Err(elapsed) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
6368                                                        let p = Page::new_page_webdriver(
6369                                                            target_url,
6370                                                            &shared.5,
6371                                                            shared.10,
6372                                                        ).await;
6373                                                        page.clone_from(&p);
6374                                                    }).await {
6375                                                        log::info!("{target_url} backoff gateway timeout exceeded {elapsed}");
6376                                                    }
6377                                                } else {
6378                                                    page.clone_from(
6379                                                        &Page::new_page_webdriver(
6380                                                            target_url,
6381                                                            &shared.5,
6382                                                            shared.10,
6383                                                        )
6384                                                        .await,
6385                                                    );
6386                                                }
6387                                            }
6388
6389                                            if add_external {
6390                                                page.set_external(shared.3.clone());
6391                                            }
6392
6393                                            let prev_domain = page.base;
6394                                            page.base = shared.8.as_deref().cloned();
6395
6396                                            if return_page_links {
6397                                                page.page_links = Some(Default::default());
6398                                            }
6399
6400                                            let links = if full_resources {
6401                                                page.links_full(&shared.1, &shared.8).await
6402                                            } else {
6403                                                page.links(&shared.1, &shared.8).await
6404                                            };
6405
6406                                            page.base = prev_domain;
6407
6408                                            if shared.6.normalize {
6409                                                page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
6410                                            }
6411
6412                                            if let Some(ref cb) = on_should_crawl_callback {
6413                                                if !cb.call(&page) {
6414                                                    page.blocked_crawl = true;
6415                                                    channel_send_page(&shared.2, page, &shared.4);
6416                                                    drop(permit);
6417                                                    return Default::default();
6418                                                }
6419                                            }
6420
6421                                            let signature = page.signature;
6422
6423                                            channel_send_page(&shared.2, page, &shared.4);
6424
6425                                            drop(permit);
6426
6427                                            (links, signature)
6428                                        });
6429                                    }
6430
6431                                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
6432                                }
6433                                Some(result) = set.join_next(), if !set.is_empty() => {
6434                                    if let Ok(res) = result {
6435                                        match res.1 {
6436                                            Some(signature) => {
6437                                                if self.is_signature_allowed(signature).await {
6438                                                    self.insert_signature(signature).await;
6439                                                    self.links_visited.extend_links(&mut links, res.0);
6440                                                }
6441                                            }
6442                                            _ => {
6443                                                self.links_visited.extend_links(&mut links, res.0);
6444                                            }
6445                                        }
6446                                    } else {
6447                                        break
6448                                    }
6449
6450                                    if links.is_empty() && set.is_empty() || exceeded_budget {
6451                                        if exceeded_budget {
6452                                            while set.join_next().await.is_some() {}
6453                                        }
6454                                        break 'outer;
6455                                    }
6456                                }
6457                                else => break,
6458                            };
6459
6460                            if links.is_empty() && set.is_empty() || exceeded_budget {
6461                                if exceeded_budget {
6462                                    while set.join_next().await.is_some() {}
6463                                }
6464                                break 'outer;
6465                            }
6466                        }
6467
6468                        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
6469
6470                        if links.is_empty() && set.is_empty() {
6471                            break;
6472                        }
6473                    }
6474
6475                    self.subscription_guard().await;
6476                    controller.dispose();
6477
6478                    if !links.is_empty() {
6479                        self.extra_links.extend(links);
6480                    }
6481                }
6482            }
6483            None => {
6484                log::error!("WebDriver initialization failed.");
6485            }
6486        }
6487    }
6488
6489    /// Start to crawl website concurrently.
6490    #[cfg(all(not(feature = "decentralized"), not(feature = "chrome"), feature = "webdriver"))]
6491    pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
6492        // Use WebDriver if configured, otherwise fall back to raw HTTP
6493        if self.configuration.webdriver_config.is_some() {
6494            self.crawl_concurrent_webdriver(client, handle).await
6495        } else {
6496            self.crawl_concurrent_raw(client, handle).await
6497        }
6498    }
6499
    /// Start to crawl website concurrently.
    ///
    /// Fallback variant compiled when neither the `chrome` nor the `webdriver`
    /// feature is enabled; always uses the raw HTTP crawler.
    #[cfg(all(not(feature = "decentralized"), not(feature = "chrome"), not(feature = "webdriver")))]
    pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        self.crawl_concurrent_raw(client, handle).await
    }
6505
    /// Start to crawl website concurrently.
    ///
    /// Decentralized mode: each page fetch is delegated to an external
    /// `SPIDER_WORKER` service; only the discovered links are collected here.
    #[cfg(feature = "decentralized")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        // Subscribe to the external link queue, if one is configured.
        let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

        self.configuration.configure_allowlist();
        let domain = self.url.inner().as_str();
        let mut interval = Box::pin(tokio::time::interval(Duration::from_millis(10)));
        let throttle = Box::pin(self.get_delay());
        let on_link_find_callback = self.on_link_find_callback.clone();
        // http worker verify
        // If the worker endpoint is plain HTTP, https links are rewritten to
        // http before being handed to it (see the fetch task below).
        let http_worker = std::env::var("SPIDER_WORKER")
            .unwrap_or_else(|_| "http:".to_string())
            .starts_with("http:");

        // Seed the frontier with the links found on the start page.
        let mut links: HashSet<CaseInsensitiveString> = self
            .crawl_establish(
                &client,
                &mut (domain.into(), Default::default()),
                http_worker,
            )
            .await;

        let mut set: JoinSet<HashSet<CaseInsensitiveString>> = JoinSet::new();
        let mut exceeded_budget = false;

        // Outer loop: one iteration per frontier "wave"; inner loop drains the
        // current wave through the throttled stream.
        'outer: loop {
            let stream =
                tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect())
                    .throttle(*throttle);
            tokio::pin!(stream);

            loop {
                match stream.next().await {
                    Some(link) => {
                        // Honor pause/shutdown signals; on shutdown abort all
                        // in-flight fetch tasks before leaving.
                        if !self
                            .handle_process(handle, &mut interval, async {
                                emit_log_shutdown(&link.inner());
                                set.shutdown().await;
                            })
                            .await
                        {
                            break 'outer;
                        }

                        let allowed = self.is_allowed(&link);

                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                            exceeded_budget = true;
                            break;
                        }
                        if allowed.eq(&ProcessLinkStatus::Blocked)
                            || !self.is_allowed_disk(&link).await
                        {
                            continue;
                        }

                        emit_log(&link.inner());

                        // Mark visited before spawning so duplicates in the
                        // same wave are skipped.
                        self.insert_link(link.clone()).await;

                        if let Ok(permit) = SEM.acquire().await {
                            let client = client.clone();
                            let on_link_find_callback = on_link_find_callback.clone();

                            spawn_set("page_fetch", &mut set, async move {
                                let link_results = match &on_link_find_callback.clone() {
                                    Some(cb) => cb(link, None),
                                    _ => (link, None),
                                };
                                let link_results = link_results.0.as_ref();
                                // Downgrade https -> http when the worker only
                                // speaks plain HTTP.
                                let page = Page::new_links_only(
                                    &if http_worker && link_results.starts_with("https") {
                                        link_results.replacen("https", "http", 1).to_string()
                                    } else {
                                        link_results.to_string()
                                    },
                                    &client,
                                )
                                .await;

                                drop(permit);

                                page.links
                            });

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        }
                    }
                    _ => break,
                }
                if exceeded_budget {
                    break;
                }
            }

            // Wait for every outstanding fetch and fold its links into the
            // next wave (filtering already-visited ones).
            while let Some(res) = set.join_next().await {
                if let Ok(msg) = res {
                    self.links_visited.extend_links(&mut links, msg);
                }
            }

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if links.is_empty() || exceeded_budget {
                break;
            }
        }

        // Preserve any unprocessed frontier for a later crawl.
        if !links.is_empty() {
            self.extra_links.extend(links);
        }
    }
6620
6621    #[cfg(all(feature = "chrome", feature = "real_browser"))]
6622    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6623    /// Warm up the gemini model.
6624    pub async fn warm_up_gemini(&mut self) {
6625        use crate::features::chrome::attempt_navigation;
6626
6627        if let Some(mut b) = self.setup_browser().await {
6628            if let Ok(page) = attempt_navigation(
6629                "about:blank",
6630                &b.browser.0,
6631                &self.configuration.request_timeout,
6632                &b.browser.2,
6633                &self.configuration.viewport,
6634            )
6635            .await
6636            {
6637                let _ = crate::features::solvers::warm_gemini_model(&page).await;
6638                b.dispose();
6639            }
6640        }
6641    }
6642
6643    /// Start to crawl website concurrently using HTTP by default and chrome Javascript Rendering as needed. The glob feature does not work with this at the moment.
6644    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
6645    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6646    pub async fn crawl_concurrent_smart(
6647        &mut self,
6648        client: &Client,
6649        handle: &Option<Arc<AtomicI8>>,
6650    ) {
6651        use tokio::sync::OnceCell;
6652        self.start();
6653        self.status = CrawlStatus::Active;
6654        let browser: OnceBrowser = OnceCell::new();
6655
6656        let mut selectors: (
6657            CompactString,
6658            smallvec::SmallVec<[CompactString; 2]>,
6659            CompactString,
6660        ) = self.setup_selectors();
6661
6662        if self.single_page() {
6663            self.subscription_guard().await;
6664            self.crawl_establish_smart(&client, &mut selectors, &browser)
6665                .await;
6666        } else {
6667            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
6668
6669            let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();
6670
6671            let (mut interval, throttle) = self.setup_crawl();
6672            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
6673            let return_page_links = self.configuration.return_page_links;
6674
6675            links.extend(
6676                self.crawl_establish_smart(&client, &mut selectors, &browser)
6677                    .await,
6678            );
6679
6680            self.configuration.configure_allowlist();
6681
6682            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();
6683            let semaphore = self.setup_semaphore();
6684
6685            let shared = Arc::new((
6686                client.to_owned(),
6687                selectors,
6688                self.channel.clone(),
6689                self.channel_guard.clone(),
6690                self.configuration.clone(),
6691                self.domain_parsed.clone(),
6692                browser,
6693                self.on_link_find_callback.clone(),
6694                self.cookie_jar.clone(),
6695            ));
6696
6697            let add_external = self.configuration.external_domains_caseless.len() > 0;
6698            let mut exceeded_budget = false;
6699            let concurrency = throttle.is_zero();
6700
6701            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
6702
6703            if !concurrency && !links.is_empty() {
6704                tokio::time::sleep(*throttle).await;
6705            }
6706
6707            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
6708                Some(Instant::now())
6709            } else {
6710                None
6711            };
6712
6713            'outer: loop {
6714                let mut stream =
6715                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());
6716
6717                loop {
6718                    if !concurrency {
6719                        tokio::time::sleep(*throttle).await;
6720                    }
6721
6722                    let semaphore =
6723                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;
6724
6725                    tokio::select! {
6726                        biased;
6727                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker)  => {
6728                            if !self
6729                                .handle_process(
6730                                    handle,
6731                                    &mut interval,
6732                                    async {
6733                                        emit_log_shutdown(&link.inner());
6734                                        let permits = set.len();
6735                                        set.shutdown().await;
6736                                        semaphore.add_permits(permits);
6737
6738                                    },
6739                                )
6740                                .await
6741                            {
6742                                break 'outer;
6743                            }
6744
6745                            let allowed = self.is_allowed(&link);
6746
6747                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
6748                                exceeded_budget = true;
6749                                break;
6750                            }
6751                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
6752                                continue;
6753                            }
6754
6755                            emit_log(&link.inner());
6756                            self.insert_link(link.clone()).await;
6757
6758                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
6759                                let shared = shared.clone();
6760                                let on_should_crawl_callback = on_should_crawl_callback.clone();
6761                                spawn_set("page_fetch", &mut set, async move {
6762                                    let link_result = match &shared.7 {
6763                                        Some(cb) => cb(link, None),
6764                                        _ => (link, None),
6765                                    };
6766
6767                                    let url = link_result.0.as_ref();
6768                                    let mut page =
6769                                        Page::new_page(&url, &shared.0).await;
6770
6771                                    let mut retry_count = shared.4.retry;
6772
6773                                    while page.should_retry && retry_count > 0 {
6774                                        retry_count -= 1;
6775
6776                                        if let Some(timeout) = page.get_timeout() {
6777                                            tokio::time::sleep(timeout).await;
6778                                        }
6779
6780                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {
6781
6782                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
6783                                                if retry_count.is_power_of_two() {
6784                                                    Website::render_chrome_page(
6785                                                        &shared.4, &shared.0,
6786                                                         &mut page, url,
6787                                                         &shared.5,
6788                                                         &shared.6,
6789                                                    )
6790                                                    .await;
6791                                                } else {
6792                                                    let next_page =  Page::new_page(url, &shared.0).await;
6793
6794                                                    page.clone_from(&next_page)
6795                                                };
6796
6797                                            }).await
6798                                        {
6799                                            log::info!("backoff gateway timeout exceeded {elasped}");
6800                                        }
6801
6802                                        } else {
6803
6804                                            if retry_count.is_power_of_two() {
6805                                                Website::render_chrome_page(
6806                                                    &shared.4, &shared.0,
6807                                                    &mut page, url,
6808                                                    &shared.5,
6809                                                    &shared.6,
6810                                                )
6811                                                .await;
6812                                            } else {
6813                                                page.clone_from(
6814                                                    &Page::new_page(url, &shared.0)
6815                                                        .await,
6816                                                );
6817                                            }
6818                                        }
6819                                    }
6820
6821                                    if add_external {
6822                                        page.set_external(
6823                                            shared
6824                                                .4
6825                                                .external_domains_caseless
6826                                                .clone(),
6827                                        );
6828                                    }
6829
6830                                    let prev_domain = page.base;
6831
6832                                    page.base = shared.5.as_deref().cloned();
6833
6834                                    if return_page_links {
6835                                        page.page_links = Some(Default::default());
6836                                    }
6837
6838                                    let (links, bytes_transferred ) = page
6839                                        .smart_links(
6840                                            &shared.1, &shared.4, &shared.5, &shared.6, Some(&shared.8)
6841                                        )
6842                                        .await;
6843
6844                                    page.base = prev_domain;
6845                                    page.bytes_transferred = bytes_transferred;
6846
6847                                    if shared.4.normalize {
6848                                        page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
6849                                    }
6850
6851                                    // Run remote multimodal extraction if configured (smart HTTP path)
6852                                    #[cfg(all(feature = "agent", feature = "serde"))]
6853                                    if shared.4.remote_multimodal.is_some() {
6854                                        let html = page.get_html();
6855                                        if !html.is_empty() {
6856                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
6857                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
6858                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
6859                                                &shared.4.remote_multimodal,
6860                                                &html,
6861                                                url,
6862                                                title,
6863                                            ).await {
6864                                                // Store usage on page
6865                                                match page.remote_multimodal_usage.as_mut() {
6866                                                    Some(v) => v.push(result.usage.clone()),
6867                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
6868                                                }
6869                                                // Store extracted data if available
6870                                                if result.extracted.is_some() || result.screenshot.is_some() {
6871                                                    let automation_result = result.to_automation_results();
6872                                                    match page.extra_remote_multimodal_data.as_mut() {
6873                                                        Some(v) => v.push(automation_result),
6874                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
6875                                                    }
6876                                                }
6877                                            }
6878                                        }
6879                                    }
6880
6881                                    if let Some(ref cb) = on_should_crawl_callback {
6882                                        if !cb.call(&page) {
6883                                            page.blocked_crawl = true;
6884                                            channel_send_page(&shared.2, page, &shared.3);
6885                                            drop(permit);
6886                                            return Default::default()
6887                                        }
6888                                    }
6889
6890                                    let signature = page.signature;
6891
6892                                    channel_send_page(&shared.2, page, &shared.3);
6893
6894                                    drop(permit);
6895
6896                                    (links, signature)
6897                                });
6898                            }
6899
6900                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
6901                        }
6902                        Some(result) = set.join_next(), if !set.is_empty() => {
6903                            if let Ok(res) = result {
6904                                match res.1 {
6905                                    Some(signature) => {
6906                                        if self.is_signature_allowed(signature).await {
6907                                            self.insert_signature(signature).await;
6908                                            self.links_visited.extend_links(&mut links, res.0);
6909                                        }
6910                                    }
6911                                    _ => {
6912                                        self.links_visited.extend_links(&mut links, res.0);
6913                                    }
6914                                }
6915                            } else{
6916                                break
6917                            }
6918                        }
6919                        else => break,
6920                    }
6921
6922                    if links.is_empty() && set.is_empty() || exceeded_budget {
6923                        if exceeded_budget {
6924                            while set.join_next().await.is_some() {}
6925                        }
6926                        break 'outer;
6927                    }
6928                }
6929
6930                self.subscription_guard().await;
6931                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
6932
6933                if links.is_empty() && set.is_empty() {
6934                    break;
6935                }
6936            }
6937
6938            if !links.is_empty() {
6939                self.extra_links.extend(links);
6940            }
6941        }
6942    }
6943
    /// Sitemap crawl entire lists. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the `sitemap` flag.
    ///
    /// No-op stub compiled when the `sitemap` feature is disabled; all
    /// arguments are ignored.
    #[cfg(not(feature = "sitemap"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn sitemap_crawl(
        &mut self,
        _client: &Client,
        _handle: &Option<Arc<AtomicI8>>,
        _scrape: bool,
    ) {
    }
6954
    /// Sitemap crawl entire lists chain. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the `sitemap` flag.
    ///
    /// No-op stub compiled when the `sitemap` feature is disabled; all
    /// arguments are ignored.
    #[cfg(not(feature = "sitemap"))]
    pub async fn sitemap_crawl_chain(
        &mut self,
        _client: &Client,
        _handle: &Option<Arc<AtomicI8>>,
        _scrape: bool,
    ) {
    }
6964
6965    /// Setup the sitemap path
6966    #[cfg(feature = "sitemap")]
6967    pub(crate) fn get_sitemap_setup(&self, domain: &str) -> (&str, bool) {
6968        let (sitemap_path, needs_trailing) = match &self.configuration.sitemap_url {
6969            Some(sitemap_path) => {
6970                let sitemap_path = sitemap_path.as_str();
6971                if domain.ends_with('/') && sitemap_path.starts_with('/') {
6972                    (&sitemap_path[1..], false)
6973                } else if !domain.ends_with('/')
6974                    && !sitemap_path.is_empty()
6975                    && !sitemap_path.starts_with('/')
6976                {
6977                    (sitemap_path, true)
6978                } else {
6979                    (sitemap_path, false)
6980                }
6981            }
6982            _ => ("sitemap.xml", !domain.ends_with("/")),
6983        };
6984
6985        (sitemap_path, needs_trailing)
6986    }
6987
6988    /// Sitemap crawl entire lists. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the `sitemap` flag.
6989    #[cfg(feature = "sitemap")]
6990    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6991    pub(crate) async fn sitemap_crawl_raw(
6992        &mut self,
6993        client: &Client,
6994        handle: &Option<Arc<AtomicI8>>,
6995        scrape: bool,
6996    ) {
6997        let mut exceeded_budget = self.is_over_wild_budget(&self.configuration.budget);
6998
6999        if !exceeded_budget {
7000            let selectors = self.setup_selectors();
7001            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
7002            let domain = self.url.inner().as_str();
7003            self.domain_parsed = parse_absolute_url(&domain);
7004
7005            let persist_links = self.status == CrawlStatus::Start;
7006
7007            let mut interval: Interval = tokio::time::interval(Duration::from_millis(15));
7008
7009            let (sitemap_path, needs_trailing) = self.get_sitemap_setup(&domain);
7010
7011            self.configuration.sitemap_url = Some(Box::new(
7012                string_concat!(domain, if needs_trailing { "/" } else { "" }, sitemap_path).into(),
7013            ));
7014
7015            self.configuration.configure_allowlist();
7016
7017            let domain_parsed_ref = self.domain_parsed.as_deref().cloned().map(Box::new);
7018
7019            let shared = Arc::new((
7020                self.channel.clone(),
7021                self.channel_guard.clone(),
7022                selectors,
7023                domain_parsed_ref,
7024            ));
7025            let mut sitemaps = match &self.configuration.sitemap_url {
7026                Some(sitemap) => Vec::from([sitemap.to_owned()]),
7027                _ => Default::default(),
7028            };
7029
7030            let return_page_links = self.configuration.return_page_links;
7031
7032            let mut extra_links = self.extra_links.clone();
7033            self.dequeue(&mut q, &mut *extra_links, &mut exceeded_budget)
7034                .await;
7035            self.extra_links.clone_from(&extra_links);
7036
7037            let whitelist_changes = self.configuration.add_sitemap_to_whitelist();
7038
7039            if whitelist_changes.modified() {
7040                self.configuration.set_whitelist();
7041            }
7042
7043            'outer: loop {
7044                let stream =
7045                    tokio_stream::iter::<Vec<Box<CompactString>>>(sitemaps.drain(..).collect());
7046                tokio::pin!(stream);
7047
7048                let mut first_request = false;
7049                let mut attempted_correct = false;
7050
7051                while let Some(mut sitemap_url) = stream.next().await {
7052                    if !self.handle_process(handle, &mut interval, async {}).await {
7053                        break 'outer;
7054                    }
7055
7056                    let link = <CompactString as Clone>::clone(&(*sitemap_url)).into();
7057
7058                    let allowed = self.is_allowed_budgetless(&link);
7059
7060                    if allowed.eq(&ProcessLinkStatus::Blocked) {
7061                        continue;
7062                    }
7063
7064                    self.insert_link(link).await;
7065
7066                    let (tx, mut rx) = tokio::sync::mpsc::channel::<Page>(100);
7067
7068                    let shared = shared.clone();
7069
7070                    let handles = crate::utils::spawn_task("page_fetch", async move {
7071                        let mut pages = Vec::new();
7072
7073                        while let Some(mut page) = rx.recv().await {
7074                            if page.page_links.is_none() {
7075                                let links = page.links(&shared.2, &shared.3).await;
7076                                page.page_links = Some(links.into());
7077                            }
7078
7079                            if scrape || persist_links {
7080                                pages.push(page.clone());
7081                            };
7082
7083                            // reset the page links before sending to the main subscriber.
7084                            if !return_page_links {
7085                                page.page_links = None;
7086                            }
7087
7088                            if shared.0.is_some() {
7089                                channel_send_page(&shared.0, page, &shared.1);
7090                            }
7091                        }
7092
7093                        pages
7094                    });
7095
7096                    while !first_request {
7097                        // try to get the original sitemap if it had an error on the first request make a request to the root html and parse out the sitemap path.
7098                        match client.get(sitemap_url.as_str()).send().await {
7099                            Ok(response) => {
7100                                let limit = *crate::utils::MAX_SIZE_BYTES as u64;
7101
7102                                if let Some(response_content_length) = response.content_length() {
7103                                    if limit > 0 && response_content_length >= limit {
7104                                        // we need a error here
7105                                        first_request = true;
7106                                        log::info!(
7107                                            "{} exceeded parse limit: {:?}",
7108                                            sitemap_url,
7109                                            limit
7110                                        );
7111                                        break;
7112                                    }
7113                                }
7114
7115                                if response.status() == 404 {
7116                                    if !self
7117                                        .sitemap_parse(
7118                                            client,
7119                                            &mut first_request,
7120                                            &mut sitemap_url,
7121                                            &mut attempted_correct,
7122                                        )
7123                                        .await
7124                                    {
7125                                        break;
7126                                    }
7127                                } else {
7128                                    match response.bytes().await {
7129                                        Ok(b) => {
7130                                            first_request = true;
7131                                            self.sitemap_parse_crawl(
7132                                                client,
7133                                                handle,
7134                                                b,
7135                                                &mut interval,
7136                                                &mut exceeded_budget,
7137                                                &tx,
7138                                                &mut sitemaps,
7139                                                true,
7140                                            )
7141                                            .await;
7142                                        }
7143                                        Err(err) => {
7144                                            first_request = true;
7145                                            log::info!("http parse error: {:?}", err.to_string())
7146                                        }
7147                                    };
7148                                }
7149                            }
7150                            Err(err) => {
7151                                // do not retry error again.
7152                                if attempted_correct {
7153                                    first_request = true;
7154                                    break;
7155                                }
7156
7157                                log::info!("attempting to find sitemap path: {}", err.to_string());
7158
7159                                if !self
7160                                    .sitemap_parse(
7161                                        client,
7162                                        &mut first_request,
7163                                        &mut sitemap_url,
7164                                        &mut attempted_correct,
7165                                    )
7166                                    .await
7167                                {
7168                                    break;
7169                                }
7170                            }
7171                        };
7172                    }
7173
7174                    drop(tx);
7175
7176                    if let Ok(mut handle) = handles.await {
7177                        for page in handle.iter_mut() {
7178                            if let Some(mut links) = page.page_links.clone() {
7179                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
7180                                self.extra_links.extend(*links)
7181                            }
7182                        }
7183                        if scrape {
7184                            if let Some(p) = self.pages.as_mut() {
7185                                p.extend(handle);
7186                            }
7187                        }
7188                    }
7189
7190                    if exceeded_budget {
7191                        break;
7192                    }
7193                }
7194
7195                if sitemaps.len() == 0 || exceeded_budget {
7196                    break;
7197                }
7198            }
7199
7200            self.configuration
7201                .remove_sitemap_from_whitelist(whitelist_changes);
7202        }
7203    }
7204
7205    /// Sitemap crawl entire lists using chrome. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the `sitemap` flag.
7206    #[cfg(all(
7207        feature = "sitemap",
7208        feature = "chrome",
7209        not(feature = "decentralized")
7210    ))]
7211    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
7212    pub(crate) async fn sitemap_crawl_chrome(
7213        &mut self,
7214        client: &Client,
7215        handle: &Option<Arc<AtomicI8>>,
7216        scrape: bool,
7217    ) {
7218        use crate::features::chrome::attempt_navigation;
7219        use sitemap::{
7220            reader::{SiteMapEntity, SiteMapReader},
7221            structs::Location,
7222        };
7223
7224        let mut exceeded_budget = self.is_over_wild_budget(&self.configuration.budget);
7225
7226        if !exceeded_budget {
7227            if let Some(mut b) = self.setup_browser().await {
7228                let selectors = self.setup_selectors();
7229                let semaphore: Arc<Semaphore> = self.setup_semaphore();
7230                let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
7231                let domain = self.url.inner().as_str();
7232                self.domain_parsed = parse_absolute_url(&domain);
7233                let persist_links = self.status == CrawlStatus::Start;
7234
7235                let mut interval = tokio::time::interval(Duration::from_millis(15));
7236
7237                let (sitemap_path, needs_trailing) = self.get_sitemap_setup(&domain);
7238
7239                self.configuration.sitemap_url = Some(Box::new(
7240                    string_concat!(domain, if needs_trailing { "/" } else { "" }, sitemap_path)
7241                        .into(),
7242                ));
7243
7244                self.configuration.configure_allowlist();
7245
7246                let domain_parsed_ref = self.domain_parsed.as_deref().cloned().map(Box::new);
7247
7248                let shared = Arc::new((
7249                    self.channel.clone(),
7250                    self.channel_guard.clone(),
7251                    b.browser.0.clone(),
7252                    self.configuration.clone(),
7253                    self.url.inner().to_string(),
7254                    b.browser.2.clone(),
7255                    selectors.clone(),
7256                    domain_parsed_ref,
7257                ));
7258
7259                let mut sitemaps = match &self.configuration.sitemap_url {
7260                    Some(sitemap) => Vec::from([sitemap.to_owned()]),
7261                    _ => Default::default(),
7262                };
7263
7264                let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
7265                    Some(Instant::now())
7266                } else {
7267                    None
7268                };
7269
7270                let mut extra_links = self.extra_links.clone();
7271                self.dequeue(&mut q, &mut *extra_links, &mut exceeded_budget)
7272                    .await;
7273                self.extra_links.clone_from(&extra_links);
7274                let mut set: JoinSet<Option<Page>> = JoinSet::new();
7275
7276                let whitelist_changes = self.configuration.add_sitemap_to_whitelist();
7277
7278                if whitelist_changes.modified() {
7279                    self.configuration.set_whitelist();
7280                }
7281
7282                'outer: loop {
7283                    let stream: tokio_stream::Iter<std::vec::IntoIter<Box<CompactString>>> =
7284                        tokio_stream::iter::<Vec<Box<CompactString>>>(sitemaps.drain(..).collect());
7285                    tokio::pin!(stream);
7286
7287                    tokio::select! {
7288                        biased;
7289                        Some(sitemap_url) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker)  => {
7290                            if !self.handle_process(handle, &mut interval, async {}).await {
7291                                break 'outer;
7292                            }
7293
7294                            let link = <CompactString as Clone>::clone(&(*sitemap_url)).into();
7295
7296                            let allowed = self.is_allowed_budgetless(&link);
7297
7298                            if allowed.eq(&ProcessLinkStatus::Blocked) {
7299                                continue;
7300                            }
7301
7302                            self.insert_link(link).await;
7303
7304                            match attempt_navigation(
7305                                "about:blank",
7306                                &shared.2,
7307                                &self.configuration.request_timeout,
7308                                &shared.5,
7309                                &self.configuration.viewport,
7310                            )
7311                            .await {
7312                                Ok(new_page) => {
7313                                    let (_, intercept_handle) = tokio::join!(
7314                                        crate::features::chrome::setup_chrome_events(
7315                                            &new_page,
7316                                            &self.configuration
7317                                        ),
7318                                        self.setup_chrome_interception(&new_page)
7319                                    );
7320
7321                                    let mut page = Page::new(
7322                                        &sitemap_url,
7323                                        &client,
7324                                        &new_page,
7325                                        &self.configuration.wait_for,
7326                                        &self.configuration.screenshot,
7327                                        false, // we use the initial about:blank page.
7328                                        &self.configuration.openai_config,
7329                                        &self.configuration.execution_scripts,
7330                                        &self.configuration.automation_scripts,
7331                                        &self.configuration.viewport,
7332                                        &self.configuration.request_timeout,
7333                                        &self.configuration.track_events,
7334                                        self.configuration.referer.clone(),
7335                                        self.configuration.max_page_bytes,
7336                                        self.configuration.get_cache_options(),
7337                                        &self.configuration.cache_policy,
7338                                        &self.configuration.remote_multimodal,
7339                                    )
7340                                    .await;
7341
7342                                    if let Some(h) = intercept_handle {
7343                                        let abort_handle = h.abort_handle();
7344                                        if let Err(elasped) =
7345                                            tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
7346                                        {
7347                                            log::warn!("Handler timeout exceeded {elasped}");
7348                                            abort_handle.abort();
7349                                        }
7350                                    }
7351
7352                                    drop(new_page);
7353
7354                                    let is_xml_entry = page.get_html_bytes_u8().starts_with(b"<?xml");
7355                                    let is_xml = is_xml_entry
7356                                        && !page.get_html_bytes_u8().ends_with(b"</html>");
7357
7358                                    if is_xml {
7359                                        let reader = SiteMapReader::new(&*page.get_html_bytes_u8());
7360                                        let mut stream = tokio_stream::iter(reader);
7361
7362                                        while let Some(entity) = stream.next().await {
7363                                            if !self.handle_process(handle, &mut interval, async {}).await {
7364                                                break;
7365                                            }
7366                                            match entity {
7367                                                SiteMapEntity::Url(url_entry) => match url_entry.loc {
7368                                                    Location::Url(url) => {
7369                                                        let link: CaseInsensitiveString = url.as_str().into();
7370
7371                                                        let allowed = self.is_allowed(&link);
7372
7373                                                        if allowed.eq(&ProcessLinkStatus::Blocked) {
7374                                                            continue;
7375                                                        }
7376                                                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
7377                                                            exceeded_budget = true;
7378                                                            break;
7379                                                        }
7380
7381                                                        self.insert_link(link.clone()).await;
7382
7383                                                        let client = client.clone();
7384                                                        let shared = shared.clone();
7385
7386                                                        spawn_set("page_fetch", &mut set, async move {
7387                                                            if let Ok(new_page) = attempt_navigation(
7388                                                                "about:blank",
7389                                                                &shared.2,
7390                                                                &shared.3.request_timeout,
7391                                                                &shared.5,
7392                                                                &shared.3.viewport,
7393                                                            )
7394                                                            .await
7395                                                            {
7396                                                                let (_, intercept_handle) = tokio::join!(
7397                                                                    crate::features::chrome::setup_chrome_events(
7398                                                                        &new_page, &shared.3,
7399                                                                    ),
7400                                                                    crate::features::chrome::setup_chrome_interception_base(
7401                                                                        &new_page,
7402                                                                        shared.3.chrome_intercept.enabled,
7403                                                                        &shared.3.auth_challenge_response,
7404                                                                        shared.3.chrome_intercept.block_visuals,
7405                                                                        &shared.4,
7406                                                                    )
7407                                                                );
7408
7409                                                                let mut page = Page::new(
7410                                                                    &link.inner(),
7411                                                                    &client,
7412                                                                    &new_page,
7413                                                                    &shared.3.wait_for,
7414                                                                    &shared.3.screenshot,
7415                                                                    false,
7416                                                                    &shared.3.openai_config,
7417                                                                    &shared.3.execution_scripts,
7418                                                                    &shared.3.automation_scripts,
7419                                                                    &shared.3.viewport,
7420                                                                    &shared.3.request_timeout,
7421                                                                    &shared.3.track_events,
7422                                                                    shared.3.referer.clone(),
7423                                                                    shared.3.max_page_bytes,
7424                                                                    shared.3.get_cache_options(),
7425                                                                    &shared.3.cache_policy,
7426                                                                    &shared.3.remote_multimodal,
7427                                                                )
7428                                                                .await;
7429
7430                                                                if let Some(intercept_handle) = intercept_handle
7431                                                                {
7432                                                                    let abort_handle =
7433                                                                        intercept_handle.abort_handle();
7434
7435                                                                    if let Err(elasped) = tokio::time::timeout(
7436                                                                        tokio::time::Duration::from_secs(10),
7437                                                                        async { intercept_handle.await },
7438                                                                    )
7439                                                                    .await
7440                                                                    {
7441                                                                        log::warn!("Handler timeout exceeded {elasped}");
7442                                                                        abort_handle.abort();
7443                                                                    }
7444                                                                }
7445
7446                                                                if page.page_links.is_none() {
7447                                                                    let links =
7448                                                                        page.links(&shared.6, &shared.7).await;
7449                                                                    page.page_links = Some(links.into());
7450                                                                }
7451
7452                                                                Some(page)
7453                                                            } else {
7454                                                                None
7455                                                            }
7456                                                        });
7457                                                    }
7458                                                    Location::None | Location::ParseErr(_) => (),
7459                                                },
7460                                                SiteMapEntity::SiteMap(sitemap_entry) => {
7461                                                    match sitemap_entry.loc {
7462                                                        Location::Url(url) => {
7463                                                            sitemaps.push(Box::new(CompactString::new(
7464                                                                &url.as_str(),
7465                                                            )));
7466                                                        }
7467                                                        Location::None | Location::ParseErr(_) => (),
7468                                                    }
7469                                                }
7470                                                SiteMapEntity::Err(err) => {
7471                                                    log::info!("incorrect sitemap error: {:?}", err.msg(),)
7472                                                }
7473                                            };
7474
7475                                            if exceeded_budget {
7476                                                break;
7477                                            }
7478                                        }
7479                                    } else {
7480
7481                                        if is_xml_entry {
7482                                            page.modify_xml_html();
7483                                        }
7484
7485                                        let links = page.links(&shared.6, &shared.7).await;
7486
7487                                        let mut stream = tokio_stream::iter(links);
7488
7489                                        while let Some(link) = stream.next().await {
7490                                            if !self.handle_process(handle, &mut interval, async {}).await {
7491                                                break;
7492                                            }
7493
7494                                            if link.ends_with(".xml") {
7495                                                sitemaps.push(Box::new(link.inner().clone()));
7496                                                continue;
7497                                            }
7498
7499                                            let allowed = self.is_allowed(&link);
7500
7501                                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
7502                                                exceeded_budget = true;
7503                                                break;
7504                                            }
7505                                            if allowed.eq(&ProcessLinkStatus::Blocked) {
7506                                                continue;
7507                                            }
7508
7509                                            self.insert_link(link.clone()).await;
7510
7511                                            let client = client.clone();
7512                                            let shared = shared.clone();
7513
7514                                            spawn_set("page_fetch", &mut set, async move {
7515                                                match attempt_navigation(
7516                                                    "about:blank",
7517                                                    &shared.2,
7518                                                    &shared.3.request_timeout,
7519                                                    &shared.5,
7520                                                    &shared.3.viewport,
7521                                                )
7522                                                .await {
7523                                                    Ok(new_page) => {
7524                                                        let (_, intercept_handle) = tokio::join!(
7525                                                            crate::features::chrome::setup_chrome_events(
7526                                                                &new_page, &shared.3,
7527                                                            ),
7528                                                            crate::features::chrome::setup_chrome_interception_base(
7529                                                                &new_page,
7530                                                                shared.3.chrome_intercept.enabled,
7531                                                                &shared.3.auth_challenge_response,
7532                                                                shared.3.chrome_intercept.block_visuals,
7533                                                                &shared.4,
7534                                                            )
7535                                                        );
7536
7537                                                        let mut page = Page::new(
7538                                                            &link.inner(),
7539                                                            &client,
7540                                                            &new_page,
7541                                                            &shared.3.wait_for,
7542                                                            &shared.3.screenshot,
7543                                                            false,
7544                                                            &shared.3.openai_config,
7545                                                            &shared.3.execution_scripts,
7546                                                            &shared.3.automation_scripts,
7547                                                            &shared.3.viewport,
7548                                                            &shared.3.request_timeout,
7549                                                            &shared.3.track_events,
7550                                                            shared.3.referer.clone(),
7551                                                            shared.3.max_page_bytes,
7552                                                            shared.3.get_cache_options(),
7553                                                            &shared.3.cache_policy,
7554                                                            &shared.3.remote_multimodal,
7555                                                        )
7556                                                        .await;
7557
7558                                                        if let Some(intercept_handle) = intercept_handle {
7559                                                            let abort_handle = intercept_handle.abort_handle();
7560
7561                                                            if let Err(elasped) = tokio::time::timeout(
7562                                                                tokio::time::Duration::from_secs(10),
7563                                                                async { intercept_handle.await },
7564                                                            )
7565                                                            .await
7566                                                            {
7567                                                                log::warn!("Handler timeout exceeded {elasped}");
7568                                                                abort_handle.abort();
7569                                                            }
7570                                                        }
7571
7572                                                        if page.page_links.is_none() {
7573                                                            let links = page.links(&shared.6, &shared.7).await;
7574                                                            page.page_links = Some(links.into());
7575                                                        }
7576
7577                                                        Some(page)
7578                                                    }
7579                                                    Err(err) => {
7580                                                        log::error!("chrome failed to open: {:?}", err);
7581                                                        None
7582                                                    }
7583                                                }
7584                                            });
7585
7586                                            if exceeded_budget {
7587                                                break;
7588                                            }
7589                                        }
7590                                    }
7591                                }
7592                                Err(err) => {
7593                                    log::error!("chrome failed to open: {:?}", err);
7594                                }
7595                            }
7596
7597
7598                        },
7599                        Some(result) = set.join_next(), if !set.is_empty() => {
7600                            if let Ok(res) = result {
7601                                match res {
7602                                    Some(page) => {
7603                                        if let Some(signature) = page.signature {
7604                                            if self.is_signature_allowed(signature).await {
7605                                                if let Some(mut links) = page.page_links.clone() {
7606                                                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
7607                                                    self.extra_links.extend(*links)
7608                                                }
7609                                                self.insert_signature(signature).await;
7610
7611                                                channel_send_page(
7612                                                    &shared.0, page.clone(), &shared.1,
7613                                                );
7614
7615                                                if scrape || persist_links {
7616                                                    if let Some(p) = self.pages.as_mut() {
7617                                                        p.push(page);
7618                                                    }
7619                                                }
7620                                            }
7621                                        } else {
7622                                            if let Some(mut links) = page.page_links.clone() {
7623                                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
7624                                                self.extra_links.extend(*links)
7625                                            }
7626                                            channel_send_page(
7627                                                &shared.0, page.clone(), &shared.1,
7628                                            );
7629                                            if scrape || persist_links {
7630                                                if let Some(p) = self.pages.as_mut() {
7631                                                    p.push(page);
7632                                                }
7633                                            }
7634                                        }
7635                                    }
7636                                    _ => ()
7637                                }
7638                            } else {
7639                                break;
7640                            }
7641                        }
7642                        else => break,
7643                    }
7644
7645                    if sitemaps.len() == 0 || exceeded_budget {
7646                        break;
7647                    }
7648                }
7649
7650                while let Some(result) = set.join_next().await {
7651                    if let Ok(res) = result {
7652                        match res {
7653                            Some(page) => {
7654                                if let Some(signature) = page.signature {
7655                                    if self.is_signature_allowed(signature).await {
7656                                        if let Some(mut links) = page.page_links.clone() {
7657                                            self.dequeue(&mut q, &mut links, &mut exceeded_budget)
7658                                                .await;
7659                                            self.extra_links.extend(*links)
7660                                        }
7661                                        self.insert_signature(signature).await;
7662                                        channel_send_page(&shared.0, page.clone(), &shared.1);
7663                                        if scrape || persist_links {
7664                                            if let Some(p) = self.pages.as_mut() {
7665                                                p.push(page);
7666                                            }
7667                                        }
7668                                    }
7669                                } else {
7670                                    if let Some(mut links) = page.page_links.clone() {
7671                                        self.dequeue(&mut q, &mut links, &mut exceeded_budget)
7672                                            .await;
7673                                        self.extra_links.extend(*links)
7674                                    }
7675                                    channel_send_page(&shared.0, page.clone(), &shared.1);
7676                                    if scrape || persist_links {
7677                                        if let Some(p) = self.pages.as_mut() {
7678                                            p.push(page);
7679                                        }
7680                                    }
7681                                }
7682                            }
7683                            _ => (),
7684                        }
7685                    }
7686                }
7687                b.dispose();
7688                self.configuration
7689                    .remove_sitemap_from_whitelist(whitelist_changes);
7690            }
7691        }
7692    }
7693
7694    /// Sitemap crawl entire lists. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the `sitemap` flag.
7695    #[cfg(feature = "sitemap")]
7696    pub async fn sitemap_crawl(
7697        &mut self,
7698        client: &Client,
7699        handle: &Option<Arc<AtomicI8>>,
7700        scrape: bool,
7701    ) {
7702        self.sitemap_crawl_raw(client, handle, scrape).await
7703    }
7704
7705    /// Sitemap crawl entire lists chain. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the `sitemap` flag.
7706    #[cfg(all(
7707        feature = "sitemap",
7708        any(not(feature = "chrome"), feature = "decentralized")
7709    ))]
7710    async fn sitemap_crawl_chain(
7711        &mut self,
7712        client: &Client,
7713        handle: &Option<Arc<AtomicI8>>,
7714        scrape: bool,
7715    ) {
7716        if !self.configuration.ignore_sitemap {
7717            self.sitemap_crawl_raw(client, handle, scrape).await
7718        }
7719    }
7720
7721    /// Sitemap crawl entire lists chain using chrome. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the `sitemap` flag.
7722    #[cfg(all(
7723        feature = "sitemap",
7724        feature = "chrome",
7725        not(feature = "decentralized")
7726    ))]
7727    pub async fn sitemap_crawl_chain(
7728        &mut self,
7729        client: &Client,
7730        handle: &Option<Arc<AtomicI8>>,
7731        scrape: bool,
7732    ) {
7733        if !self.configuration.ignore_sitemap {
7734            self.sitemap_crawl_chrome(client, handle, scrape).await
7735        }
7736    }
7737
7738    /// Sitemap parse entire lists. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the `sitemap` flag.
7739    #[cfg(feature = "sitemap")]
7740    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
7741    pub async fn sitemap_parse(
7742        &mut self,
7743        client: &Client,
7744        first_request: &mut bool,
7745        sitemap_url: &mut Box<CompactString>,
7746        attempted_correct: &mut bool,
7747    ) -> bool {
7748        let mut valid = *attempted_correct == false;
7749
7750        if valid {
7751            if let Some(domain) = &self.domain_parsed {
7752                // attempt to parse the sitemap from the html.
7753                match client.get(domain.as_str()).send().await {
7754                    Ok(response) => {
7755                        let limit = *crate::utils::MAX_SIZE_BYTES as u64;
7756
7757                        if let Some(response_content_length) = response.content_length() {
7758                            if limit > 0 && response_content_length >= limit {
7759                                log::info!("{} exceeded parse limit: {:?}", domain, limit);
7760                                *first_request = true;
7761                                *attempted_correct = true;
7762                                valid = false;
7763                            }
7764                        }
7765
7766                        if valid {
7767                            // stream the bytes to lol_html to parse the sitemap from the path.
7768                            let cell = tokio::sync::OnceCell::new();
7769
7770                            let rewriter_settings = lol_html::Settings {
7771                                element_content_handlers: vec![lol_html::element!(
7772                                    r#"link[rel="sitemap"]"#,
7773                                    |el| {
7774                                        if let Some(href) = el.get_attribute("href") {
7775                                            let _ = cell.set(href);
7776                                        }
7777                                        Ok(())
7778                                    }
7779                                )],
7780                                adjust_charset_on_meta_tag: false,
7781                                ..lol_html::send::Settings::new_for_handler_types()
7782                            };
7783
7784                            let mut rewriter = lol_html::send::HtmlRewriter::new(
7785                                rewriter_settings,
7786                                |_c: &[u8]| {},
7787                            );
7788
7789                            let mut wrote_error = false;
7790                            let mut stream = response.bytes_stream();
7791
7792                            while let Some(chunk) = stream.next().await {
7793                                if let Ok(chunk) = chunk {
7794                                    if rewriter.write(&chunk).is_err() {
7795                                        wrote_error = true;
7796                                        break;
7797                                    }
7798                                }
7799                                if cell.initialized() {
7800                                    break;
7801                                }
7802                            }
7803
7804                            if !wrote_error {
7805                                let _ = rewriter.end();
7806                            }
7807
7808                            if let Some(sitemap) = cell.get() {
7809                                if sitemap.is_empty() {
7810                                    *first_request = true;
7811                                }
7812
7813                                if let Err(_) = domain.join(sitemap) {
7814                                    *first_request = true;
7815                                }
7816                                // if we retried the request here it should succeed.
7817                                *sitemap_url = Box::new(sitemap.into());
7818                                *attempted_correct = true;
7819                            } else {
7820                                *first_request = true;
7821                            }
7822                        }
7823                    }
7824                    Err(err) => {
7825                        *first_request = true;
7826                        valid = false;
7827                        log::info!("http parse error: {:?}", err.to_string())
7828                    }
7829                };
7830            }
7831        }
7832
7833        valid
7834    }
    /// Sitemap parse entire lists. Note: this method does not re-crawl the links of the pages found on the sitemap. This does nothing without the `sitemap` flag.
    ///
    /// Parses the sitemap XML bytes in `b`, inserting every allowed URL entry into
    /// the crawl state. When `crawl` is true each URL is additionally fetched on a
    /// spawned task (with retries per configuration) and the resulting page is sent
    /// over `tx`. Nested sitemap locations are pushed onto `sitemaps` for the caller
    /// to process. Sets `exceeded_budget` and stops early when the crawl budget is hit.
    #[cfg(feature = "sitemap")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn sitemap_parse_crawl(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        b: bytes::Bytes,
        mut interval: &mut Interval,
        exceeded_budget: &mut bool,
        tx: &tokio::sync::mpsc::Sender<Page>,
        sitemaps: &mut Vec<Box<CompactString>>,
        crawl: bool,
    ) {
        use sitemap::reader::{SiteMapEntity, SiteMapReader};
        use sitemap::structs::Location;

        // Only attempt to parse content that looks like an XML document.
        if !b.is_empty() && b.starts_with(b"<?xml") {
            let mut stream = tokio_stream::iter(SiteMapReader::new(&*b));

            let retry = self.configuration.retry;

            while let Some(entity) = stream.next().await {
                // Honor pause/shutdown control signals between entries.
                if !self.handle_process(handle, &mut interval, async {}).await {
                    break;
                }
                match entity {
                    SiteMapEntity::Url(url_entry) => match url_entry.loc {
                        Location::Url(url) => {
                            let link: CaseInsensitiveString = url.as_str().into();

                            let allowed = self.is_allowed(&link);

                            // Skip blacklisted/disallowed links entirely.
                            if allowed.eq(&ProcessLinkStatus::Blocked) {
                                continue;
                            }

                            // Stop processing once the crawl budget is exhausted.
                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                *exceeded_budget = true;
                                break;
                            }

                            self.insert_link(link.clone()).await;

                            if crawl {
                                // Fetch each sitemap URL concurrently on its own task.
                                let client = client.clone();
                                let tx = tx.clone();

                                crate::utils::spawn_task("page_fetch", async move {
                                    let mut page = Page::new_page(&link.inner(), &client).await;

                                    let mut retry_count = retry;

                                    // Retry transient failures, respecting any
                                    // server-provided backoff timeout.
                                    while page.should_retry && retry_count > 0 {
                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }
                                        page.clone_from(
                                            &Page::new_page(link.inner(), &client).await,
                                        );
                                        retry_count -= 1;
                                    }

                                    // Backpressure: wait for channel capacity before sending.
                                    if let Ok(permit) = tx.reserve().await {
                                        permit.send(page);
                                    }
                                });
                            }
                        }
                        Location::None | Location::ParseErr(_) => (),
                    },
                    // Nested sitemap index entries are queued for the caller.
                    SiteMapEntity::SiteMap(sitemap_entry) => match sitemap_entry.loc {
                        Location::Url(url) => {
                            sitemaps.push(Box::new(CompactString::new(&url.as_str())));
                        }
                        Location::None | Location::ParseErr(_) => (),
                    },
                    SiteMapEntity::Err(err) => {
                        log::info!("incorrect sitemap error: {:?}", err.msg())
                    }
                };

                if *exceeded_budget {
                    break;
                }
            }
        }
    }
7923
7924    /// get base link for crawl establishing.
7925    #[cfg(feature = "regex")]
7926    pub fn get_base_link(&self) -> &CaseInsensitiveString {
7927        &self.url
7928    }
7929
7930    /// get base link for crawl establishing.
7931    #[cfg(not(feature = "regex"))]
7932    pub fn get_base_link(&self) -> &CompactString {
7933        self.url.inner()
7934    }
7935
7936    /// Guard the channel from closing until all subscription events complete.
7937    pub async fn subscription_guard(&self) {
7938        if let Some(channel) = &self.channel {
7939            if !channel.1.is_empty() {
7940                if let Some(guard_counter) = &self.channel_guard {
7941                    guard_counter.lock().await
7942                }
7943            }
7944        }
7945    }
7946
7947    /// Launch or connect to browser with setup.
7948    #[cfg(feature = "chrome")]
7949    pub async fn setup_browser_base(
7950        config: &Configuration,
7951        url_parsed: &Option<Box<Url>>,
7952        jar: Option<&Arc<reqwest::cookie::Jar>>,
7953    ) -> Option<crate::features::chrome::BrowserController> {
7954        match crate::features::chrome::launch_browser_cookies(&config, url_parsed, jar).await {
7955            Some((browser, browser_handle, context_id)) => {
7956                let browser: Arc<chromiumoxide::Browser> = Arc::new(browser);
7957                let b = (browser, Some(browser_handle), context_id);
7958
7959                Some(crate::features::chrome::BrowserController::new(b))
7960            }
7961            _ => None,
7962        }
7963    }
7964
7965    /// Launch or connect to browser with setup.
7966    #[cfg(feature = "chrome")]
7967    pub async fn setup_browser(&self) -> Option<crate::features::chrome::BrowserController> {
7968        Website::setup_browser_base(
7969            &self.configuration,
7970            self.get_url_parsed(),
7971            Some(&self.cookie_jar),
7972        )
7973        .await
7974    }
7975
7976    /// Launch or connect to WebDriver with setup.
7977    #[cfg(feature = "webdriver")]
7978    pub async fn setup_webdriver(&self) -> Option<crate::features::webdriver::WebDriverController> {
7979        crate::features::webdriver::launch_driver(&self.configuration).await
7980    }
7981
7982    /// Render a page using WebDriver.
7983    #[cfg(feature = "webdriver")]
7984    pub async fn render_webdriver_page(
7985        &self,
7986        url: &str,
7987        driver: &std::sync::Arc<thirtyfour::WebDriver>,
7988    ) -> Option<String> {
7989        use crate::features::webdriver::{attempt_navigation, get_page_content, setup_driver_events};
7990
7991        let timeout = self
7992            .configuration
7993            .webdriver_config
7994            .as_ref()
7995            .and_then(|c| c.timeout);
7996
7997        // Navigate to the URL
7998        if let Err(e) = attempt_navigation(url, driver, &timeout).await {
7999            log::error!("WebDriver navigation failed: {:?}", e);
8000            return None;
8001        }
8002
8003        // Setup events (stealth injection)
8004        setup_driver_events(driver, &self.configuration).await;
8005
8006        // Get page content
8007        match get_page_content(driver).await {
8008            Ok(content) => Some(content),
8009            Err(e) => {
8010                log::error!("Failed to get WebDriver page content: {:?}", e);
8011                None
8012            }
8013        }
8014    }
8015
8016    /// Respect robots.txt file.
8017    pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
8018        self.configuration
8019            .with_respect_robots_txt(respect_robots_txt);
8020        self
8021    }
8022
8023    /// Include subdomains detection.
8024    pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
8025        self.configuration.with_subdomains(subdomains);
8026        self
8027    }
8028
8029    /// Bypass CSP protection detection. This does nothing without the feat flag `chrome` enabled.
8030    pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
8031        self.configuration.with_csp_bypass(enabled);
8032        self
8033    }
8034
8035    /// Configure WebDriver for browser automation. This does nothing without the `webdriver` feature flag enabled.
8036    /// When configured, the `crawl()` function will automatically use WebDriver instead of raw HTTP.
8037    #[cfg(feature = "webdriver")]
8038    pub fn with_webdriver(
8039        &mut self,
8040        webdriver_config: crate::features::webdriver_common::WebDriverConfig,
8041    ) -> &mut Self {
8042        self.configuration
8043            .with_webdriver_config(Some(webdriver_config));
8044        self
8045    }
8046
8047    /// Configure WebDriver for browser automation. This does nothing without the `webdriver` feature flag enabled.
8048    #[cfg(not(feature = "webdriver"))]
8049    pub fn with_webdriver(&mut self, _webdriver_config: ()) -> &mut Self {
8050        self
8051    }
8052
8053    /// Use sqlite to store data and track large crawls. This does nothing without the `disk` flag enabled.
8054    #[cfg(feature = "disk")]
8055    pub fn with_sqlite(&mut self, sqlite: bool) -> &mut Self {
8056        if sqlite {
8057            self.enable_sqlite = true;
8058        } else {
8059            self.enable_sqlite = false;
8060            self.sqlite = None;
8061        };
8062        self
8063    }
8064
8065    /// Use sqlite to store data and track large crawls.
8066    #[cfg(not(feature = "disk"))]
8067    pub fn with_sqlite(&mut self, _sqlite: bool) -> &mut Self {
8068        self
8069    }
8070
8071    /// Include tld detection.
8072    pub fn with_tld(&mut self, tld: bool) -> &mut Self {
8073        self.configuration.with_tld(tld);
8074        self
8075    }
8076
8077    /// The max duration for the crawl. This is useful when websites use a robots.txt with long durations and throttle the timeout removing the full concurrency.
8078    pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
8079        self.configuration.with_crawl_timeout(crawl_timeout);
8080        self
8081    }
8082
8083    /// Only use HTTP/2.
8084    pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
8085        self.configuration
8086            .with_http2_prior_knowledge(http2_prior_knowledge);
8087        self
8088    }
8089
8090    /// Delay between request as ms.
8091    pub fn with_delay(&mut self, delay: u64) -> &mut Self {
8092        self.configuration.with_delay(delay);
8093        self
8094    }
8095
8096    /// Max time to wait for request.
8097    pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
8098        self.configuration.with_request_timeout(request_timeout);
8099        self
8100    }
8101
8102    /// Dangerously accept invalid certificates - this should be used as a last resort.
8103    pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
8104        self.configuration
8105            .with_danger_accept_invalid_certs(accept_invalid_certs);
8106        self
8107    }
8108
8109    /// Add user agent to request.
8110    pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
8111        self.configuration.with_user_agent(user_agent);
8112        self
8113    }
8114
8115    /// Preserve the HOST header.
8116    pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
8117        self.configuration.with_preserve_host_header(preserve);
8118        self
8119    }
8120
8121    #[cfg(feature = "sitemap")]
8122    /// Add user agent to request. This does nothing without the `sitemap` flag enabled.
8123    pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
8124        self.configuration.with_sitemap(sitemap_url);
8125        self
8126    }
8127
8128    #[cfg(not(feature = "sitemap"))]
8129    /// Add user agent to request. This does nothing without the `sitemap` flag enabled.
8130    pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
8131        self
8132    }
8133
8134    /// Use proxies for request.
8135    pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
8136        self.configuration.with_proxies(proxies);
8137        self
8138    }
8139
8140    /// Use proxies for request with control between chrome and http.
8141    pub fn with_proxies_direct(
8142        &mut self,
8143        proxies: Option<Vec<crate::configuration::RequestProxy>>,
8144    ) -> &mut Self {
8145        self.configuration.with_proxies_direct(proxies);
8146        self
8147    }
8148
8149    /// Set the concurrency limits. If you set the value to None to use the default limits using the system CPU cors * n.
8150    pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
8151        self.configuration.with_concurrency_limit(limit);
8152        self
8153    }
8154
8155    /// Set a crawl ID to use for tracking crawls. This does nothing without the `control` flag enabled.
8156    #[cfg(not(feature = "control"))]
8157    pub fn with_crawl_id(&mut self, _crawl_id: String) -> &mut Self {
8158        self
8159    }
8160
8161    /// Set a crawl ID to use for tracking crawls. This does nothing without the [control] flag enabled.
8162    #[cfg(feature = "control")]
8163    pub fn with_crawl_id(&mut self, crawl_id: String) -> &mut Self {
8164        self.crawl_id = crawl_id.into();
8165        self
8166    }
8167
8168    /// Add blacklist urls to ignore.
8169    pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
8170    where
8171        Vec<CompactString>: From<Vec<T>>,
8172    {
8173        self.configuration.with_blacklist_url(blacklist_url);
8174        self
8175    }
8176
8177    /// Set the retry limit for request. Set the value to 0 for no retries. The default is 0.
8178    pub fn with_retry(&mut self, retry: u8) -> &mut Self {
8179        self.configuration.with_retry(retry);
8180        self
8181    }
8182
8183    /// Skip setting up a control thread for pause, start, and shutdown programmatic handling. This does nothing without the 'control' flag enabled.
8184    pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
8185        self.configuration.with_no_control_thread(no_control_thread);
8186        self
8187    }
8188
8189    /// Add whitelist urls to allow.
8190    pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
8191    where
8192        Vec<CompactString>: From<Vec<T>>,
8193    {
8194        self.configuration.with_whitelist_url(whitelist_url);
8195        self
8196    }
8197
8198    #[cfg(feature = "chrome")]
8199    /// Track the events made via chrome.
8200    pub fn with_event_tracker(
8201        &mut self,
8202        track_events: Option<crate::configuration::ChromeEventTracker>,
8203    ) -> &mut Self {
8204        self.configuration.with_event_tracker(track_events);
8205        self
8206    }
8207
8208    /// Set HTTP headers for request using [reqwest::header::HeaderMap](https://docs.rs/reqwest/latest/reqwest/header/struct.HeaderMap.html).
8209    pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
8210        self.configuration.with_headers(headers);
8211        self
8212    }
8213
8214    /// Modify the headers to mimic a real browser.
8215    pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
8216        self.configuration.with_modify_headers(modify_headers);
8217        self
8218    }
8219
8220    /// Modify the HTTP client headers to mimic a real browser.
8221    pub fn with_modify_http_client_headers(
8222        &mut self,
8223        modify_http_client_headers: bool,
8224    ) -> &mut Self {
8225        self.configuration
8226            .with_modify_http_client_headers(modify_http_client_headers);
8227        self
8228    }
8229
8230    /// Set a crawl budget per path with levels support /a/b/c or for all paths with "*". This does nothing without the `budget` flag enabled.
8231    pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self {
8232        self.configuration.with_budget(budget);
8233        self
8234    }
8235
8236    /// Set the crawl budget directly. This does nothing without the `budget` flag enabled.
8237    pub fn set_crawl_budget(&mut self, budget: Option<HashMap<CaseInsensitiveString, u32>>) {
8238        self.configuration.budget = budget;
8239    }
8240
8241    /// Set a crawl depth limit. If the value is 0 there is no limit.
8242    pub fn with_depth(&mut self, depth: usize) -> &mut Self {
8243        self.configuration.with_depth(depth);
8244        self
8245    }
8246
8247    /// Group external domains to treat the crawl as one. If None is passed this will clear all prior domains.
8248    pub fn with_external_domains<'a, 'b>(
8249        &mut self,
8250        external_domains: Option<impl Iterator<Item = String> + 'a>,
8251    ) -> &mut Self {
8252        self.configuration.with_external_domains(external_domains);
8253        self
8254    }
8255
8256    /// Perform a callback to run on each link find.
8257    pub fn with_on_link_find_callback(
8258        &mut self,
8259        on_link_find_callback: Option<OnLinkFindCallback>,
8260    ) -> &mut Self {
8261        match on_link_find_callback {
8262            Some(callback) => self.on_link_find_callback = Some(callback),
8263            _ => self.on_link_find_callback = None,
8264        };
8265        self
8266    }
8267
8268    /// Perform a callback to run on each link find shorthand.
8269    pub fn set_on_link_find<F>(&mut self, f: F)
8270    where
8271        F: Fn(CaseInsensitiveString, Option<String>) -> (CaseInsensitiveString, Option<String>)
8272            + Send
8273            + Sync
8274            + 'static,
8275    {
8276        self.on_link_find_callback = Some(Arc::new(f));
8277    }
8278
8279    /// Use a callback to determine if a page should be ignored. Return false to ensure that the discovered links are not crawled.
8280    pub fn with_on_should_crawl_callback(
8281        &mut self,
8282        on_should_crawl_callback: Option<fn(&Page) -> bool>,
8283    ) -> &mut Self {
8284        match on_should_crawl_callback {
8285            Some(callback) => {
8286                self.on_should_crawl_callback = Some(OnShouldCrawlCallback::Fn(callback))
8287            }
8288            _ => self.on_should_crawl_callback = None,
8289        };
8290        self
8291    }
8292
8293    /// Use an immutable closure to determine if a page should be ignored. Return false to ensure that the discovered links are not crawled.
8294    ///
8295    /// Slightly slower than [`Self::with_on_should_crawl_callback`].
8296    pub fn with_on_should_crawl_callback_closure<F: OnShouldCrawlClosure>(
8297        &mut self,
8298        on_should_crawl_closure: Option<F>,
8299    ) -> &mut Self {
8300        match on_should_crawl_closure {
8301            Some(callback) => {
8302                self.on_should_crawl_callback =
8303                    Some(OnShouldCrawlCallback::Closure(Arc::new(callback)))
8304            }
8305            _ => self.on_should_crawl_callback = None,
8306        };
8307        self
8308    }
8309
    /// Cookie string to use in request. This does nothing without the `cookies` flag enabled.
    pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
        self.configuration.with_cookies(cookie_str);
        self
    }
8315
    /// Setup cron jobs to run. This does nothing without the `cron` flag enabled.
    pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
        self.configuration.with_cron(cron_str, cron_type);
        self
    }
8321
    /// Overrides the default host system locale with the specified one. This does nothing without the `chrome` flag enabled.
    pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
        self.configuration.with_locale(locale);
        self
    }
8327
    /// Use stealth mode for the request. This does nothing without the `chrome` flag enabled.
    pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
        self.configuration.with_stealth(stealth_mode);
        self
    }
8333
    /// Use an advanced stealth tier for the request. This does nothing without the `chrome` flag enabled.
    #[cfg(feature = "chrome")]
    pub fn with_stealth_advanced(
        &mut self,
        stealth_mode: spider_fingerprint::configs::Tier,
    ) -> &mut Self {
        self.configuration.with_stealth_advanced(stealth_mode);
        self
    }
8343
    /// Set the cache policy.
    pub fn with_cache_policy(
        &mut self,
        cache_policy: Option<crate::utils::BasicCachePolicy>,
    ) -> &mut Self {
        self.configuration.with_cache_policy(cache_policy);

        self
    }
8353
    /// Use OpenAI to get dynamic javascript to drive the browser. This does nothing without the `openai` flag enabled.
    pub fn with_openai(&mut self, openai_configs: Option<configuration::GPTConfigs>) -> &mut Self {
        self.configuration.with_openai(openai_configs);
        self
    }
8359
    /// Use a remote multimodal model (vision + HTML + URL) to drive browser automation.
    ///
    /// When enabled, Spider can ask an OpenAI-compatible “chat completions” endpoint to
    /// generate a JSON plan (a list of `WebAutomation` steps), execute those steps against a
    /// live Chrome page, then re-capture state and iterate until the model reports it is done
    /// (or the configured limits are hit). The default system prompt is set to handle web
    /// challenges and can be adjusted if required. See
    /// `spider::features::automation::DEFAULT_SYSTEM_PROMPT` for the baseline prompt.
    ///
    /// This is useful for:
    /// - handling captchas,
    /// - dismissing popups / cookie banners,
    /// - navigating to a target page (pricing, docs, etc.),
    /// - clicking through multi-step UI flows,
    /// - recovering from dynamic page state that plain HTML scraping can’t handle.
    ///
    /// # Feature gate
    /// This method only has an effect when the crate is built with `feature="chrome"`.
    /// Without `chrome`, the method is not available.
    ///
    /// # Parameters
    /// - `cfg`: The remote multimodal configuration bundle (endpoint, model, prompts, and runtime knobs).
    ///   Pass `None` to disable remote multimodal automation.
    ///
    /// # Example
    /// ```no_run
    /// # #[cfg(feature = "chrome")]
    /// # async fn demo() -> Result<(), Box<dyn std::error::Error>> {
    /// use spider::website::Website;
    /// use spider::configuration::Configuration;
    /// use spider::features::automation::{RemoteMultimodalConfigs, RemoteMultimodalConfig};
    ///
    /// // Build the engine configs (similar to GPTConfigs::new(...))
    /// let mm_cfgs = RemoteMultimodalConfigs::new(
    ///     "http://localhost:11434/v1/chat/completions",
    ///     "qwen2.5-vl", // any OpenAI-compatible model id your endpoint understands
    /// )
    /// .with_api_key(None)
    /// .with_system_prompt_extra(Some("Never log in. Prefer closing popups and continuing."))
    /// .with_user_message_extra(Some("Goal: reach the pricing page, then stop."))
    /// .with_cfg(RemoteMultimodalConfig {
    ///     // keep HTML smaller if you want lower token usage
    ///     include_html: true,
    ///     html_max_bytes: 24_000,
    ///     include_url: true,
    ///     include_title: true,
    ///     // loop controls
    ///     max_rounds: 6,
    ///     post_plan_wait_ms: 400,
    ///     ..Default::default()
    /// })
    /// .with_concurrency_limit(Some(8));
    ///
    /// // Attach to the crawler configuration
    /// let mut cfg = Configuration::new();
    /// cfg.with_remote_multimodal(Some(mm_cfgs));
    ///
    /// // Use the configuration in a Website (example)
    /// let mut site = Website::new("https://example.com");
    /// site.configuration = cfg;
    ///
    /// // Start crawling/scraping as you normally would...
    /// // site.crawl().await?;
    ///
    /// Ok(())
    /// # }
    /// ```
    ///
    /// # Notes
    /// - Remote multimodal automation typically requires `feature="serde"` if you deserialize model
    ///   steps into `WebAutomation`.
    /// - If your endpoint does not support `response_format: {"type":"json_object"}`, disable that
    ///   in `RemoteMultimodalConfig` (`request_json_object = false`).
    #[cfg(feature = "chrome")]
    pub fn with_remote_multimodal(
        &mut self,
        cfg: Option<crate::features::automation::RemoteMultimodalConfigs>,
    ) -> &mut Self {
        self.configuration.with_remote_multimodal(cfg);
        self
    }
8440
    /// Use Gemini to get dynamic javascript to drive the browser. This does nothing without the `gemini` flag enabled.
    pub fn with_gemini(
        &mut self,
        gemini_configs: Option<configuration::GeminiConfigs>,
    ) -> &mut Self {
        self.configuration.with_gemini(gemini_configs);
        self
    }
8449
    /// Cache the page following HTTP rules. This method does nothing if the `cache` feature is not enabled.
    pub fn with_caching(&mut self, cache: bool) -> &mut Self {
        self.configuration.with_caching(cache);
        self
    }
8455
    /// Enable or disable Service Workers. This method does nothing if the `chrome` feature is not enabled.
    pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_service_worker_enabled(enabled);
        self
    }
8461
    /// Automatically setup geo-location configurations when using a proxy. This method does nothing if the `chrome` feature is not enabled.
    pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_auto_geolocation(enabled);
        self
    }
8467
    #[cfg(feature = "chrome")]
    /// Set a custom fingerprint configuration for requests. This does nothing without the `chrome` flag enabled.
    pub fn with_fingerprint_advanced(
        &mut self,
        fingerprint: crate::configuration::Fingerprint,
    ) -> &mut Self {
        self.configuration.with_fingerprint_advanced(fingerprint);
        self
    }
8477
    /// Setup custom fingerprinting for chrome. This method does nothing if the `chrome` feature is not enabled.
    pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
        self.configuration.with_fingerprint(fingerprint);
        self
    }
8483
    /// Configures the viewport of the browser, which defaults to 800x600. This method does nothing if the `chrome` feature is not enabled.
    pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
        self.configuration.with_viewport(viewport);
        self
    }
8489
    /// Wait for network requests to be idle within a time frame period (500ms with no network connections). This does nothing without the `chrome` flag enabled.
    pub fn with_wait_for_idle_network(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_idle_network(wait_for_idle_network);
        self
    }
8499
    /// Wait for network requests with a max timeout. This does nothing without the `chrome` flag enabled.
    pub fn with_wait_for_idle_network0(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_idle_network0(wait_for_idle_network);
        self
    }
8509
    /// Wait for the network to be almost idle with a max timeout. This does nothing without the `chrome` flag enabled.
    pub fn with_wait_for_almost_idle_network0(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_almost_idle_network0(wait_for_idle_network);
        self
    }
8519
    /// Wait for a CSS query selector. This method does nothing if the `chrome` feature is not enabled.
    pub fn with_wait_for_selector(
        &mut self,
        wait_for_selector: Option<crate::configuration::WaitForSelector>,
    ) -> &mut Self {
        self.configuration.with_wait_for_selector(wait_for_selector);
        self
    }
8528
8529    /// Wait for idle dom mutations for target element. This method does nothing if the `chrome` feature is not enabled.
8530    pub fn with_wait_for_idle_dom(
8531        &mut self,
8532        wait_for_selector: Option<crate::configuration::WaitForSelector>,
8533    ) -> &mut Self {
8534        self.configuration.with_wait_for_idle_dom(wait_for_selector);
8535        self
8536    }
8537
    /// Wait for a fixed delay. Should only be used for testing. This method does nothing if the `chrome` feature is not enabled.
    pub fn with_wait_for_delay(
        &mut self,
        wait_for_delay: Option<crate::configuration::WaitForDelay>,
    ) -> &mut Self {
        self.configuration.with_wait_for_delay(wait_for_delay);
        self
    }
8546
    /// Set the default HTTP connect timeout.
    pub fn with_default_http_connect_timeout(
        &mut self,
        default_http_connect_timeout: Option<Duration>,
    ) -> &mut Self {
        self.configuration
            .with_default_http_connect_timeout(default_http_connect_timeout);

        self
    }
8557
    /// Set the default HTTP read timeout.
    pub fn with_default_http_read_timeout(
        &mut self,
        default_http_read_timeout: Option<Duration>,
    ) -> &mut Self {
        self.configuration
            .with_default_http_read_timeout(default_http_read_timeout);

        self
    }
8568
    /// Set the max redirects allowed per request.
    pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
        self.configuration.with_redirect_limit(redirect_limit);
        self
    }
8574
    /// Set the redirect policy to use, either Strict or Loose by default.
    pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
        self.configuration.with_redirect_policy(policy);
        self
    }
8580
    /// Use request interception to only allow content that matches the host. If the content is from a 3rd party it needs to be part of the include list. This method does nothing if the `chrome_intercept` flag is not enabled.
    pub fn with_chrome_intercept(
        &mut self,
        chrome_intercept: RequestInterceptConfiguration,
    ) -> &mut Self {
        self.configuration
            .with_chrome_intercept(chrome_intercept, &self.domain_parsed);
        self
    }
8590
    /// Add a referer to the request.
    pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
        self.configuration.with_referer(referer);
        self
    }
8596
    /// Add a referer to the request (alternate spelling of [`Self::with_referer`]).
    pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
        self.configuration.with_referrer(referer);
        self
    }
8602
    /// Determine whether to collect all the resources found on pages.
    pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
        self.configuration.with_full_resources(full_resources);
        self
    }
8608
8609    /// Dismiss all dialogs on the page. This method does nothing if the `chrome` feature is not enabled.
8610    pub fn with_dismiss_dialogs(&mut self, full_resources: bool) -> &mut Self {
8611        self.configuration.with_dismiss_dialogs(full_resources);
8612        self
8613    }
8614
    /// Set the request emulation. This method does nothing if the `wreq` flag is not enabled.
    #[cfg(feature = "wreq")]
    pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
        self.configuration.with_emulation(emulation);
        self
    }
8621
    /// Ignore the sitemap when crawling. This method does nothing if the `sitemap` flag is not enabled.
    pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
        self.configuration.with_ignore_sitemap(ignore_sitemap);
        self
    }
8627
    /// Overrides the default host system timezone with the specified one. This does nothing without the `chrome` flag enabled.
    pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
        self.configuration.with_timezone_id(timezone_id);
        self
    }
8633
    /// Set a custom script to evaluate on new document creation. This does nothing without the feature flag `chrome` enabled.
    pub fn with_evaluate_on_new_document(
        &mut self,
        evaluate_on_new_document: Option<Box<String>>,
    ) -> &mut Self {
        self.configuration
            .with_evaluate_on_new_document(evaluate_on_new_document);

        self
    }
8644
    /// Set a crawl page limit. If the value is 0 there is no limit.
    pub fn with_limit(&mut self, limit: u32) -> &mut Self {
        self.configuration.with_limit(limit);
        self
    }
8650
    /// Set the chrome screenshot configuration. This does nothing without the `chrome` flag enabled.
    pub fn with_screenshot(
        &mut self,
        screenshot_config: Option<configuration::ScreenShotConfig>,
    ) -> &mut Self {
        self.configuration.with_screenshot(screenshot_config);
        self
    }
8659
    /// Use a shared semaphore to evenly handle workloads. The default is false.
    pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
        self.configuration.with_shared_queue(shared_queue);
        self
    }
8665
    /// Set the authentication challenge response. This does nothing without the feature flag `chrome` enabled.
    pub fn with_auth_challenge_response(
        &mut self,
        auth_challenge_response: Option<configuration::AuthChallengeResponse>,
    ) -> &mut Self {
        self.configuration
            .with_auth_challenge_response(auth_challenge_response);
        self
    }
8675
    /// Return the links found on the page in the channel subscriptions. This method does nothing if `decentralized` is enabled.
    pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
        self.configuration.with_return_page_links(return_page_links);
        self
    }
8681
    /// Set the connection url for the chrome instance. This method does nothing if the `chrome` feature is not enabled.
    pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
        self.configuration
            .with_chrome_connection(chrome_connection_url);
        self
    }
8688
    /// Set JS to run on certain pages. This method does nothing if the `chrome` feature is not enabled.
    pub fn with_execution_scripts(
        &mut self,
        execution_scripts: Option<ExecutionScriptsMap>,
    ) -> &mut Self {
        self.configuration.with_execution_scripts(execution_scripts);
        self
    }
8697
    /// Run web automated actions on certain pages. This method does nothing if the `chrome` feature is not enabled.
    pub fn with_automation_scripts(
        &mut self,
        automation_scripts: Option<AutomationScriptsMap>,
    ) -> &mut Self {
        self.configuration
            .with_automation_scripts(automation_scripts);
        self
    }
8707
    /// Bind the connections only on the given network interface.
    pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
        self.configuration.with_network_interface(network_interface);
        self
    }
8713
    /// Bind to a local IP Address.
    pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
        self.configuration.with_local_address(local_address);
        self
    }
8719
    /// Block assets from loading from the network. Focuses primarily on HTML documents.
    pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
        self.configuration.with_block_assets(only_html);
        self
    }
8725
    /// Normalize the content, de-duplicating trailing-slash pages and other pages that can be duplicated. This may initially show the link in your links_visited or subscription calls, but the following links will not be crawled.
    pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
        self.configuration.with_normalize(normalize);
        self
    }
8731
    /// Store all the links found on disk to share the state. This does nothing without the `disk` flag enabled.
    pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
        self.configuration.with_shared_state(shared);
        self
    }
8737
    /// Set the max amount of bytes to collect per page. Only used for chrome at the moment.
    pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
        self.configuration.with_max_page_bytes(max_page_bytes);
        self
    }
8743
    /// Set the max amount of bytes to collect for the browser context. Only used for chrome at the moment.
    pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
        self.configuration.with_max_bytes_allowed(max_bytes_allowed);
        self
    }
8749
    /// Set the configuration for the website directly, replacing any prior configuration.
    pub fn with_config(&mut self, config: Configuration) -> &mut Self {
        self.configuration = config.into();
        self
    }
8755
    /// Set a [spider.cloud](https://spider.cloud) API key (Proxy mode).
    #[cfg(feature = "spider_cloud")]
    pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
        self.configuration.with_spider_cloud(api_key);
        self
    }
8762
    /// Set a [spider.cloud](https://spider.cloud) API key (no-op without the `spider_cloud` feature).
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
        self
    }
8768
    /// Set a [spider.cloud](https://spider.cloud) config.
    #[cfg(feature = "spider_cloud")]
    pub fn with_spider_cloud_config(
        &mut self,
        config: crate::configuration::SpiderCloudConfig,
    ) -> &mut Self {
        self.configuration.with_spider_cloud_config(config);
        self
    }
8778
    /// Set a [spider.cloud](https://spider.cloud) config (no-op without the `spider_cloud` feature).
    // NOTE(review): the parameter type here is `()` while the feature-gated variant
    // takes `SpiderCloudConfig` — confirm the signature divergence is intentional.
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
        self
    }
8784
8785    /// Build the website configuration when using with_builder.
8786    pub fn build(&self) -> Result<Self, Self> {
8787        if self.domain_parsed.is_none() {
8788            Err(self.to_owned())
8789        } else {
8790            Ok(self.to_owned())
8791        }
8792    }
8793
    /// Clear the HTTP headers for the requests.
    ///
    /// Empties the existing header map in place; does nothing when no headers were set.
    pub fn clear_headers(&mut self) {
        if let Some(headers) = self.configuration.headers.as_mut() {
            headers.0.clear();
        }
    }
8800
8801    /// Determine if the budget has a wildcard path and the depth limit distance. This does nothing without the `budget` flag enabled.
8802    pub fn determine_limits(&mut self) {
8803        self.configuration.configure_budget();
8804        if self.configuration.inner_budget.is_some() {
8805            let wild_card_budget = match &self.configuration.inner_budget {
8806                Some(budget) => budget.contains_key(&*WILD_CARD_PATH),
8807                _ => false,
8808            };
8809            self.configuration.wild_card_budgeting = wild_card_budget;
8810        }
8811        if self.configuration.depth > 0 && self.domain_parsed.is_some() {
8812            if let Some(domain) = &self.domain_parsed {
8813                if let Some(segments) = domain.path_segments() {
8814                    let segments_cnt = segments.count();
8815
8816                    if segments_cnt > self.configuration.depth {
8817                        self.configuration.depth_distance = self.configuration.depth
8818                            + self.configuration.depth.abs_diff(segments_cnt);
8819                    } else {
8820                        self.configuration.depth_distance = self.configuration.depth;
8821                    }
8822                }
8823            }
8824        }
8825    }
8826
8827    #[cfg(not(feature = "sync"))]
8828    /// Sets up a subscription to receive concurrent data. This will panic if it is larger than `usize::MAX / 2`.
8829    /// Set the value to `0` to use the semaphore permits. If the subscription is going to block or use async methods,
8830    /// make sure to spawn a task to avoid losing messages. This does nothing unless the `sync` flag is enabled.
8831    ///
8832    /// # Examples
8833    ///
8834    /// Subscribe and receive messages using an async tokio environment:
8835    ///
8836    /// ```rust
8837    /// use spider::{tokio, website::Website};
8838    ///
8839    /// #[tokio::main]
8840    /// async fn main() {
8841    ///     let mut website = Website::new("http://example.com");
8842    ///     let mut rx = website.subscribe(0).unwrap();
8843    ///
8844    ///     tokio::spawn(async move {
8845    ///         while let Ok(page) = rx.recv().await {
8846    ///             tokio::spawn(async move {
8847    ///                 // Process the received page.
8848    ///                 // If performing non-blocking tasks or managing a high subscription count, configure accordingly.
8849    ///             });
8850    ///         }
8851    ///     });
8852    ///
8853    ///     website.crawl().await;
8854    /// }
8855    /// ```
8856    pub fn subscribe(&mut self, capacity: usize) -> Option<broadcast::Receiver<Page>> {
8857        None
8858    }
8859
8860    /// Sets up a subscription to receive concurrent data. This will panic if it is larger than `usize::MAX / 2`.
8861    /// Set the value to `0` to use the semaphore permits. If the subscription is going to block or use async methods,
8862    /// make sure to spawn a task to avoid losing messages. This does nothing unless the `sync` flag is enabled.
8863    ///
8864    /// # Examples
8865    ///
8866    /// Subscribe and receive messages using an async tokio environment:
8867    ///
8868    /// ```rust
8869    /// use spider::{tokio, website::Website};
8870    ///
8871    /// #[tokio::main]
8872    /// async fn main() {
8873    ///     let mut website = Website::new("http://example.com");
8874    ///     let mut rx = website.subscribe(0).unwrap();
8875    ///
8876    ///     tokio::spawn(async move {
8877    ///         while let Ok(page) = rx.recv().await {
8878    ///             tokio::spawn(async move {
8879    ///                 // Process the received page.
8880    ///                 // If performing non-blocking tasks or managing a high subscription count, configure accordingly.
8881    ///             });
8882    ///         }
8883    ///     });
8884    ///
8885    ///     website.crawl().await;
8886    /// }
8887    /// ```
8888    #[cfg(feature = "sync")]
8889    pub fn subscribe(&mut self, capacity: usize) -> Option<broadcast::Receiver<Page>> {
8890        let channel = self.channel.get_or_insert_with(|| {
8891            let (tx, rx) = broadcast::channel(
8892                (if capacity == 0 {
8893                    *DEFAULT_PERMITS
8894                } else {
8895                    capacity
8896                })
8897                .max(1),
8898            );
8899            (tx, Arc::new(rx))
8900        });
8901
8902        let rx2 = channel.0.subscribe();
8903
8904        Some(rx2)
8905    }
8906
8907    /// Get a sender for queueing extra links mid crawl. This does nothing unless the `sync` flag is enabled.
8908    #[cfg(feature = "sync")]
8909    pub fn queue(&mut self, capacity: usize) -> Option<broadcast::Sender<String>> {
8910        let channel = self.channel_queue.get_or_insert_with(|| {
8911            let (tx, rx) = broadcast::channel(capacity);
8912            (tx, Arc::new(rx))
8913        });
8914
8915        Some(channel.0.to_owned())
8916    }
8917
    /// Get a sender for queueing extra links mid crawl. This does nothing unless the `sync` flag is enabled.
    ///
    /// NOTE(review): the return type differs from the `sync` variant, which returns
    /// `Option<broadcast::Sender<String>>` — confirm the divergence is intentional.
    #[cfg(not(feature = "sync"))]
    pub fn queue(
        &mut self,
        _capacity: usize,
    ) -> Option<Arc<(broadcast::Sender<Page>, broadcast::Receiver<Page>)>> {
        None
    }
8926
    /// Remove subscriptions for data. This is useful for auto dropping subscriptions that are running on another thread. This does nothing without the `sync` flag enabled.
    #[cfg(not(feature = "sync"))]
    pub fn unsubscribe(&mut self) {}
8930
    /// Remove subscriptions for data. This is useful for auto dropping subscriptions that are running on another thread. This does nothing without the `sync` flag enabled.
    #[cfg(feature = "sync")]
    pub fn unsubscribe(&mut self) {
        // Dropping the stored channel closes the sender side for all receivers.
        self.channel.take();
    }
8936
    /// Get the channel sender to send manual subscriptions.
    ///
    /// Returns the internal channel pair created by `subscribe`, if any.
    pub fn get_channel(
        &self,
    ) -> &Option<(broadcast::Sender<Page>, Arc<broadcast::Receiver<Page>>)> {
        &self.channel
    }
8943
    /// Get the channel guard that keeps manual subscriptions from closing.
    pub fn get_channel_guard(&self) -> &Option<ChannelGuard> {
        &self.channel_guard
    }
8948
    /// Setup subscription counter to track concurrent operation completions.
    /// This helps keep a chrome instance active until all operations are completed from all threads to safely take screenshots and other actions.
    /// Make sure to call `inc` if you take a guard. Without calling `inc` in the subscription receiver the crawl will stay in an infinite loop.
    /// This does nothing without the `sync` flag enabled. You also need to use the 'chrome_store_page' to keep the page alive between requests.
    ///
    /// # Example
    ///
    /// ```
    /// use spider::tokio;
    /// use spider::website::Website;
    ///
    /// #[tokio::main]
    /// async fn main() {
    ///     let mut website: Website = Website::new("http://example.com");
    ///     let mut rx2 = website.subscribe(18).unwrap();
    ///     let mut rxg = website.subscribe_guard().unwrap();
    ///
    ///     tokio::spawn(async move {
    ///         while let Ok(page) = rx2.recv().await {
    ///             println!("📸 - {:?}", page.get_url());
    ///             page
    ///                 .screenshot(
    ///                     true,
    ///                     true,
    ///                     spider::configuration::CaptureScreenshotFormat::Png,
    ///                     Some(75),
    ///                     None::<std::path::PathBuf>,
    ///                     None,
    ///                 )
    ///                 .await;
    ///             rxg.inc();
    ///         }
    ///     });
    ///     website.crawl().await;
    /// }
    /// ```
    #[cfg(not(feature = "sync"))]
    pub fn subscribe_guard(&mut self) -> Option<ChannelGuard> {
        None
    }
8989
    /// Setup subscription counter to track concurrent operation completions.
    /// This helps keep a chrome instance active until all operations are completed from all threads to safely take screenshots and other actions.
    /// Make sure to call `inc` if you take a guard. Without calling `inc` in the subscription receiver the crawl will stay in an infinite loop.
    /// This does nothing without the `sync` flag enabled. You also need to use the 'chrome_store_page' to keep the page alive between requests.
    ///
    /// # Example
    ///
    /// ```
    /// use spider::tokio;
    /// use spider::website::Website;
    ///
    /// #[tokio::main]
    /// async fn main() {
    ///     let mut website: Website = Website::new("http://example.com");
    ///     let mut rx2 = website.subscribe(18).unwrap();
    ///     let mut rxg = website.subscribe_guard().unwrap();
    ///
    ///     tokio::spawn(async move {
    ///         while let Ok(page) = rx2.recv().await {
    ///             println!("📸 - {:?}", page.get_url());
    ///             page
    ///                 .screenshot(
    ///                     true,
    ///                     true,
    ///                     spider::configuration::CaptureScreenshotFormat::Png,
    ///                     Some(75),
    ///                     None::<std::path::PathBuf>,
    ///                     None,
    ///                 )
    ///                 .await;
    ///             rxg.inc();
    ///         }
    ///     });
    ///     website.crawl().await;
    /// }
    /// ```
    #[cfg(feature = "sync")]
    pub fn subscribe_guard(&mut self) -> Option<ChannelGuard> {
        // *note*: it would be better to handle this on page drop if the subscription is used automatically. For now we add the API upfront.
        let channel_guard = self.channel_guard.get_or_insert_with(ChannelGuard::new);
        Some(channel_guard.clone())
    }
9032
9033    #[cfg(feature = "cron")]
9034    /// Start a cron job - if you use subscribe on another thread you need to abort the handle in conjuction with runner.stop.
9035    pub async fn run_cron(&self) -> Runner {
9036        async_job::Runner::new()
9037            .add(Box::new(self.clone()))
9038            .run()
9039            .await
9040    }
9041
    #[cfg(not(feature = "control"))]
    /// Get the attached crawl id.
    pub fn get_crawl_id(&self) -> Option<&Box<String>> {
        // Stub: crawl ids are only tracked with the `control` feature.
        None
    }
9047
9048    #[cfg(feature = "control")]
9049    /// Get the attached crawl id.
9050    pub fn get_crawl_id(&self) -> Option<&Box<String>> {
9051        if self.crawl_id.is_empty() {
9052            None
9053        } else {
9054            Some(&self.crawl_id)
9055        }
9056    }
9057
9058    /// Set extra useful information.
9059    #[cfg(feature = "extra_information")]
9060    pub fn set_extra_info(&mut self, info: Option<String>) {
9061        self.extra_info = info.map(|f| f.into());
9062    }
9063
    /// Get extra information stored.
    ///
    /// Returns `None` when nothing was set via `set_extra_info`.
    #[cfg(feature = "extra_information")]
    pub fn get_extra_info(&self) -> Option<&Box<String>> {
        self.extra_info.as_ref()
    }
9069
    /// Set the initial HTML page instead of firing a request to the URL.
    ///
    /// Pass `None` to clear a previously seeded page.
    pub fn set_seeded_html(&mut self, html: Option<String>) {
        self.seed_html = html;
    }
9074
    /// Get the initial seeded html, if any was set via `set_seeded_html`.
    pub fn get_seeded_html(&self) -> &Option<String> {
        &self.seed_html
    }
9079
9080    /// Apply configuration from a `PromptConfiguration` generated by an LLM.
9081    ///
9082    /// This method takes a configuration object produced by
9083    /// `RemoteMultimodalEngine::configure_from_prompt()` and applies the
9084    /// settings to this website.
9085    ///
9086    /// # Example
9087    /// ```ignore
9088    /// use spider::features::automation::{RemoteMultimodalEngine, configure_crawler_from_prompt};
9089    ///
9090    /// let config = configure_crawler_from_prompt(
9091    ///     "http://localhost:11434/v1/chat/completions",
9092    ///     "llama3",
9093    ///     None,
9094    ///     "Crawl blog posts only, respect robots.txt, max 100 pages, 200ms delay"
9095    /// ).await?;
9096    ///
9097    /// let mut website = Website::new("https://example.com");
9098    /// website.apply_prompt_configuration(&config);
9099    /// ```
9100    #[cfg(feature = "serde")]
9101    pub fn apply_prompt_configuration(
9102        &mut self,
9103        config: &crate::features::automation::PromptConfiguration,
9104    ) -> &mut Self {
9105        // Core crawling
9106        if let Some(v) = config.respect_robots_txt {
9107            self.configuration.respect_robots_txt = v;
9108        }
9109        if let Some(v) = config.subdomains {
9110            self.configuration.subdomains = v;
9111        }
9112        if let Some(v) = config.tld {
9113            self.configuration.tld = v;
9114        }
9115        if let Some(v) = config.depth {
9116            self.configuration.depth = v;
9117        }
9118        if let Some(v) = config.delay {
9119            self.configuration.delay = v;
9120        }
9121        if let Some(ms) = config.request_timeout_ms {
9122            self.configuration.request_timeout =
9123                Some(Box::new(std::time::Duration::from_millis(ms)));
9124        }
9125        if let Some(ms) = config.crawl_timeout_ms {
9126            self.configuration.crawl_timeout = Some(std::time::Duration::from_millis(ms));
9127        }
9128
9129        // URL filtering
9130        if let Some(ref urls) = config.blacklist_url {
9131            self.configuration.blacklist_url =
9132                Some(urls.iter().map(|s| s.as_str().into()).collect());
9133        }
9134        if let Some(ref urls) = config.whitelist_url {
9135            self.configuration.whitelist_url =
9136                Some(urls.iter().map(|s| s.as_str().into()).collect());
9137        }
9138        if let Some(ref domains) = config.external_domains {
9139            for domain in domains {
9140                self.configuration
9141                    .external_domains_caseless
9142                    .insert(case_insensitive_string::CaseInsensitiveString::new(domain));
9143            }
9144        }
9145
9146        // Request settings
9147        if let Some(ref ua) = config.user_agent {
9148            self.configuration.user_agent = Some(Box::new(ua.as_str().into()));
9149        }
9150        if let Some(v) = config.http2_prior_knowledge {
9151            self.configuration.http2_prior_knowledge = v;
9152        }
9153        if let Some(v) = config.accept_invalid_certs {
9154            self.configuration.accept_invalid_certs = v;
9155        }
9156
9157        // Limits
9158        if let Some(v) = config.redirect_limit {
9159            self.configuration.redirect_limit = Box::new(v);
9160        }
9161        if let Some(ref budget_map) = config.budget {
9162            let mut budget = hashbrown::HashMap::new();
9163            for (k, v) in budget_map {
9164                budget.insert(
9165                    case_insensitive_string::CaseInsensitiveString::new(k),
9166                    *v,
9167                );
9168            }
9169            self.configuration.budget = Some(budget);
9170        }
9171        if let Some(v) = config.max_page_bytes {
9172            self.configuration.max_page_bytes = Some(v);
9173        }
9174
9175        // Content
9176        if let Some(v) = config.full_resources {
9177            self.configuration.full_resources = v;
9178        }
9179        if let Some(v) = config.only_html {
9180            self.configuration.only_html = v;
9181        }
9182        if let Some(v) = config.return_page_links {
9183            self.configuration.return_page_links = v;
9184        }
9185
9186        // Chrome options
9187        #[cfg(feature = "chrome")]
9188        if let Some(true) = config.use_chrome {
9189            // Chrome is enabled via feature flag, this is a hint for the user
9190        }
9191        if let Some(ref mode) = config.stealth_mode {
9192            self.configuration.stealth_mode = match mode.to_lowercase().as_str() {
9193                "basic" => spider_fingerprint::configs::Tier::Basic,
9194                "low" => spider_fingerprint::configs::Tier::Low,
9195                "mid" => spider_fingerprint::configs::Tier::Mid,
9196                "full" => spider_fingerprint::configs::Tier::Full,
9197                _ => spider_fingerprint::configs::Tier::None,
9198            };
9199        }
9200        if config.viewport_width.is_some() || config.viewport_height.is_some() {
9201            let width = config.viewport_width.unwrap_or(800);
9202            let height = config.viewport_height.unwrap_or(600);
9203            self.configuration.viewport = Some(crate::configuration::Viewport::new(width, height));
9204        }
9205        #[cfg(feature = "chrome")]
9206        {
9207            let mut wait_for = self
9208                .configuration
9209                .wait_for
9210                .take()
9211                .unwrap_or_default();
9212
9213            if let Some(true) = config.wait_for_idle_network {
9214                wait_for.idle_network = Some(
9215                    crate::features::chrome_common::WaitForIdleNetwork::new(Some(
9216                        std::time::Duration::from_secs(30),
9217                    )),
9218                );
9219            }
9220            if let Some(ms) = config.wait_for_delay_ms {
9221                wait_for.delay = Some(crate::features::chrome_common::WaitForDelay::new(Some(
9222                    std::time::Duration::from_millis(ms),
9223                )));
9224            }
9225            if let Some(ref selector) = config.wait_for_selector {
9226                wait_for.selector = Some(crate::features::chrome_common::WaitForSelector::new(
9227                    Some(std::time::Duration::from_secs(30)),
9228                    selector.clone(),
9229                ));
9230            }
9231
9232            if wait_for.idle_network.is_some()
9233                || wait_for.delay.is_some()
9234                || wait_for.selector.is_some()
9235            {
9236                self.configuration.wait_for = Some(wait_for);
9237            }
9238        }
9239        #[cfg(feature = "chrome")]
9240        if let Some(ref js) = config.evaluate_on_new_document {
9241            self.configuration.evaluate_on_new_document = Some(Box::new(js.clone()));
9242        }
9243
9244        // Performance
9245        if let Some(v) = config.shared_queue {
9246            self.configuration.shared_queue = v;
9247        }
9248        if let Some(v) = config.retry {
9249            self.configuration.retry = v;
9250        }
9251
9252        self
9253    }
9254
9255    /// Configure the website from a natural language prompt using an LLM.
9256    ///
9257    /// This is a convenience method that calls the LLM to generate configuration
9258    /// and applies it to the website in one step.
9259    ///
9260    /// # Arguments
9261    /// * `api_url` - OpenAI-compatible chat completions endpoint
9262    /// * `model_name` - Model identifier (e.g., "gpt-4", "llama3", "qwen2.5")
9263    /// * `api_key` - Optional API key for authenticated endpoints
9264    /// * `prompt` - Natural language description of crawling requirements
9265    ///
9266    /// # Example
9267    /// ```ignore
9268    /// let mut website = Website::new("https://example.com");
9269    /// website.configure_from_prompt(
9270    ///     "http://localhost:11434/v1/chat/completions",
9271    ///     "llama3",
9272    ///     None,
9273    ///     "Only crawl product pages, use 100ms delay, max depth 5, respect robots.txt"
9274    /// ).await?;
9275    ///
9276    /// website.crawl().await;
9277    /// ```
9278    /// Requires the `agent` and `serde` features.
9279    #[cfg(all(feature = "agent", feature = "serde"))]
9280    pub async fn configure_from_prompt(
9281        &mut self,
9282        api_url: &str,
9283        model_name: &str,
9284        api_key: Option<&str>,
9285        prompt: &str,
9286    ) -> Result<&mut Self, crate::features::automation::EngineError> {
9287        let config = crate::features::automation::configure_crawler_from_prompt(
9288            api_url, model_name, api_key, prompt,
9289        )
9290        .await?;
9291        Ok(self.apply_prompt_configuration(&config))
9292    }
9293}
9294
9295/// Channel broadcast send the Page to receivers.
9296pub fn channel_send_page(
9297    channel: &Option<(
9298        tokio::sync::broadcast::Sender<Page>,
9299        std::sync::Arc<tokio::sync::broadcast::Receiver<Page>>,
9300    )>,
9301    page: Page,
9302    channel_guard: &Option<ChannelGuard>,
9303) {
9304    if let Some(c) = channel {
9305        if c.0.send(page).is_ok() {
9306            if let Some(guard) = channel_guard {
9307                ChannelGuard::inc_guard(&guard.0 .1)
9308            }
9309        }
9310    }
9311}
9312
/// Guard a channel from closing until all concurrent operations are done.
///
/// State is shared via `Arc`, so all clones observe the same values:
/// - `AtomicBool`: guard enabled flag (`lock` is a no-op when false).
/// - first `AtomicUsize`: pages successfully sent to subscribers (`inc_guard`).
/// - second `AtomicUsize`: completions reported by receivers (`inc`).
#[derive(Debug, Clone)]
pub struct ChannelGuard(Arc<(AtomicBool, AtomicUsize, AtomicUsize)>);
9316
impl ChannelGuard {
    /// Create a new channel guard. The tuple has the guard control and the counter.
    pub(crate) fn new() -> ChannelGuard {
        ChannelGuard(Arc::new((
            AtomicBool::new(true), // guard enabled by default
            AtomicUsize::new(0),   // pages sent to subscribers (bumped by `inc_guard`)
            AtomicUsize::new(0),   // operations completed by receivers (bumped by `inc`)
        )))
    }
    /// Lock the channel until complete. This is only used for when storing the chrome page outside.
    ///
    /// Spins (yielding to the tokio scheduler) until the completions counter
    /// equals the snapshot of the sends counter taken at entry, then resets
    /// the completions counter to 0 for the next cycle. No-op when the guard
    /// flag is false.
    pub(crate) async fn lock(&self) {
        if self.0 .0.load(Ordering::Relaxed) {
            // Snapshot how many pages had been sent when locking began.
            let old = self.0 .1.load(Ordering::Relaxed);

            // The CAS succeeds only once receivers have reported `old`
            // completions, atomically resetting the counter to 0.
            // NOTE(review): sends that land after this snapshot are not waited
            // on in this cycle — presumably intentional; confirm.
            while self
                .0
                 .2
                .compare_exchange_weak(old, 0, Ordering::Acquire, Ordering::Relaxed)
                .is_err()
            {
                tokio::task::yield_now().await;
            }
            std::sync::atomic::fence(Ordering::Acquire);
        }
    }

    /// Set the guard control manually. If this is set to false the loop will not enter.
    pub fn guard(&mut self, guard: bool) {
        self.0 .0.store(guard, Ordering::Release);
    }

    /// Increment the guard channel completions.
    ///
    /// Call this from the subscription receiver after finishing work on a
    /// page; `lock` waits on these increments.
    // rename on next major since logic is now flow-controlled.
    pub fn inc(&mut self) {
        self.0 .2.fetch_add(1, std::sync::atomic::Ordering::Release);
    }

    /// Increment a guard channel completions.
    // Bumps the sends counter (`.1`); called by `channel_send_page` on a
    // successful broadcast send.
    pub(crate) fn inc_guard(guard: &AtomicUsize) {
        guard.fetch_add(1, std::sync::atomic::Ordering::Release);
    }
}
9359
impl Drop for ChannelGuard {
    fn drop(&mut self) {
        // Disable the guard flag so `lock` becomes a no-op.
        // NOTE(review): the flag is shared through `Arc` and `ChannelGuard` is
        // `Clone`, so dropping *any* clone disables the guard for all clones —
        // confirm this is the intended semantics.
        self.0 .0.store(false, Ordering::Release);
    }
}
9365
#[cfg(feature = "cron")]
/// Start a cron job taking ownership of the website
pub async fn run_cron(website: Website) -> Runner {
    let runner = async_job::Runner::new().add(Box::new(website));
    runner.run().await
}
9371
#[cfg(feature = "cron")]
#[async_trait]
impl Job for Website {
    /// Parse the configured cron expression; log and yield `None` on failure.
    fn schedule(&self) -> Option<async_job::Schedule> {
        self.configuration
            .cron_str
            .parse()
            .map_err(|e| log::error!("{:?}", e))
            .ok()
    }
    /// Run the configured crawl or scrape each time the cron fires.
    async fn handle(&mut self) {
        log::info!(
            "CRON: {} - cron job running {}",
            self.get_url().as_ref(),
            self.now()
        );
        match self.configuration.cron_type {
            CronType::Crawl => self.crawl().await,
            _ => self.scrape().await,
        }
    }
}
9397
9398impl std::fmt::Display for Website {
9399    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
9400        write!(
9401            f,
9402            "Website:\n  URL: {}\n ID: {:?}\n Configuration: {:?}",
9403            self.get_url(),
9404            self.get_crawl_id(),
9405            self.configuration
9406        )
9407    }
9408}
9409
// `Website` implements `Debug` and `Display`, so it can be used directly as an
// error value (e.g. boxed into `Box<dyn std::error::Error>`).
impl std::error::Error for Website {}
9411
9412#[tokio::test]
9413#[cfg(not(feature = "decentralized"))]
9414async fn crawl() {
9415    let url = "https://choosealicense.com";
9416    let mut website: Website = Website::new(url);
9417    website.crawl().await;
9418    assert!(
9419        website
9420            .links_visited
9421            .contains(&"https://choosealicense.com/licenses/".into()),
9422        "{:?}",
9423        website.links_visited
9424    );
9425}
9426
#[tokio::test]
#[cfg(feature = "cron")]
async fn crawl_cron() {
    // Integration test: schedule a crawl every 5 seconds and collect the pages
    // streamed to a subscriber while the runner is alive.
    let url = "https://choosealicense.com";
    let mut website: Website = Website::new(&url)
        .with_cron("1/5 * * * * *", Default::default())
        .build()
        .unwrap();
    let mut rx2 = website.subscribe(16).unwrap();

    // handle an event on every cron
    let join_handle = tokio::spawn(async move {
        let mut links_visited = HashSet::new();
        while let Ok(res) = rx2.recv().await {
            let url = res.get_url();
            links_visited.insert(CaseInsensitiveString::new(url));
        }
        // The licenses page should have been crawled at least once.
        assert!(
            links_visited.contains(&CaseInsensitiveString::from(
                "https://choosealicense.com/licenses/"
            )),
            "{:?}",
            links_visited
        );
    });

    let mut runner = website.run_cron().await;
    log::debug!("Starting the Runner for 10 seconds");
    // Let the cron fire a couple of times before stopping.
    tokio::time::sleep(Duration::from_secs(10)).await;
    runner.stop().await;
    // Abort the subscriber task; `recv` would otherwise block forever.
    join_handle.abort();
    let _ = join_handle.await;
}
9460
#[tokio::test]
#[cfg(feature = "cron")]
async fn crawl_cron_own() {
    // Same as `crawl_cron`, but the runner takes ownership of the website via
    // the free `run_cron` function instead of the method.
    let url = "https://choosealicense.com";
    let mut website: Website = Website::new(&url)
        .with_cron("1/5 * * * * *", Default::default())
        .build()
        .unwrap();
    let mut rx2 = website.subscribe(16).unwrap();

    // handle an event on every cron
    let join_handle = tokio::spawn(async move {
        let mut links_visited = HashSet::new();
        while let Ok(res) = rx2.recv().await {
            let url = res.get_url();
            links_visited.insert(CaseInsensitiveString::new(url));
        }
        // The licenses page should have been crawled at least once.
        assert!(
            links_visited.contains(&CaseInsensitiveString::from(
                "https://choosealicense.com/licenses/"
            )),
            "{:?}",
            links_visited
        );
    });

    let mut runner = run_cron(website).await;
    log::debug!("Starting the Runner for 10 seconds");
    // Let the cron fire a couple of times before stopping.
    tokio::time::sleep(Duration::from_secs(10)).await;
    let _ = tokio::join!(runner.stop(), join_handle);
}
9492
9493#[tokio::test]
9494#[cfg(not(feature = "decentralized"))]
9495async fn scrape() {
9496    let mut website: Website = Website::new("https://choosealicense.com");
9497    website.scrape().await;
9498    assert!(
9499        website
9500            .links_visited
9501            .contains(&"https://choosealicense.com/licenses/".into()),
9502        "{:?}",
9503        website.links_visited
9504    );
9505
9506    assert!(!website.get_pages().unwrap()[0].get_html().is_empty());
9507}
9508
9509#[tokio::test]
9510#[cfg(not(feature = "decentralized"))]
9511async fn crawl_invalid() {
9512    let mut website: Website = Website::new("https://w.com");
9513    website.crawl().await;
9514    assert!(website.links_visited.len() <= 1); // only the target url should exist
9515}
9516
#[tokio::test]
#[cfg(feature = "decentralized")]
async fn crawl_invalid() {
    // Only the target url should be recorded for an unreachable domain.
    let domain = "https://w.com";
    let mut website = Website::new(domain);
    website.crawl().await;

    let mut expected: Box<HashSet<CaseInsensitiveString>> = Box::new(HashSet::new());
    // TODO: remove trailing slash mutate
    expected.insert(format!("{}/", domain).into());

    assert_eq!(website.links_visited.get_links(), *expected);
}
9528
9529#[tokio::test]
9530async fn not_crawl_blacklist() {
9531    let mut website: Website = Website::new("https://choosealicense.com");
9532    website.configuration.blacklist_url = Some(Vec::from([CompactString::from(
9533        "https://choosealicense.com/licenses/",
9534    )]));
9535
9536    website.crawl().await;
9537    assert!(
9538        !website
9539            .links_visited
9540            .contains(&"https://choosealicense.com/licenses/".into()),
9541        "{:?}",
9542        website.links_visited
9543    );
9544}
9545
#[tokio::test]
#[cfg(feature = "regex")]
async fn not_crawl_blacklist_regex() {
    // A regex blacklist matching the whole domain blocks every link.
    let mut website = Website::new("https://choosealicense.com");
    website.with_blacklist_url(Some(vec!["choosealicense.com".into()]));
    website.crawl().await;
    assert_eq!(website.links_visited.len(), 0);
}
9554
#[test]
#[cfg(feature = "ua_generator")]
fn randomize_website_agent() {
    // A generated user agent must never be empty.
    assert!(!get_ua(false).is_empty());
}
9560
9561#[tokio::test]
9562#[cfg(not(feature = "decentralized"))]
9563async fn test_respect_robots_txt() {
9564    let mut website: Website = Website::new("https://stackoverflow.com");
9565    website.configuration.respect_robots_txt = true;
9566    website.configuration.user_agent = Some(Box::new("*".into()));
9567
9568    let (client, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
9569        website.setup().await;
9570
9571    website.configure_robots_parser(&client).await;
9572
9573    assert_eq!(website.configuration.delay, 0);
9574
9575    assert!(!&website
9576        .is_allowed(&"https://stackoverflow.com/posts/".into())
9577        .eq(&ProcessLinkStatus::Allowed));
9578
9579    // test match for bing bot
9580    let mut website_second: Website = Website::new("https://www.mongodb.com");
9581    website_second.configuration.respect_robots_txt = true;
9582    website_second.configuration.user_agent = Some(Box::new("bingbot".into()));
9583
9584    let (client_second, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
9585        website_second.setup().await;
9586    website_second.configure_robots_parser(&client_second).await;
9587
9588    assert!(!&website
9589        .is_allowed(&"https://www.mongodb.com/community/forums/auth/".into())
9590        .eq(&ProcessLinkStatus::Allowed));
9591
9592    // assert_eq!(website_second.configuration.delay, 60000); // should equal one minute in ms
9593}
9594
9595#[tokio::test]
9596#[cfg(not(feature = "decentralized"))]
9597async fn test_crawl_subdomains() {
9598    let mut website: Website = Website::new("https://choosealicense.com");
9599    website.configuration.subdomains = true;
9600    website.crawl().await;
9601    assert!(
9602        website
9603            .links_visited
9604            .contains(&"https://choosealicense.com/licenses/".into()),
9605        "{:?}",
9606        website.links_visited
9607    );
9608}
9609
9610#[tokio::test]
9611#[cfg(all(
9612    not(feature = "regex"),
9613    not(feature = "openai"),
9614    not(feature = "gemini")
9615))]
9616async fn test_with_configuration() {
9617    let mut website = Website::new("https://choosealicense.com");
9618
9619    website
9620        .with_respect_robots_txt(true)
9621        .with_subdomains(true)
9622        .with_tld(false)
9623        .with_delay(0)
9624        .with_request_timeout(None)
9625        .with_http2_prior_knowledge(false)
9626        .with_user_agent(Some(crate::page::TEST_AGENT_NAME))
9627        .with_headers(None)
9628        .with_proxies(None);
9629
9630    let mut configuration = Box::new(configuration::Configuration::new());
9631
9632    configuration.respect_robots_txt = true;
9633    configuration.subdomains = true;
9634    configuration.tld = false;
9635    configuration.delay = 0;
9636    configuration.request_timeout = None;
9637    configuration.http2_prior_knowledge = false;
9638    configuration.user_agent = Some(Box::new(CompactString::new(crate::page::TEST_AGENT_NAME)));
9639    configuration.headers = None;
9640    configuration.proxies = None;
9641
9642    assert!(
9643        website.configuration == configuration,
9644        "Left\n{:?}\n\nRight\n{:?}",
9645        website.configuration,
9646        configuration
9647    );
9648}
9649
#[tokio::test]
#[cfg(all(feature = "glob", not(feature = "decentralized")))]
async fn test_crawl_glob() {
    let mut website =
        Website::new("https://choosealicense.com/licenses/{mit,apache-2.0,mpl-2.0}/");
    website.crawl().await;

    // check for either https/http in collection
    let found = website
        .links_visited
        .contains(&"https://choosealicense.com/licenses/".into())
        || website
            .links_visited
            .contains(&"http://choosealicense.com/licenses/".into());

    assert!(found, "{:?}", website.links_visited);
}
9669
9670#[tokio::test]
9671#[cfg(not(feature = "decentralized"))]
9672async fn test_crawl_tld() {
9673    let mut website: Website = Website::new("https://choosealicense.com");
9674    website.configuration.tld = true;
9675    website.crawl().await;
9676
9677    assert!(
9678        website
9679            .links_visited
9680            .contains(&"https://choosealicense.com/licenses/".into()),
9681        "{:?}",
9682        website.links_visited
9683    );
9684}
9685
#[tokio::test]
#[cfg(all(feature = "sync", not(feature = "decentralized")))]
async fn test_crawl_subscription() {
    let mut website = Website::new("https://choosealicense.com");
    let mut rx2 = website.subscribe(100).unwrap();

    // Count every page streamed to the subscriber until the channel closes.
    let counter = tokio::spawn(async move {
        let mut received = 0;
        while rx2.recv().await.is_ok() {
            received += 1;
        }
        received
    });

    website.crawl().await;
    website.unsubscribe();
    let website_links = website.get_links().len();
    let count = counter.await.unwrap();

    // no subscription if did not fulfill. The root page is always captured in links.
    assert!(count == website_links, "{:?}", true);
}
9709
#[tokio::test]
#[cfg(all(feature = "socks", not(feature = "decentralized")))]
async fn test_crawl_proxy() {
    let mut website = Website::new("https://choosealicense.com");
    website
        .configuration
        .proxies
        .get_or_insert(Default::default())
        .push("socks5://127.0.0.1:1080".into());

    website.crawl().await;

    // Proxy may return http or https in socks5 per platform.
    // We may want to replace the protocol with the host of the platform regardless of proxy response.
    let mut license_found = false;
    for link in website.get_links() {
        if link.as_ref().contains("/licenses/") {
            license_found = true;
            break;
        }
    }

    assert!(license_found, "{:?}", website.links_visited);
}
9734
9735#[tokio::test]
9736async fn test_link_duplicates() {
9737    fn has_unique_elements<T>(iter: T) -> bool
9738    where
9739        T: IntoIterator,
9740        T::Item: Eq + std::hash::Hash,
9741    {
9742        let mut uniq = HashSet::new();
9743        iter.into_iter().all(move |x| uniq.insert(x))
9744    }
9745
9746    let mut website: Website = Website::new("http://0.0.0.0:8000");
9747    website.crawl().await;
9748
9749    assert!(has_unique_elements(website.links_visited.get_links()));
9750}
9751
9752#[tokio::test]
9753async fn test_crawl_budget() {
9754    let mut website: Website = Website::new("https://choosealicense.com");
9755    website.with_budget(Some(HashMap::from([("*", 1), ("/licenses", 1)])));
9756    website.crawl().await;
9757
9758    assert!(website.links_visited.len() <= 1);
9759}
9760
#[tokio::test]
#[cfg(feature = "control")]
#[ignore]
async fn test_crawl_pause_resume() {
    use crate::utils::{pause, resume};

    let domain = "https://choosealicense.com/";
    let mut website: Website = Website::new(&domain);

    let start = tokio::time::Instant::now();

    // Pause the crawl immediately, hold it for 5 seconds, then resume it.
    tokio::spawn(async move {
        pause(domain).await;
        // static website test pause/resume - scan will never take longer than 5secs for target website choosealicense
        tokio::time::sleep(Duration::from_millis(5000)).await;
        resume(domain).await;
    });

    website.crawl().await;

    let duration = start.elapsed();

    // The pause must have stalled the crawl for at least the 5 second window.
    assert!(duration.as_secs() >= 5, "{:?}", duration);

    assert!(
        website
            .links_visited
            .contains(&"https://choosealicense.com/licenses/".into()),
        "{:?}",
        website.links_visited
    );
}
9793
#[cfg(feature = "control")]
#[ignore]
#[tokio::test]
async fn test_crawl_shutdown() {
    use crate::utils::shutdown;

    // use target blog to prevent shutdown of prior crawler
    let domain = "https://spider.cloud/";
    let mut website: Website = Website::new(&domain);

    // Issue the shutdown immediately; the crawl should stop almost at once.
    tokio::spawn(async move {
        shutdown(domain).await;
    });

    website.crawl().await;
    let links_visited_count = website.links_visited.len();

    // At most the target url itself should have been visited.
    assert!(links_visited_count <= 1, "{:?}", links_visited_count);
}
9813
#[tokio::test]
#[cfg(all(feature = "cache_request", not(feature = "decentralized")))]
async fn test_cache() {
    let domain = "https://choosealicense.com/";
    let mut website: Website = Website::new(&domain);
    website.configuration.cache = true;

    // First crawl populates the request cache.
    let fresh_start = tokio::time::Instant::now();
    website.crawl().await;
    let fresh_duration = fresh_start.elapsed();

    // Second crawl should be served from the cache.
    let cached_start = tokio::time::Instant::now();
    website.crawl().await;
    let cached_duration = cached_start.elapsed();

    // cache should be faster at least 5x.
    assert!(
        fresh_duration.as_millis() > cached_duration.as_millis() * 5,
        "{:?}",
        cached_duration
    );
}
9836
#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_client_rotator_round_robin() {
        // Build 3 simple clients to verify round-robin cycling.
        // Fix: replaced `unsafe { ... unwrap_unchecked() }` with `expect` —
        // if the builder ever failed, the old code was undefined behavior;
        // now it panics with a clear message instead.
        let clients: Vec<Client> = (0..3)
            .map(|_| {
                #[cfg(not(feature = "cache_request"))]
                {
                    crate::ClientBuilder::new()
                        .build()
                        .expect("client should build")
                }
                #[cfg(feature = "cache_request")]
                {
                    reqwest_middleware::ClientBuilder::new(
                        reqwest::ClientBuilder::new()
                            .build()
                            .expect("client should build"),
                    )
                    .build()
                }
            })
            .collect();

        let rotator = ClientRotator::new(clients);
        assert_eq!(rotator.len(), 3);
        assert!(!rotator.is_empty());

        // Each call to next() should advance the index.
        // We verify the pattern cycles by checking the internal index.
        let _ = rotator.next(); // index 0
        let _ = rotator.next(); // index 1
        let _ = rotator.next(); // index 2
        let _ = rotator.next(); // index 3 -> wraps to 0

        // After 4 calls, the atomic index should be 4.
        let current_idx = rotator.index.load(Ordering::Relaxed);
        assert_eq!(current_idx, 4);
    }

    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_with_multiple_proxies() {
        let mut website = Website::new("http://example.com");
        website.configuration.with_proxies(Some(vec![
            "http://proxy1.example.com:8080".to_string(),
            "http://proxy2.example.com:8080".to_string(),
            "http://proxy3.example.com:8080".to_string(),
        ]));

        let rotator = website.build_rotated_clients();
        assert!(rotator.is_some(), "Should build rotator with 3 proxies");
        let rotator = rotator.unwrap();
        assert_eq!(rotator.len(), 3);
    }

    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_single_proxy_returns_none() {
        let mut website = Website::new("http://example.com");
        website.configuration.with_proxies(Some(vec![
            "http://proxy1.example.com:8080".to_string(),
        ]));

        let rotator = website.build_rotated_clients();
        assert!(
            rotator.is_none(),
            "Should not build rotator with only 1 proxy"
        );
    }

    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_no_proxies_returns_none() {
        let website = Website::new("http://example.com");
        let rotator = website.build_rotated_clients();
        assert!(
            rotator.is_none(),
            "Should not build rotator with no proxies"
        );
    }
}