1use crate::black_list::contains;
2use crate::client::redirect::Policy;
3use crate::compact_str::CompactString;
4use crate::configuration::{
5 self, get_ua, AutomationScriptsMap, Configuration, ExecutionScriptsMap, RedirectPolicy,
6 SerializableHeaderMap,
7};
8use crate::{page::build, utils::PageResponse};
9
10#[cfg(feature = "smart")]
11use crate::features::chrome::OnceBrowser;
12use crate::features::chrome_common::RequestInterceptConfiguration;
13#[cfg(feature = "disk")]
14use crate::features::disk::DatabaseHandler;
15use crate::packages::robotparser::parser::RobotFileParser;
16use crate::page::{
17 AntiBotTech, Page, PageLinkBuildSettings, CHROME_UNKNOWN_STATUS_ERROR, UNKNOWN_STATUS_ERROR,
18};
19use crate::utils::abs::{convert_abs_url, parse_absolute_url};
20use crate::utils::interner::ListBucket;
21use crate::utils::{
22 crawl_duration_expired, emit_log, emit_log_shutdown, get_path_from_url, get_semaphore,
23 networking_capable, prepare_url, setup_website_selectors, spawn_set, AllowedDomainTypes,
24};
25use crate::{CaseInsensitiveString, Client, ClientBuilder, RelativeSelectors};
26#[cfg(feature = "cron")]
27use async_job::{async_trait, Job, Runner};
28use hashbrown::{HashMap, HashSet};
29use reqwest::header::REFERER;
30use reqwest::StatusCode;
31use std::fmt;
32use std::net::IpAddr;
33use std::sync::atomic::{AtomicBool, AtomicI8, AtomicUsize, Ordering};
34use std::sync::Arc;
35use std::time::{Duration, Instant};
36use tokio::{
37 sync::{broadcast, Semaphore},
38 task::JoinSet,
39 time::Interval,
40};
41use tokio_stream::StreamExt;
42use url::Url;
43
44#[cfg(feature = "cache_request")]
45use http_cache_reqwest::{Cache, CacheMode, HttpCache, HttpCacheOptions};
46
47#[cfg(feature = "cache_request")]
48pub use http_global_cache::CACACHE_MANAGER;
49
/// Maximum duration a retry backoff wait may reach (60 seconds).
const BACKOFF_MAX_DURATION: tokio::time::Duration = tokio::time::Duration::from_secs(60);
52
53pub fn calc_limits(multiplier: usize) -> usize {
55 let logical = num_cpus::get();
56 let physical = num_cpus::get_physical();
57
58 let sem_limit = if logical > physical {
59 (logical) / (physical)
60 } else {
61 logical
62 };
63
64 let (sem_limit, sem_max) = if logical == physical {
65 (sem_limit * physical, 30 * multiplier)
66 } else {
67 (sem_limit * 2, 20 * multiplier)
68 };
69
70 sem_limit.max(sem_max)
71}
72
/// Exact phrases that identify a benign "enable JavaScript" challenge page.
static JS_SAFE_CHALLENGE_PATTERNS: &[&str] = &[
    r#"Enable JavaScript and cookies to continue"#,
    r#"To continue, please enable JavaScript in your browser settings"#,
    r#"Please enable JavaScript to view the page content"#,
];
79
80pub fn is_safe_javascript_challenge(page: &Page) -> bool {
82 let page = page.get_html_bytes_u8();
83
84 let page_size = page.len();
85
86 if page_size == 0 || page_size > 10_000 {
87 return false;
88 }
89
90 AC_JS_CHALLENGE.find(page).is_some()
91}
92
/// Bind the client builder to a specific network interface on targets that
/// support it (not available with the `wreq` backend).
#[cfg(all(
    any(
        target_os = "android",
        target_os = "fuchsia",
        target_os = "illumos",
        target_os = "ios",
        target_os = "linux",
        target_os = "macos",
        target_os = "solaris",
        target_os = "tvos",
        target_os = "visionos",
        target_os = "watchos",
    ),
    not(feature = "wreq")
))]
pub fn set_interface(client: ClientBuilder, network_interface: &str) -> ClientBuilder {
    client.interface(&network_interface)
}
112
/// Fallback for targets/backends without interface binding support — returns
/// the builder unchanged.
#[cfg(not(any(
    feature = "wreq",
    target_os = "android",
    target_os = "fuchsia",
    target_os = "illumos",
    target_os = "ios",
    target_os = "linux",
    target_os = "macos",
    target_os = "solaris",
    target_os = "tvos",
    target_os = "visionos",
    target_os = "watchos",
)))]
pub fn set_interface(client: ClientBuilder, _interface: &str) -> ClientBuilder {
    client
}
130
lazy_static! {
    /// Matcher for the known JavaScript-challenge phrases.
    static ref AC_JS_CHALLENGE: aho_corasick::AhoCorasick = aho_corasick::AhoCorasick::new(JS_SAFE_CHALLENGE_PATTERNS).expect("safe challenges");
    /// Default number of concurrency permits derived from CPU topology.
    pub static ref DEFAULT_PERMITS: usize = calc_limits(1);
    /// Shared crawl semaphore; `SEMAPHORE_MULTIPLIER` (env) scales the permit
    /// count, clamped to a minimum of 1.
    pub(crate) static ref SEM_SHARED: Arc<Semaphore> = {
        let base_limit = match std::env::var("SEMAPHORE_MULTIPLIER") {
            Ok(multiplier) => match multiplier.parse::<isize>() {
                Ok(parsed_value) => (*DEFAULT_PERMITS as isize)
                    .wrapping_mul(parsed_value)
                    .max(1) as usize,
                Err(_) => *DEFAULT_PERMITS,
            },
            _ => *DEFAULT_PERMITS,
        };
        Arc::new(Semaphore::const_new(base_limit))
    };
    /// Max links kept in memory before spilling to disk (env-overridable via
    /// `LINKS_VISITED_MEMORY_LIMIT`).
    pub(crate) static ref LINKS_VISITED_MEMORY_LIMIT: usize = {
        const DEFAULT_LIMIT: usize = 15_000;

        match std::env::var("LINKS_VISITED_MEMORY_LIMIT") {
            Ok(limit) => limit.parse::<usize>().unwrap_or(DEFAULT_LIMIT),
            _ => DEFAULT_LIMIT
        }
    };
    /// Key used for wildcard ("*") budget entries.
    static ref WILD_CARD_PATH: CaseInsensitiveString = CaseInsensitiveString::from("*");
}
159
#[cfg(not(feature = "decentralized"))]
lazy_static! {
    /// Crawl semaphore; `SEMAPHORE_MULTIPLIER` (env) scales the CPU-derived
    /// limit, clamped to a minimum of 1.
    static ref SEM: Semaphore = {
        let base_limit = calc_limits(1);

        let base_limit = match std::env::var("SEMAPHORE_MULTIPLIER") {
            Ok(multiplier) => match multiplier.parse::<isize>() {
                Ok(parsed_value) => (base_limit as isize * parsed_value).max(1) as usize,
                Err(_) => base_limit,
            },
            _ => base_limit,
        };

        Semaphore::const_new(base_limit)
    };
}
177
#[cfg(feature = "decentralized")]
lazy_static! {
    /// Worker endpoints parsed from the comma-separated env vars
    /// `SPIDER_WORKER_SCRAPER` and `SPIDER_WORKER`, with localhost defaults.
    static ref WORKERS: HashSet<String> = {
        let mut set: HashSet<_> = HashSet::new();

        for worker in std::env::var("SPIDER_WORKER_SCRAPER")
            .unwrap_or_else(|_| "http://127.0.0.1:3031".to_string())
            .split(",")
        {
            set.insert(worker.to_string());
        }

        for worker in std::env::var("SPIDER_WORKER")
            .unwrap_or_else(|_| "http://127.0.0.1:3030".to_string())
            .split(",")
        {
            set.insert(worker.to_string());
        }

        set
    };
    /// Semaphore sized proportionally to the number of workers.
    static ref SEM: Semaphore = {
        let sem_limit = calc_limits(3);
        Semaphore::const_new(sem_limit * WORKERS.len())
    };
}
205
/// Lifecycle status of a website crawl.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CrawlStatus {
    /// Crawl has not started yet.
    #[default]
    Start,
    /// Not actively crawling.
    Idle,
    /// Actively crawling.
    Active,
    /// Crawling is blocked.
    Blocked,
    /// Blocked by the firewall blocklist.
    FirewallBlocked,
    /// The server returned an error.
    ServerError,
    /// Could not connect to the target.
    ConnectError,
    /// Rate limited by the target.
    RateLimited,
    /// Nothing to crawl.
    Empty,
    /// Invalid crawl target.
    Invalid,
    /// Shutdown requested (control feature).
    #[cfg(feature = "control")]
    Shutdown,
    /// Crawl paused (control feature).
    #[cfg(feature = "control")]
    Paused,
}
240
/// Outcome of checking whether a link may be processed.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProcessLinkStatus {
    /// The link may be crawled.
    #[default]
    Allowed,
    /// The link is blocked (already visited, denied, or over depth).
    Blocked,
    /// The crawl budget for the link is exhausted.
    BudgetExceeded,
}
253
/// Which operation a scheduled (cron) run performs.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CronType {
    /// Run a crawl.
    #[default]
    Crawl,
    /// Run a scrape.
    Scrape,
}
264
/// Extra information detected about a website during crawling.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum WebsiteMetaInfo {
    /// The site requires JavaScript to render content.
    RequiresJavascript,
    /// Apache-style 403 response detected.
    Apache403,
    /// OpenResty-style 403 response detected.
    OpenResty403,
    /// Nothing notable detected.
    #[default]
    None,
}
279
/// Callback invoked for each discovered link; may rewrite the link and the
/// optional associated string that travels with it.
pub type OnLinkFindCallback = Arc<
    dyn Fn(CaseInsensitiveString, Option<String>) -> (CaseInsensitiveString, Option<String>)
        + Send
        + Sync,
>;
286
/// Trait alias for thread-safe closures deciding whether a page should be crawled.
pub trait OnShouldCrawlClosure: Fn(&Page) -> bool + Send + Sync + 'static {}
/// Blanket impl: every matching closure satisfies the alias.
impl<F: Fn(&Page) -> bool + Send + Sync + 'static> OnShouldCrawlClosure for F {}
290
/// Callback used to decide whether crawling should continue for a page.
#[derive(Clone)]
pub enum OnShouldCrawlCallback {
    /// Plain function pointer.
    Fn(fn(&Page) -> bool),
    /// Shared, heap-allocated closure.
    Closure(Arc<dyn OnShouldCrawlClosure>),
}
300impl OnShouldCrawlCallback {
301 fn call(&self, page: &Page) -> bool {
302 match self {
303 Self::Fn(func) => func(page),
304 Self::Closure(closure) => closure(page),
305 }
306 }
307}
308
/// Round-robin rotator over a pool of HTTP clients.
#[derive(Clone)]
pub struct ClientRotator {
    // The client pool to rotate through.
    clients: Vec<Client>,
    // Monotonically increasing rotation counter (wraps on overflow).
    index: Arc<AtomicUsize>,
}
316
317impl ClientRotator {
318 pub fn new(clients: Vec<Client>) -> Self {
320 Self {
321 clients,
322 index: Arc::new(AtomicUsize::new(0)),
323 }
324 }
325
326 pub fn next(&self) -> &Client {
328 let idx = self.index.fetch_add(1, Ordering::Relaxed) % self.clients.len();
329 &self.clients[idx]
330 }
331
332 pub fn len(&self) -> usize {
334 self.clients.len()
335 }
336
337 pub fn is_empty(&self) -> bool {
339 self.clients.is_empty()
340 }
341}
342
/// A website target and all state gathered while crawling it.
#[derive(Clone, Default)]
pub struct Website {
    /// Crawl configuration (budgets, delays, headers, ...).
    pub configuration: Box<Configuration>,
    /// Callback run for every discovered link.
    pub on_link_find_callback: Option<OnLinkFindCallback>,
    /// Callback deciding whether a fetched page should be processed.
    pub on_should_crawl_callback: Option<OnShouldCrawlCallback>,
    /// Identifier mixed into the target id for this crawl.
    pub crawl_id: Box<String>,
    /// Extra information attached to the crawl.
    #[cfg(feature = "extra_information")]
    pub extra_info: Option<Box<String>>,
    // Pre-seeded HTML for the first page, when provided.
    seed_html: Option<String>,
    // Links already visited.
    links_visited: Box<ListBucket>,
    // Page content signatures already seen.
    signatures: Box<HashSet<u64>>,
    // Additional links discovered outside the normal crawl flow.
    extra_links: Box<HashSet<CaseInsensitiveString>>,
    // Collected pages, when page storage is enabled.
    pages: Option<Vec<Page>>,
    // robots.txt parser, when robots handling is enabled.
    robot_file_parser: Option<Box<RobotFileParser>>,
    // The crawl target URL.
    url: Box<CaseInsensitiveString>,
    // Parsed absolute form of `url`, when valid.
    domain_parsed: Option<Box<Url>>,
    // Broadcast channel streaming crawled pages to subscribers.
    channel: Option<(broadcast::Sender<Page>, Arc<broadcast::Receiver<Page>>)>,
    // Guard coordinating channel lifetime.
    channel_guard: Option<ChannelGuard>,
    // Broadcast channel for queueing extra URLs into the crawl.
    channel_queue: Option<(broadcast::Sender<String>, Arc<broadcast::Receiver<String>>)>,
    // Current crawl status.
    status: CrawlStatus,
    // Status code returned by the initial request.
    initial_status_code: StatusCode,
    // Anti-bot technology detected on the initial page.
    initial_anti_bot_tech: AntiBotTech,
    // HTML length of the initial page.
    initial_html_length: usize,
    // Whether the initial page triggered a WAF check.
    initial_page_waf_check: bool,
    // Whether the initial page should be retried.
    initial_page_should_retry: bool,
    // Stop flag for the crawl.
    shutdown: bool,
    // HTTP client used for requests.
    client: Option<Client>,
    // Optional pool of rotating clients.
    client_rotator: Option<Arc<ClientRotator>>,
    // sqlite handler used to spill crawl state to disk.
    #[cfg(feature = "disk")]
    sqlite: Option<Box<DatabaseHandler>>,
    // Whether disk spill via sqlite is enabled.
    #[cfg(feature = "disk")]
    enable_sqlite: bool,
    // Whether the subscription sender has been configured.
    send_configured: bool,
    // Meta information detected about the site.
    website_meta_info: WebsiteMetaInfo,
    // Skip handling of the initial page (see crawl entry points).
    skip_initial: bool,
    /// Shared cookie jar (cookies feature).
    #[cfg(feature = "cookies")]
    pub cookie_jar: Arc<reqwest::cookie::Jar>,
}
423
impl fmt::Debug for Website {
    /// Manual Debug impl: large or opaque fields are summarized as lengths
    /// and presence flags instead of dumping full contents.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let domain_str = self.domain_parsed.as_ref().map(|u| u.as_str().to_owned());
        let pages_len = self.pages.as_ref().map(|p| p.len()).unwrap_or(0);

        let mut ds = f.debug_struct("Website");

        ds.field("url", &self.url.as_ref())
            .field("crawl_id", &self.crawl_id)
            .field("domain_parsed", &domain_str)
            .field(
                "on_link_find_callback",
                &self.on_link_find_callback.is_some(),
            )
            .field(
                "on_should_crawl_callback",
                &self.on_should_crawl_callback.is_some(),
            )
            .field("status", &self.status)
            .field("shutdown", &self.shutdown)
            .field("extra_links_len", &self.extra_links.len())
            .field("signatures_len", &self.signatures.len())
            .field("pages_len", &pages_len)
            .field("channel_present", &self.channel.is_some())
            .field("channel_queue_present", &self.channel_queue.is_some())
            .field("client_present", &self.client.is_some())
            .field("initial_status_code", &self.initial_status_code)
            .field("initial_html_length", &self.initial_html_length)
            .field("initial_anti_bot_tech", &self.initial_anti_bot_tech)
            .field("initial_page_waf_check", &self.initial_page_waf_check)
            .field("initial_page_should_retry", &self.initial_page_should_retry)
            .field("send_configured", &self.send_configured)
            .field("website_meta_info", &self.website_meta_info)
            .field("skip_initial", &self.skip_initial);

        // Disk-only fields are appended conditionally.
        #[cfg(feature = "disk")]
        {
            ds.field("sqlite_present", &self.sqlite.is_some())
                .field("enable_sqlite", &self.enable_sqlite);
        }

        ds.finish()
    }
}
473
474impl Website {
    /// Internal constructor: normalizes the URL, parses the absolute domain,
    /// and optionally runs the firewall blocklist check.
    fn _new(url: &str, check_firewall: bool) -> Self {
        let url = url.trim();
        // Only rewrite the input when it is not already network-capable.
        let url: Box<CaseInsensitiveString> = if networking_capable(url) {
            CaseInsensitiveString::new(&url).into()
        } else {
            CaseInsensitiveString::new(&prepare_url(url)).into()
        };

        let domain_parsed: Option<Box<Url>> = parse_absolute_url(&url);
        let mut status = CrawlStatus::Start;

        if let Some(u) = &domain_parsed {
            // Mark firewalled domains up-front so crawls can bail early.
            if check_firewall && crate::utils::abs::block_website(&u) {
                status = CrawlStatus::FirewallBlocked;
            }
        }

        Self {
            configuration: Configuration::new().into(),
            status,
            domain_parsed,
            url,
            #[cfg(feature = "disk")]
            enable_sqlite: true,
            ..Default::default()
        }
    }
503
    /// Create a new website for `url`, running the firewall blocklist check.
    pub fn new(url: &str) -> Self {
        Website::_new(url, true)
    }

    /// Create a new website, optionally skipping the firewall blocklist check.
    pub fn new_with_firewall(url: &str, check_firewall: bool) -> Self {
        Website::_new(url, check_firewall)
    }
513
    /// Build a fresh database handler keyed by this website's target id.
    #[cfg(feature = "disk")]
    pub fn setup_database_handler(&self) -> Box<DatabaseHandler> {
        Box::new(DatabaseHandler::new(&Some(self.target_id())))
    }

    /// Attach an externally created (shared) database handler.
    #[cfg(feature = "disk")]
    pub fn setup_shared_db(&mut self, db: Box<DatabaseHandler>) {
        self.sqlite = Some(db)
    }

    /// Lazily initialize the sqlite handler if one is not already present.
    #[cfg(feature = "disk")]
    pub fn setup_sqlite(&mut self) {
        if self.sqlite.is_none() {
            self.sqlite = Some(self.setup_database_handler())
        }
    }
533
534 pub fn set_url(&mut self, url: &str) -> &mut Self {
536 let url = if url.starts_with(' ') || url.ends_with(' ') {
537 url.trim()
538 } else {
539 url
540 };
541
542 let domain: Box<CaseInsensitiveString> = if networking_capable(url) {
543 CaseInsensitiveString::new(&url).into()
544 } else {
545 CaseInsensitiveString::new(&prepare_url(&url)).into()
546 };
547
548 self.domain_parsed = parse_absolute_url(&domain);
549 self.url = domain;
550 self
551 }
552
    /// Replace the stored URL without re-parsing the absolute domain.
    pub fn set_url_only(&mut self, url: &str) -> &mut Self {
        self.url = CaseInsensitiveString::new(&url).into();
        self
    }

    /// Unique crawl target id: the crawl id concatenated with the raw URL.
    pub fn target_id(&self) -> String {
        string_concat!(self.crawl_id, self.url.inner())
    }
563
564 pub fn single_page(&self) -> bool {
566 match &self.configuration.inner_budget {
567 Some(b) => match b.get(&*WILD_CARD_PATH) {
568 Some(b) => b.eq(&1),
569 _ => false,
570 },
571 _ => false,
572 }
573 }
574
    /// Prepare the disk layer: lazily initialize sqlite when enabled and mark
    /// the handler as seeded for shared configurations.
    #[cfg(feature = "disk")]
    pub fn setup_disk(&mut self) {
        if self.enable_sqlite && self.sqlite.is_none() {
            self.setup_sqlite();
        }
        if self.configuration.shared {
            // Shared crawls reuse one database; treat it as already seeded.
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.seeded = true;
            }
        }
    }
589
590 #[cfg(feature = "disk")]
591 pub fn set_disk_persistance(&mut self, persist: bool) -> &mut Self {
593 if self.enable_sqlite {
594 if !self.sqlite.is_none() {
595 if let Some(sqlite) = self.sqlite.as_mut() {
596 sqlite.persist = persist;
597 }
598 }
599 }
600 self
601 }
602
    /// No-op when the `disk` feature is not enabled.
    #[cfg(not(feature = "disk"))]
    pub fn setup_disk(&mut self) {}
606
    /// Access the robots.txt parser, if one has been configured.
    pub fn get_robots_parser(&self) -> &Option<Box<RobotFileParser>> {
        &self.robot_file_parser
    }

    /// Whether the crawl determined the site requires JavaScript.
    pub fn get_requires_javascript(&self) -> bool {
        self.website_meta_info == WebsiteMetaInfo::RequiresJavascript
    }

    /// Meta information detected about the website.
    pub fn get_website_meta_info(&self) -> &WebsiteMetaInfo {
        &self.website_meta_info
    }
621
    /// Check the disk store: true when the URL has NOT been recorded yet.
    /// Permissive while the store is not ready (nothing persisted yet).
    #[cfg(feature = "disk")]
    pub async fn is_allowed_disk(&self, url_to_check: &str) -> bool {
        match &self.sqlite {
            Some(sqlite) => {
                if !sqlite.ready() {
                    true
                } else {
                    let db_pool = sqlite.get_db_pool().await;
                    // `allowed` here really means "exists"; invert it.
                    let allowed = sqlite.url_exists(db_pool, url_to_check).await;

                    !allowed
                }
            }
            _ => true,
        }
    }

    /// Disk feature disabled: every URL is allowed.
    #[cfg(not(feature = "disk"))]
    pub async fn is_allowed_disk(&self, _url_to_check: &str) -> bool {
        true
    }
645
    /// Check the disk store: true when the signature has NOT been recorded.
    /// Permissive while the store is not ready.
    #[cfg(feature = "disk")]
    pub async fn is_allowed_signature_disk(&self, signature_to_check: u64) -> bool {
        match &self.sqlite {
            Some(sqlite) => {
                if !sqlite.ready() {
                    true
                } else {
                    let db_pool = sqlite.get_db_pool().await;

                    !sqlite.signature_exists(db_pool, signature_to_check).await
                }
            }
            _ => true,
        }
    }

    /// Disk feature disabled: every signature is allowed.
    #[cfg(not(feature = "disk"))]
    pub async fn is_allowed_signature_disk(&self, _signature_to_check: u64) -> bool {
        true
    }
668
    /// Determine if a page signature has not been seen yet.
    ///
    /// NOTE(review): with `||`, a signature present in memory is still allowed
    /// when absent from disk — confirm the intended precedence, since memory
    /// and disk act as alternative stores elsewhere (cf. `insert_signature`).
    pub async fn is_signature_allowed(&self, signature: u64) -> bool {
        !self.signatures.contains(&signature) || self.is_allowed_signature_disk(signature).await
    }
673
    /// Clear the persisted table when the pool has been initialized.
    #[cfg(feature = "disk")]
    pub async fn clear_disk(&self) {
        if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                // Best-effort: errors clearing the table are ignored.
                let _ = DatabaseHandler::clear_table(sqlite.get_db_pool().await).await;
            }
        }
    }

    /// No-op without the `disk` feature.
    #[cfg(not(feature = "disk"))]
    pub async fn clear_disk(&self) {}
687
    /// Disk feature disabled: shared disk storage is never active.
    #[cfg(not(feature = "disk"))]
    pub(crate) fn shared_disk_enabled(&self) -> bool {
        false
    }

    /// True when the configuration is shared and a sqlite handler exists.
    #[cfg(feature = "disk")]
    pub(crate) fn shared_disk_enabled(&self) -> bool {
        self.configuration.shared && self.sqlite.is_some()
    }
699
    /// Persist a visited URL to the disk store.
    #[cfg(feature = "disk")]
    pub async fn insert_url_disk(&self, new_url: &str) {
        if let Some(sqlite) = &self.sqlite {
            sqlite.insert_url(sqlite.get_db_pool().await, new_url).await
        }
    }

    /// Persist a page signature to the disk store.
    #[cfg(feature = "disk")]
    pub async fn insert_signature_disk(&self, signature: u64) {
        if let Some(sqlite) = &self.sqlite {
            sqlite
                .insert_signature(sqlite.get_db_pool().await, signature)
                .await
        }
    }
717
    /// Record a visited link, spilling to disk under memory pressure.
    ///
    /// `mem_load` levels: 2 and 1 indicate rising global memory pressure —
    /// TODO confirm exact semantics against `detect_system::get_global_memory_state`.
    #[cfg(feature = "disk")]
    pub async fn insert_link(&mut self, new_url: CaseInsensitiveString) {
        let mem_load = crate::utils::detect_system::get_global_memory_state().await;
        let beyond_memory_limits = self.links_visited.len() >= *LINKS_VISITED_MEMORY_LIMIT;
        let seed_check = mem_load == 2 || mem_load == 1 || beyond_memory_limits;

        if seed_check {
            // First time pressure hits: move the in-memory set into sqlite.
            let mut seeded = false;
            if let Some(sqlite) = &self.sqlite {
                if !sqlite.ready() {
                    let _ = self.seed().await;
                    seeded = true;
                }
            }
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.set_seeded(seeded);
            }
        }

        if mem_load == 2 || beyond_memory_limits || self.shared_disk_enabled() {
            // Highest pressure, over the limit, or shared store: straight to disk.
            self.insert_url_disk(&new_url).await
        } else if mem_load == 1 {
            // Elevated pressure: keep only a small in-memory working set.
            if self.links_visited.len() <= 100 {
                self.links_visited.insert(new_url);
            } else {
                self.insert_url_disk(&new_url).await
            }
        } else {
            self.links_visited.insert(new_url);
        }
    }

    /// Record a visited link in memory (no disk feature).
    #[cfg(not(feature = "disk"))]
    pub async fn insert_link(&mut self, link: CaseInsensitiveString) {
        self.links_visited.insert(link);
    }
756
    /// Record a page signature, spilling to disk under memory pressure.
    /// Mirrors the policy in `insert_link`.
    #[cfg(feature = "disk")]
    pub async fn insert_signature(&mut self, new_signature: u64) {
        let mem_load = crate::utils::detect_system::get_global_memory_state().await;
        let beyond_memory_limits = self.signatures.len() >= *LINKS_VISITED_MEMORY_LIMIT;
        let seed_check = mem_load == 2 || mem_load == 1 || beyond_memory_limits;

        if seed_check {
            // First time pressure hits: move the in-memory set into sqlite.
            let mut seeded = false;
            if let Some(sqlite) = &self.sqlite {
                if !sqlite.ready() {
                    let _ = self.seed().await;
                    seeded = true;
                }
            }
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.set_seeded(seeded);
            }
        }

        if mem_load == 2 || beyond_memory_limits || self.shared_disk_enabled() {
            self.insert_signature_disk(new_signature).await
        } else if mem_load == 1 {
            // Elevated pressure: keep only a small in-memory working set.
            if self.signatures.len() <= 100 {
                self.signatures.insert(new_signature);
            } else {
                self.insert_signature_disk(new_signature).await
            }
        } else {
            self.signatures.insert(new_signature);
        }
    }

    /// Record a page signature in memory (no disk feature).
    #[cfg(not(feature = "disk"))]
    pub async fn insert_signature(&mut self, new_signature: u64) {
        self.signatures.insert(new_signature);
    }
795
    /// Move the in-memory visited set into sqlite; the memory set is cleared
    /// and repopulated with whatever the handler hands back.
    #[cfg(feature = "disk")]
    pub async fn seed(&mut self) -> Result<(), sqlx::Error> {
        let links = self.get_links();

        if let Some(sqlite) = &self.sqlite {
            if let Ok(links) = sqlite.seed(sqlite.get_db_pool().await, links).await {
                self.links_visited.clear();

                // Retain what the handler chose to keep in memory.
                for link in links {
                    self.links_visited.insert(link);
                }

                if let Some(sqlite) = self.sqlite.as_mut() {
                    sqlite.seeded = true;
                }
            }
        }

        Ok(())
    }
817
    /// Cooperatively poll the crawl control handle.
    ///
    /// Returns `false` when the crawl should stop (local shutdown flag set or
    /// the handle reads 2), `true` to continue. A handle value of 1 pauses:
    /// this loops on `interval.tick()` until the value changes. The provided
    /// `shutdown` future runs before reporting a stop.
    async fn handle_process<T>(
        &self,
        handle: &Option<Arc<AtomicI8>>,
        interval: &mut Interval,
        shutdown: T,
    ) -> bool
    where
        T: std::future::Future<Output = ()>,
    {
        if self.shutdown {
            (shutdown).await;
            false
        } else {
            match handle.as_ref() {
                Some(handle) => {
                    // 1 == paused: wait on the interval until resumed.
                    while handle.load(Ordering::Relaxed) == 1 {
                        interval.tick().await;
                    }
                    // 2 == shutdown requested via the control handle.
                    if handle.load(Ordering::Relaxed) == 2 {
                        (shutdown).await;
                        false
                    } else {
                        true
                    }
                }
                _ => true,
            }
        }
    }
848
849 #[inline]
858 #[cfg(not(feature = "regex"))]
859 pub fn is_allowed(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
860 let status = self.is_allowed_budgetless(link);
861
862 if status.eq(&ProcessLinkStatus::Allowed) {
863 if self.is_over_budget(link) {
864 return ProcessLinkStatus::BudgetExceeded;
865 }
866 }
867
868 status
869 }
870
871 #[inline]
880 #[cfg(feature = "regex")]
881 pub fn is_allowed(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
882 let status = self.is_allowed_budgetless(link);
883
884 if status.eq(&ProcessLinkStatus::Allowed) {
885 if self.is_over_budget(link) {
886 return ProcessLinkStatus::BudgetExceeded;
887 }
888 }
889 status
890 }
891
892 #[inline]
900 #[cfg(not(feature = "regex"))]
901 pub fn is_allowed_budgetless(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
902 if self.links_visited.contains(link) {
903 ProcessLinkStatus::Blocked
904 } else {
905 let status = self.is_allowed_default(link.inner());
906
907 if status.eq(&ProcessLinkStatus::Allowed) {
908 if self.is_over_depth(link) {
909 return ProcessLinkStatus::Blocked;
910 }
911 }
912
913 status
914 }
915 }
916
917 #[inline]
925 #[cfg(feature = "regex")]
926 pub fn is_allowed_budgetless(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
927 if self.links_visited.contains(link) {
928 ProcessLinkStatus::Blocked
929 } else {
930 let status = self.is_allowed_default(link);
931 if status.eq(&ProcessLinkStatus::Allowed) {
932 if self.is_over_depth(link) {
933 return ProcessLinkStatus::Blocked;
934 }
935 }
936 status
937 }
938 }
939
    /// Base allow check (regex build): whitelist, blacklist, and robots.txt.
    #[inline]
    #[cfg(feature = "regex")]
    pub fn is_allowed_default(&self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
        let blacklist = self.configuration.get_blacklist_compiled();
        let whitelist = self.configuration.get_whitelist_compiled();

        // A non-empty whitelist must match; a non-empty blacklist must not.
        let blocked_whitelist = !whitelist.is_empty() && !contains(&whitelist, link.inner());
        let blocked_blacklist = !blacklist.is_empty() && contains(&blacklist, link.inner());

        if blocked_whitelist || blocked_blacklist || !self.is_allowed_robots(&link.as_ref()) {
            ProcessLinkStatus::Blocked
        } else {
            ProcessLinkStatus::Allowed
        }
    }

    /// Base allow check (non-regex build): whitelist, blacklist, and robots.txt.
    #[inline]
    #[cfg(not(feature = "regex"))]
    pub fn is_allowed_default(&self, link: &CompactString) -> ProcessLinkStatus {
        let whitelist = self.configuration.get_whitelist_compiled();
        let blacklist = self.configuration.get_blacklist_compiled();

        // A non-empty whitelist must match; a non-empty blacklist must not.
        let blocked_whitelist = !whitelist.is_empty() && !contains(whitelist, link);
        let blocked_blacklist = !blacklist.is_empty() && contains(blacklist, link);

        if blocked_whitelist || blocked_blacklist || !self.is_allowed_robots(link) {
            ProcessLinkStatus::Blocked
        } else {
            ProcessLinkStatus::Allowed
        }
    }
981
    /// Check robots.txt rules for `link`; permissive when robots handling is
    /// disabled or no parser has been loaded.
    pub fn is_allowed_robots(&self, link: &str) -> bool {
        if self.configuration.respect_robots_txt {
            if let Some(r) = &self.robot_file_parser {
                return r.can_fetch(
                    // Fall back to the wildcard agent when none is configured.
                    match &self.configuration.user_agent {
                        Some(ua) => ua,
                        _ => "*",
                    },
                    link,
                );
            }
        }

        true
    }
1000
    /// True when the link's path depth exceeds `depth_distance`.
    pub(crate) fn is_over_inner_depth_budget(&mut self, link: &CaseInsensitiveString) -> bool {
        let mut over = false;

        if let Some(segments) = get_path_from_url(link)
            .strip_prefix('/')
            .map(|remainder| remainder.split('/'))
        {
            let mut depth: usize = 0;

            // Count segments, bailing out as soon as the limit is crossed.
            for _ in segments {
                depth = depth.saturating_add(1);
                if depth > self.configuration.depth_distance {
                    over = true;
                    break;
                }
            }
        }

        over
    }
1022
1023 #[cfg(feature = "sitemap")]
1025 pub(crate) fn is_over_wild_budget(
1026 &self,
1027 budget: &Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
1028 ) -> bool {
1029 let exceeded_wild_budget = if self.configuration.wild_card_budgeting {
1030 match budget {
1031 Some(budget) => match budget.get(&*WILD_CARD_PATH) {
1032 Some(budget) => {
1033 if budget.abs_diff(0) == 1 {
1034 true
1035 } else {
1036 false
1037 }
1038 }
1039 _ => false,
1040 },
1041 _ => false,
1042 }
1043 } else {
1044 false
1045 };
1046 exceeded_wild_budget
1047 }
1048
    /// Budget check that also decrements remaining budgets as links are
    /// admitted. Returns true when the link exceeds the wildcard budget, a
    /// per-path budget, or the configured depth.
    pub(crate) fn is_over_inner_budget(&mut self, link: &CaseInsensitiveString) -> bool {
        match self.configuration.inner_budget.as_mut() {
            Some(budget) => {
                // Wildcard ("*") budget: a remaining count of 1 means
                // exhausted; otherwise one unit is consumed here.
                let exceeded_wild_budget = if self.configuration.wild_card_budgeting {
                    match budget.get_mut(&*WILD_CARD_PATH) {
                        Some(budget) => {
                            if budget.abs_diff(0) == 1 {
                                true
                            } else {
                                *budget -= 1;
                                false
                            }
                        }
                        _ => false,
                    }
                } else {
                    false
                };

                // Only the wildcard entry exists: no per-path budgets to walk.
                let skip_paths = self.configuration.wild_card_budgeting && budget.len() == 1;
                let has_depth_control = self.configuration.depth_distance > 0;

                if !skip_paths && !exceeded_wild_budget {
                    let path_segments = get_path_from_url(link)
                        .strip_prefix('/')
                        .map(|remainder| remainder.split('/'));

                    match path_segments {
                        Some(segments) => {
                            // Accumulate the path prefix segment by segment
                            // and charge each matching budget entry.
                            let mut joint_segment = CaseInsensitiveString::default();
                            let mut over = false;
                            let mut depth: usize = 0;

                            for seg in segments {
                                if has_depth_control {
                                    depth = depth.saturating_add(1);
                                    if depth > self.configuration.depth_distance {
                                        over = true;
                                        break;
                                    }
                                }

                                joint_segment.push_str(seg);

                                if budget.contains_key(&joint_segment) {
                                    if let Some(budget) = budget.get_mut(&joint_segment) {
                                        // Exhausted entry: block; otherwise
                                        // consume one unit and keep walking.
                                        if budget.abs_diff(0) == 0 || *budget == 0 {
                                            over = true;
                                            break;
                                        } else {
                                            *budget -= 1;
                                            continue;
                                        }
                                    }
                                }
                            }

                            over
                        }
                        _ => false,
                    }
                } else {
                    exceeded_wild_budget
                }
            }
            _ => false,
        }
    }
1121
    /// Depth check, active only when `depth_distance` is configured.
    pub(crate) fn is_over_depth(&mut self, link: &CaseInsensitiveString) -> bool {
        self.configuration.depth_distance > 0 && self.is_over_inner_depth_budget(link)
    }

    /// Budget check wrapper (consumes budget units on success).
    pub(crate) fn is_over_budget(&mut self, link: &CaseInsensitiveString) -> bool {
        self.is_over_inner_budget(link)
    }

    /// Number of links currently held in memory.
    pub fn size(&self) -> usize {
        self.links_visited.len()
    }
1136
    /// Total number of visited links (memory only without the disk feature).
    #[cfg(not(feature = "disk"))]
    pub async fn get_size(&self) -> usize {
        self.links_visited.len()
    }

    /// Total visited links across the disk store and memory.
    #[cfg(feature = "disk")]
    pub async fn get_size(&self) -> usize {
        let disk_count = if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                let disk_count = DatabaseHandler::count_records(sqlite.get_db_pool().await).await;
                let disk_count = disk_count.unwrap_or_default() as usize;
                disk_count
            } else {
                0
            }
        } else {
            0
        };

        let mut mem_count = self.links_visited.len();

        // NOTE(review): assumes links beyond the memory limit were spilled to
        // disk and are covered by `disk_count` — confirm no double counting.
        if mem_count >= *LINKS_VISITED_MEMORY_LIMIT {
            mem_count -= *LINKS_VISITED_MEMORY_LIMIT;
        }

        disk_count + mem_count
    }
1166
    /// Drain and return extra links gathered during the crawl.
    pub fn drain_extra_links(&mut self) -> hashbrown::hash_set::Drain<'_, CaseInsensitiveString> {
        self.extra_links.drain()
    }

    /// Store the HTTP status code from the initial request.
    pub fn set_initial_status_code(&mut self, initial_status_code: StatusCode) {
        self.initial_status_code = initial_status_code;
    }

    /// Status code of the initial request.
    pub fn get_initial_status_code(&self) -> &StatusCode {
        &self.initial_status_code
    }

    /// Store the HTML length of the initial page.
    pub fn set_initial_html_length(&mut self, initial_html_length: usize) {
        self.initial_html_length = initial_html_length;
    }

    /// HTML length of the initial page.
    pub fn get_initial_html_length(&self) -> usize {
        self.initial_html_length
    }

    /// Store the anti-bot technology detected on the initial page.
    pub fn set_initial_anti_bot_tech(&mut self, initial_anti_bot_tech: AntiBotTech) {
        self.initial_anti_bot_tech = initial_anti_bot_tech;
    }

    /// Anti-bot technology detected on the initial page.
    pub fn get_initial_anti_bot_tech(&self) -> &AntiBotTech {
        &self.initial_anti_bot_tech
    }

    /// Store whether the initial page triggered a WAF check.
    pub fn set_initial_page_waf_check(&mut self, initial_page_waf_check: bool) {
        self.initial_page_waf_check = initial_page_waf_check;
    }

    /// Whether the initial page triggered a WAF check.
    pub fn get_initial_page_waf_check(&self) -> bool {
        self.initial_page_waf_check
    }

    /// Store whether the initial page should be retried.
    pub fn set_initial_page_should_retry(&mut self, initial_page_should_retry: bool) {
        self.initial_page_should_retry = initial_page_should_retry;
    }

    /// Whether the initial page should be retried.
    pub fn get_initial_page_should_retry(&self) -> bool {
        self.initial_page_should_retry
    }
1221
    /// Drain visited links as interner symbols (string-interner backends).
    #[cfg(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    ))]
    pub fn drain_links(
        &mut self,
    ) -> hashbrown::hash_set::Drain<'_, string_interner::symbol::SymbolUsize> {
        self.links_visited.drain()
    }

    /// Drain visited links (no interner backend).
    #[cfg(not(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    )))]
    pub fn drain_links(&mut self) -> hashbrown::hash_set::Drain<'_, CaseInsensitiveString> {
        self.links_visited.drain()
    }

    /// Drain stored page signatures (string-interner backends).
    #[cfg(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    ))]
    pub fn drain_signatures(&mut self) -> hashbrown::hash_set::Drain<'_, u64> {
        self.signatures.drain()
    }

    /// Drain stored page signatures (no interner backend).
    #[cfg(not(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    )))]
    pub fn drain_signatures(&mut self) -> hashbrown::hash_set::Drain<'_, u64> {
        self.signatures.drain()
    }
1263
    /// Merge extra links into the set and return the combined set.
    pub fn set_extra_links(
        &mut self,
        extra_links: HashSet<CaseInsensitiveString>,
    ) -> &HashSet<CaseInsensitiveString> {
        self.extra_links.extend(extra_links);
        &self.extra_links
    }

    /// Extra links gathered outside the normal crawl flow.
    pub fn get_extra_links(&self) -> &HashSet<CaseInsensitiveString> {
        &self.extra_links
    }

    /// Clear both the in-memory state and the disk store.
    pub async fn clear_all(&mut self) {
        self.clear();
        self.clear_disk().await;
    }

    /// Clear in-memory crawl state (links, signatures, pages, extras).
    pub fn clear(&mut self) {
        self.links_visited.clear();
        self.signatures.clear();
        self.pages.take();
        self.extra_links.clear();
    }
1291
    /// The configured HTTP client, if one has been built.
    pub fn get_client(&self) -> &Option<Client> {
        &self.client
    }

    /// Pages collected when page storage is enabled.
    pub fn get_pages(&self) -> Option<&Vec<Page>> {
        self.pages.as_ref()
    }
1301
    /// Disk feature disabled: no persisted links.
    #[cfg(not(feature = "disk"))]
    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
        Default::default()
    }

    /// All links persisted in the disk store (empty when unavailable or on error).
    #[cfg(feature = "disk")]
    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
        if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                if let Ok(links) =
                    DatabaseHandler::get_all_resources(sqlite.get_db_pool().await).await
                {
                    links
                } else {
                    Default::default()
                }
            } else {
                Default::default()
            }
        } else {
            Default::default()
        }
    }
1327
    /// Union of disk-persisted and in-memory visited links.
    #[cfg(feature = "disk")]
    pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
        let mut l = self.get_links_disk().await;
        let m = self.links_visited.get_links();

        l.extend(m);

        l
    }

    /// All visited links (memory only without the disk feature).
    #[cfg(not(feature = "disk"))]
    pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
        self.get_links()
    }

    /// Snapshot of the in-memory visited links.
    pub fn get_links(&self) -> HashSet<CaseInsensitiveString> {
        self.links_visited.get_links()
    }
1349
    /// Parsed absolute URL of the crawl target, when parsing succeeded.
    pub fn get_url_parsed(&self) -> &Option<Box<Url>> {
        &self.domain_parsed
    }

    /// Raw crawl target URL.
    pub fn get_url(&self) -> &CaseInsensitiveString {
        &self.url
    }

    /// Configured delay between requests.
    pub fn get_delay(&self) -> Duration {
        Duration::from_millis(self.configuration.delay)
    }
1364
1365 pub fn get_status(&self) -> &CrawlStatus {
1367 &self.status
1368 }
1369
    /// Set the crawl status and return a reference to the stored value.
    pub fn set_status(&mut self, status: CrawlStatus) -> &CrawlStatus {
        self.status = status;
        &self.status
    }
1375
    /// Reset the crawl status back to `CrawlStatus::Start`.
    pub fn reset_status(&mut self) -> &CrawlStatus {
        self.status = CrawlStatus::Start;
        &self.status
    }
1381
    /// Mark the crawl as `Active` so a subsequent `setup()` skips the
    /// `clear_all` reset, preserving visited links across crawls.
    pub fn persist_links(&mut self) -> &mut Self {
        self.status = CrawlStatus::Active;
        self
    }
1388
1389 pub fn get_absolute_path(&self, domain: Option<&str>) -> Option<Url> {
1391 if domain.is_some() {
1392 url::Url::parse(domain.unwrap_or_default())
1393 .ok()
1394 .map(|mut url| {
1395 if let Ok(mut path) = url.path_segments_mut() {
1396 path.clear();
1397 }
1398 url
1399 })
1400 } else if let Some(mut d) = self.domain_parsed.as_deref().cloned() {
1401 if let Ok(mut path) = d.path_segments_mut() {
1402 path.clear();
1403 }
1404 Some(d)
1405 } else {
1406 None
1407 }
1408 }
1409
    /// Signal the crawl to shut down.
    pub fn stop(&mut self) {
        self.shutdown = true;
    }
1414
    /// Clear the shutdown flag so crawling may proceed.
    pub fn start(&mut self) {
        self.shutdown = false;
    }
1419
    /// Fetch and apply robots.txt rules when `respect_robots_txt` is set.
    ///
    /// Reads the robots file for the parsed domain (falling back to the raw
    /// URL), appending a trailing slash when missing, and applies any
    /// crawl-delay for the configured user agent capped at 60,000 ms.
    pub async fn configure_robots_parser(&mut self, client: &Client) {
        if self.configuration.respect_robots_txt {
            let robot_file_parser = self
                .robot_file_parser
                .get_or_insert_with(RobotFileParser::new);

            // NOTE(review): `mtime() <= 4000` appears to gate re-reading an
            // already-fetched robots.txt — confirm the sentinel semantics.
            if robot_file_parser.mtime() <= 4000 {
                let host_str = match &self.domain_parsed {
                    Some(domain) => domain.as_str(),
                    _ => self.url.inner(),
                };

                if !host_str.is_empty() {
                    if host_str.ends_with('/') {
                        robot_file_parser.read(&client, host_str).await;
                    } else {
                        robot_file_parser
                            .read(&client, &string_concat!(host_str, "/"))
                            .await;
                    }
                }
                // Honor the site's requested crawl-delay, capped at 60s.
                if let Some(delay) =
                    robot_file_parser.get_crawl_delay(&self.configuration.user_agent)
                {
                    self.configuration.delay = delay.as_millis().min(60000) as u64;
                }
            }
        }
    }
1450
    /// Build a strict redirect policy bound to the crawl's host.
    ///
    /// Redirects are followed when they stay on the same host (or, per the
    /// `subdomains`/`tld` configuration, the same parent domain / registered
    /// domain name). A small number of initial off-host redirects are also
    /// allowed (2 when respecting robots.txt, else 1); anything beyond the
    /// configured redirect limit errors, and other off-host redirects stop.
    /// Falls back to the default policy when the domain never parsed.
    pub fn setup_strict_policy(&self) -> Policy {
        use crate::client::redirect::Attempt;
        use crate::page::domain_name;
        use std::sync::atomic::AtomicU8;

        let default_policy = Policy::default();

        match self.domain_parsed.as_deref().cloned() {
            Some(host_s) => {
                // Robots fetches may themselves redirect once, so allow an
                // extra initial hop in that mode.
                let initial_redirect_limit = if self.configuration.respect_robots_txt {
                    2
                } else {
                    1
                };
                let subdomains = self.configuration.subdomains;
                let tld = self.configuration.tld;
                let host_domain_name = if tld {
                    domain_name(&host_s).to_string()
                } else {
                    Default::default()
                };
                let redirect_limit = *self.configuration.redirect_limit;

                let custom_policy = {
                    // Counts the off-host redirects taken at crawl start.
                    let initial_redirect = Arc::new(AtomicU8::new(0));

                    move |attempt: Attempt| {
                        if tld && domain_name(attempt.url()) == host_domain_name
                            || subdomains
                                && attempt
                                    .url()
                                    .host_str()
                                    .unwrap_or_default()
                                    .ends_with(host_s.host_str().unwrap_or_default())
                            || attempt.url().host() == host_s.host()
                        {
                            // In-scope target: defer to the default policy.
                            default_policy.redirect(attempt)
                        } else if attempt.previous().len() > redirect_limit {
                            attempt.error("too many redirects")
                        } else if attempt.status().is_redirection()
                            && (0..initial_redirect_limit)
                                .contains(&initial_redirect.load(Ordering::Relaxed))
                        {
                            initial_redirect.fetch_add(1, Ordering::Relaxed);
                            default_policy.redirect(attempt)
                        } else {
                            attempt.stop()
                        }
                    }
                };
                Policy::custom(custom_policy)
            }
            _ => default_policy,
        }
    }
1507
    /// Map the configured `RedirectPolicy` to a concrete client policy.
    pub fn setup_redirect_policy(&self) -> Policy {
        match self.configuration.redirect_policy {
            RedirectPolicy::Loose => Policy::limited(*self.configuration.redirect_limit),
            RedirectPolicy::None => Policy::none(),
            RedirectPolicy::Strict => self.setup_strict_policy(),
        }
    }
1516
    /// Rebuild the configuration's header map when `modify_headers` is set.
    ///
    /// Extends the headers via `header_utils::extend_headers`, lifts any
    /// `Referer` header out of the map into `configuration.referer` (when
    /// not already set), and stores the result back on the configuration.
    pub fn configure_headers(&mut self) {
        let mut headers: reqwest::header::HeaderMap = reqwest::header::HeaderMap::new();

        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        if self.configuration.modify_headers {
            crate::utils::header_utils::extend_headers(
                &mut headers,
                user_agent,
                &self.configuration.headers,
                &None,
                &self.configuration.viewport,
                &self.domain_parsed,
            );

            if !headers.is_empty() {
                // Move a Referer header into the dedicated config slot so the
                // client can manage it, rather than sending it on every request.
                if let Some(referer) = headers.remove(REFERER) {
                    if let Ok(v) = referer.to_str() {
                        if self.configuration.referer.is_none() && !v.is_empty() {
                            self.configuration.referer = Some(v.into())
                        }
                    }
                }
                self.configuration
                    .headers
                    .replace(Box::new(SerializableHeaderMap::from(headers)));
            }
        }
    }
1552
    /// Build the base `reqwest` client builder shared by all HTTP client
    /// setups: redirect policy, lenient HTTP/1 response parsing, connect and
    /// read timeouts, optional interface / local-address binding, proxy
    /// keepalive, a user-agent fallback, and HTTP/2 prior knowledge.
    #[cfg(all(not(feature = "wreq"), not(feature = "decentralized")))]
    pub fn configure_base_client(&self) -> ClientBuilder {
        let policy = self.setup_redirect_policy();

        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        // Only apply a default user agent when the custom headers don't
        // already supply one (checked under both header-name spellings).
        let missing_agent = match &self.configuration.headers {
            Some(headers) => {
                !headers.contains_key(crate::client::header::USER_AGENT)
                    && !headers.contains_key("User-Agent")
            }
            _ => true,
        };

        // Proxied traffic gets doubled connect/read timeouts.
        let timeout_mult = if self.configuration.proxies.is_some() {
            2
        } else {
            1
        };

        let client = reqwest::Client::builder()
            .redirect(policy)
            .http09_responses()
            .http1_ignore_invalid_headers_in_responses(true)
            // Auto-referer only when no explicit referer was configured.
            .referer(self.configuration.referer.is_none())
            .connect_timeout(
                self.configuration
                    .default_http_connect_timeout
                    .unwrap_or(Duration::from_secs(24 * timeout_mult)),
            )
            .read_timeout(
                self.configuration
                    .default_http_read_timeout
                    .unwrap_or(Duration::from_secs(42 * timeout_mult)),
            )
            .http1_title_case_headers()
            .http1_allow_obsolete_multiline_headers_in_responses(true)
            .http1_allow_spaces_after_header_name_in_responses(true)
            .danger_accept_invalid_certs(self.configuration.accept_invalid_certs);

        let client = if let Some(network_interface) = &self.configuration.network_interface {
            set_interface(client, &network_interface)
        } else {
            client
        };

        let client = if let Some(local_address) = &self.configuration.local_address {
            client.local_address(*local_address)
        } else {
            client
        };

        // Keep proxied connections alive to amortize tunnel setup.
        let client = if self.configuration.proxies.is_none() {
            client
        } else {
            client.tcp_keepalive(Duration::from_secs(30))
        };

        let client = if missing_agent {
            client.user_agent(user_agent)
        } else {
            client
        };

        let client = if self.configuration.http2_prior_knowledge {
            client.http2_prior_knowledge()
        } else {
            client
        };

        crate::utils::header_utils::setup_default_headers(client, &self.configuration)
    }
1634
    /// `wreq` variant of the base client builder: redirect policy, timeouts,
    /// local-address binding, proxy keepalive, user-agent fallback, and
    /// optional browser emulation.
    #[cfg(all(feature = "wreq", not(feature = "decentralized")))]
    pub fn configure_base_client(&self) -> ClientBuilder {
        let policy = self.setup_redirect_policy();

        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        // Only apply a default user agent when the custom headers don't
        // already supply one (checked under both header-name spellings).
        let missing_agent = match &self.configuration.headers {
            Some(headers) => {
                !headers.contains_key(crate::client::header::USER_AGENT)
                    && !headers.contains_key("User-Agent")
            }
            _ => true,
        };

        // Proxied traffic gets doubled connect/read timeouts.
        let timeout_mult = if self.configuration.proxies.is_some() {
            2
        } else {
            1
        };

        let client = Client::builder()
            .redirect(policy)
            .referer(self.configuration.referer.is_none())
            .connect_timeout(
                self.configuration
                    .default_http_connect_timeout
                    .unwrap_or(Duration::from_secs(24 * timeout_mult)),
            )
            .read_timeout(
                self.configuration
                    .default_http_read_timeout
                    .unwrap_or(Duration::from_secs(42 * timeout_mult)),
            );

        let client = if let Some(local_address) = &self.configuration.local_address {
            client.local_address(*local_address)
        } else {
            client
        };

        // Keep proxied connections alive to amortize tunnel setup.
        let client = if self.configuration.proxies.is_none() {
            client
        } else {
            client.tcp_keepalive(Duration::from_secs(30))
        };

        let client = if missing_agent {
            client.user_agent(user_agent)
        } else {
            client
        };

        let client = if let Some(emulation) = self.configuration.emulation {
            client.emulation(emulation)
        } else {
            client
        };

        crate::utils::header_utils::setup_default_headers(client, &self.configuration)
    }
1699
    /// Assemble the full HTTP client builder (no response caching): base
    /// client, request timeout, configured proxies (with socks handling on
    /// Linux), optional spider-cloud proxy auth, background-connect layer,
    /// concurrency limiting, and cookie support.
    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
    pub fn configure_http_client_builder(&self) -> ClientBuilder {
        let client = self.configure_base_client();

        let mut client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let client = match &self.configuration.proxies {
            Some(proxies) => {
                let linux = cfg!(target_os = "linux");
                // With multiple proxies on Linux, skip plain socks entries;
                // with a single socks proxy, rewrite it to http instead.
                let ignore_plain_socks = proxies.len() >= 2 && linux;
                let replace_plain_socks = proxies.len() == 1 && linux;

                for proxie in proxies.iter() {
                    if proxie.ignore == crate::configuration::ProxyIgnore::Http {
                        continue;
                    }

                    let proxie = &proxie.addr;
                    let socks = proxie.starts_with("socks://");

                    if ignore_plain_socks && socks {
                        continue;
                    }

                    if replace_plain_socks && socks {
                        if let Ok(proxy) =
                            crate::client::Proxy::all(&proxie.replacen("socks://", "http://", 1))
                        {
                            client = client.proxy(proxy);
                        }
                    } else {
                        if let Ok(proxy) = crate::client::Proxy::all(proxie) {
                            client = client.proxy(proxy);
                        }
                    }
                }

                client
            }
            _ => client,
        };

        #[cfg(feature = "spider_cloud")]
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (crate::client::Proxy::all(&sc.proxy_url), reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key))) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        self.configure_http_client_cookies(client)
    }
1778
    /// Caching variant of the HTTP client builder: same proxy/cookie/layer
    /// setup as the non-cached path, then wraps the client in middleware and
    /// attaches an HTTP cache whose key incorporates the request method and
    /// any `authorization` bearer token.
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    pub fn configure_http_client_builder(&self) -> reqwest_middleware::ClientBuilder {
        use crate::utils::create_cache_key;
        let client = self.configure_base_client();

        let mut client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let client = match &self.configuration.proxies {
            Some(proxies) => {
                let linux = cfg!(target_os = "linux");
                // With multiple proxies on Linux, skip plain socks entries;
                // with a single socks proxy, rewrite it to http instead.
                let ignore_plain_socks = proxies.len() >= 2 && linux;
                let replace_plain_socks = proxies.len() == 1 && linux;

                for proxie in proxies.iter() {
                    if proxie.ignore == crate::configuration::ProxyIgnore::Http {
                        continue;
                    }
                    let proxie = &proxie.addr;

                    let socks = proxie.starts_with("socks://");

                    if ignore_plain_socks && socks {
                        continue;
                    }

                    if replace_plain_socks && socks {
                        if let Ok(proxy) =
                            crate::client::Proxy::all(&proxie.replacen("socks://", "http://", 1))
                        {
                            client = client.proxy(proxy);
                        }
                    } else {
                        if let Ok(proxy) = crate::client::Proxy::all(proxie) {
                            client = client.proxy(proxy);
                        }
                    }
                }

                client
            }
            _ => client,
        };

        #[cfg(feature = "spider_cloud")]
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (crate::client::Proxy::all(&sc.proxy_url), reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key))) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = self.configure_http_client_cookies(client);

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        // SAFETY: assumes the assembled builder configuration is always
        // valid so `build()` cannot fail.
        let client =
            reqwest_middleware::ClientBuilder::new(unsafe { client.build().unwrap_unchecked() });

        if self.configuration.cache {
            let mut cache_options = HttpCacheOptions::default();

            // Key cache entries by method plus any bearer token so users
            // with different credentials never share cached responses.
            cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
                let mut auth_token = None;
                if let Some(auth) = req.headers.get("authorization") {
                    if let Ok(token) = auth.to_str() {
                        if !token.is_empty() {
                            auth_token = Some(token);
                        }
                    }
                }
                create_cache_key(req, Some(req.method.as_str()), auth_token)
            }));
            client.with(Cache(HttpCache {
                mode: CacheMode::Default,
                manager: CACACHE_MANAGER.clone(),
                options: cache_options,
            }))
        } else {
            client
        }
    }
1884
    /// Attach the shared cookie jar to the client builder and pre-seed it
    /// with any configured cookie string scoped to the crawl domain.
    #[cfg(all(not(feature = "decentralized"), feature = "cookies"))]
    pub fn configure_http_client_cookies(
        &self,
        client: crate::client::ClientBuilder,
    ) -> crate::client::ClientBuilder {
        let client = client.cookie_provider(self.cookie_jar.clone());

        if !self.configuration.cookie_str.is_empty() {
            if let Some(url) = self.domain_parsed.as_ref() {
                self.cookie_jar
                    .add_cookie_str(&self.configuration.cookie_str, url);
            }
        }

        client
    }
1902
    /// No-op cookie setup when the `cookies` feature is disabled.
    #[cfg(all(not(feature = "decentralized"), not(feature = "cookies")))]
    pub fn configure_http_client_cookies(
        &self,
        client: crate::client::ClientBuilder,
    ) -> crate::client::ClientBuilder {
        client
    }
1911
    /// Replace the crawl's HTTP client with a caller-supplied one.
    pub fn set_http_client(&mut self, client: Client) -> &Option<Client> {
        self.client = Some(client);
        &self.client
    }
1917
    /// Build a standalone client bound to exactly one proxy (used for
    /// client rotation). Returns `None` when the proxy opts out of HTTP or
    /// its address fails to parse. Socks addresses are rewritten to http on
    /// Linux, mirroring the multi-proxy builder.
    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
    fn build_single_proxy_client(
        &self,
        proxy: &crate::configuration::RequestProxy,
    ) -> Option<Client> {
        if proxy.ignore == crate::configuration::ProxyIgnore::Http {
            return None;
        }

        let client = self.configure_base_client();

        let client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let addr = &proxy.addr;
        let linux = cfg!(target_os = "linux");
        let socks = addr.starts_with("socks://");

        let client = if socks && linux {
            match crate::client::Proxy::all(&addr.replacen("socks://", "http://", 1)) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        } else {
            match crate::client::Proxy::all(addr) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        };

        #[cfg(feature = "spider_cloud")]
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => {
                        client.proxy(proxy.custom_http_auth(auth_value))
                    }
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => client
                .connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit)),
            _ => client,
        };

        let client = self.configure_http_client_cookies(client);
        // SAFETY: assumes the assembled builder configuration is always
        // valid so `build()` cannot fail.
        unsafe { Some(client.build().unwrap_unchecked()) }
    }
1985
    /// Caching variant of the single-proxy client builder: same proxy and
    /// layer setup, then wraps the client in middleware with an HTTP cache
    /// keyed by method and any `authorization` bearer token.
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    fn build_single_proxy_client(
        &self,
        proxy: &crate::configuration::RequestProxy,
    ) -> Option<Client> {
        use crate::utils::create_cache_key;

        if proxy.ignore == crate::configuration::ProxyIgnore::Http {
            return None;
        }

        let client = self.configure_base_client();

        let client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let addr = &proxy.addr;
        let linux = cfg!(target_os = "linux");
        let socks = addr.starts_with("socks://");

        // Rewrite socks proxies to http on Linux, as in the builder paths.
        let client = if socks && linux {
            match crate::client::Proxy::all(&addr.replacen("socks://", "http://", 1)) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        } else {
            match crate::client::Proxy::all(addr) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        };

        #[cfg(feature = "spider_cloud")]
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => {
                        client.proxy(proxy.custom_http_auth(auth_value))
                    }
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = self.configure_http_client_cookies(client);

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => client
                .connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit)),
            _ => client,
        };

        // SAFETY: assumes the assembled builder configuration is always
        // valid so `build()` cannot fail.
        let client =
            reqwest_middleware::ClientBuilder::new(unsafe { client.build().unwrap_unchecked() });

        if self.configuration.cache {
            let mut cache_options = HttpCacheOptions::default();

            // Key cache entries by method plus any bearer token so users
            // with different credentials never share cached responses.
            cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
                let mut auth_token = None;
                if let Some(auth) = req.headers.get("authorization") {
                    if let Ok(token) = auth.to_str() {
                        if !token.is_empty() {
                            auth_token = Some(token);
                        }
                    }
                }
                create_cache_key(req, Some(req.method.as_str()), auth_token)
            }));

            Some(
                client
                    .with(Cache(HttpCache {
                        mode: CacheMode::Default,
                        manager: CACACHE_MANAGER.clone(),
                        options: cache_options,
                    }))
                    .build(),
            )
        } else {
            Some(client.build())
        }
    }
2085
    /// Build one client per configured proxy for rotation.
    ///
    /// Returns `None` unless at least two proxies are configured and at
    /// least two per-proxy clients could be built.
    #[cfg(not(feature = "decentralized"))]
    fn build_rotated_clients(&self) -> Option<Arc<ClientRotator>> {
        let proxies = self.configuration.proxies.as_ref()?;
        if proxies.len() < 2 {
            return None;
        }
        let clients: Vec<Client> = proxies
            .iter()
            .filter_map(|proxy| self.build_single_proxy_client(proxy))
            .collect();
        if clients.len() < 2 {
            return None;
        }
        Some(Arc::new(ClientRotator::new(clients)))
    }
2102
    /// Finalize the HTTP client from the configured builder (no caching).
    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
    pub fn configure_http_client(&self) -> Client {
        let client = self.configure_http_client_builder();
        // SAFETY: assumes the assembled builder configuration is always
        // valid so `build()` cannot fail.
        unsafe { client.build().unwrap_unchecked() }
    }
2110
    /// Finalize the HTTP client from the configured middleware builder.
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    pub fn configure_http_client(&self) -> Client {
        let client = self.configure_http_client_builder();
        client.build()
    }
2117
    /// Build the HTTP client for decentralized crawling: routes requests
    /// through the registered `WORKERS` as proxies, signals the domain-scope
    /// mode via a numeric `REFERER` header, and pins the target host via the
    /// `HOST` header.
    #[cfg(all(feature = "decentralized", not(feature = "cache_request")))]
    pub fn configure_http_client(&self) -> Client {
        use reqwest::header::{HeaderMap, HeaderValue};

        let mut headers = HeaderMap::new();

        let policy = self.setup_redirect_policy();

        let mut client = Client::builder()
            .user_agent(match &self.configuration.user_agent {
                Some(ua) => ua.as_str(),
                _ => &get_ua(self.configuration.only_chrome_agent()),
            })
            .redirect(policy)
            .tcp_keepalive(Duration::from_millis(500));

        // Scope flag sent to workers: 2 = tld, 1 = subdomains, 0 = neither.
        // NOTE(review): the first two arms both yield 2, so the tld branch
        // dominates regardless of `subdomains` — confirm that is intended.
        let referer = if self.configuration.tld && self.configuration.subdomains {
            2
        } else if self.configuration.tld {
            2
        } else if self.configuration.subdomains {
            1
        } else {
            0
        };

        if referer > 0 {
            // NOTE(review): inserts the numeric flag as the REFERER value —
            // presumably decoded by the worker; verify against worker code.
            headers.insert(reqwest::header::REFERER, HeaderValue::from(referer));
        }

        if let Some(h) = &self.configuration.headers {
            headers.extend(h.inner().clone());
        }

        if let Some(domain_url) = self.get_absolute_path(None) {
            let domain_url = domain_url.as_str();
            // Strip the trailing slash the root URL carries.
            let domain_host = if domain_url.ends_with("/") {
                &domain_url[0..domain_url.len() - 1]
            } else {
                domain_url
            };
            if let Ok(value) = HeaderValue::from_str(domain_host) {
                headers.insert(reqwest::header::HOST, value);
            }
        }

        // Every registered worker acts as a proxy for outbound requests.
        for worker in WORKERS.iter() {
            if let Ok(worker) = crate::client::Proxy::all(worker) {
                client = client.proxy(worker);
            }
        }

        if !self.configuration.modify_headers && self.configuration.modify_http_client_headers {
            if let Some(ua) = &self.configuration.user_agent {
                crate::utils::header_utils::extend_headers(
                    &mut headers,
                    ua,
                    &self.configuration.headers,
                    &None,
                    &self.configuration.viewport,
                    &self.domain_parsed,
                );
            }
        }

        // SAFETY: assumes the assembled builder configuration is always
        // valid so `build()` cannot fail.
        unsafe {
            match &self.configuration.request_timeout {
                Some(t) => client.timeout(**t),
                _ => client,
            }
            .default_headers(headers)
            .build()
            .unwrap_unchecked()
        }
    }
2196
    /// Caching variant of the decentralized client: same worker-proxy,
    /// REFERER-flag, and HOST-header setup, wrapped in middleware with an
    /// HTTP cache keyed by method and any `authorization` bearer token.
    #[cfg(all(feature = "decentralized", feature = "cache_request"))]
    pub fn configure_http_client(&mut self) -> Client {
        use crate::utils::create_cache_key;
        use reqwest::header::{HeaderMap, HeaderValue};
        use reqwest_middleware::ClientBuilder;

        let mut headers = HeaderMap::new();

        let policy = self.setup_redirect_policy();

        let mut client = reqwest::Client::builder()
            .user_agent(match &self.configuration.user_agent {
                Some(ua) => ua.as_str(),
                _ => &get_ua(self.configuration.only_chrome_agent()),
            })
            .redirect(policy)
            .tcp_keepalive(Duration::from_millis(500));

        // Scope flag sent to workers: 2 = tld, 1 = subdomains, 0 = neither.
        // NOTE(review): the first two arms both yield 2, so the tld branch
        // dominates regardless of `subdomains` — confirm that is intended.
        let referer = if self.configuration.tld && self.configuration.subdomains {
            2
        } else if self.configuration.tld {
            2
        } else if self.configuration.subdomains {
            1
        } else {
            0
        };

        if referer > 0 {
            // NOTE(review): inserts the numeric flag as the REFERER value —
            // presumably decoded by the worker; verify against worker code.
            headers.insert(reqwest::header::REFERER, HeaderValue::from(referer));
        }

        if let Some(h) = &self.configuration.headers {
            headers.extend(h.inner().clone());
        }

        if let Some(domain_url) = self.get_absolute_path(None) {
            let domain_url = domain_url.as_str();
            // Strip the trailing slash the root URL carries.
            let domain_host = if domain_url.ends_with("/") {
                &domain_url[0..domain_url.len() - 1]
            } else {
                domain_url
            };
            if let Ok(value) = HeaderValue::from_str(domain_host) {
                headers.insert(reqwest::header::HOST, value);
            }
        }

        // Every registered worker acts as a proxy for outbound requests.
        for worker in WORKERS.iter() {
            if let Ok(worker) = crate::client::Proxy::all(worker) {
                client = client.proxy(worker);
            }
        }

        let mut cache_options = HttpCacheOptions::default();

        // Key cache entries by method plus any bearer token so users with
        // different credentials never share cached responses.
        cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
            let mut auth_token = None;
            if let Some(auth) = req.headers.get("authorization") {
                if let Ok(token) = auth.to_str() {
                    if !token.is_empty() {
                        auth_token = Some(token);
                    }
                }
            }
            create_cache_key(req, Some(req.method.as_str()), auth_token)
        }));

        if !self.configuration.modify_headers && self.configuration.modify_http_client_headers {
            if let Some(ua) = &self.configuration.user_agent {
                crate::utils::header_utils::extend_headers(
                    &mut headers,
                    ua,
                    &self.configuration.headers,
                    &None,
                    &self.configuration.viewport,
                    &self.domain_parsed,
                );
            }
        }

        // SAFETY: assumes the assembled builder configuration is always
        // valid so `build()` cannot fail.
        let client = ClientBuilder::new(unsafe {
            match &self.configuration.request_timeout {
                Some(t) => client.timeout(**t),
                _ => client,
            }
            .default_headers(headers)
            .build()
            .unwrap_unchecked()
        })
        .with(Cache(HttpCache {
            mode: CacheMode::Default,
            manager: CACACHE_MANAGER.clone(),
            options: cache_options,
        }));

        client.build()
    }
2297
    /// Spawn the control-channel listener for this crawl, unless disabled.
    ///
    /// Returns the shared state flag (0 = resume, 1 = pause, 2 = shutdown)
    /// plus the join handle of the listener task. The task watches the
    /// global `CONTROLLER` channel and updates the flag when a message
    /// targets this crawl's id.
    #[cfg(feature = "control")]
    pub fn configure_handler(&self) -> Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)> {
        use crate::utils::{Handler, CONTROLLER};

        if self.configuration.no_control_thread {
            None
        } else {
            let c: Arc<AtomicI8> = Arc::new(AtomicI8::new(0));
            let handle = c.clone();
            let target_id = self.target_id();

            let join_handle = crate::utils::spawn_task("control_handler", async move {
                let mut l = CONTROLLER.read().await.1.to_owned();

                while l.changed().await.is_ok() {
                    let n = &*l.borrow();
                    let (target, rest) = n;

                    // Only act on messages addressed to this crawl.
                    if target_id.eq_ignore_ascii_case(&target) {
                        if rest == &Handler::Resume {
                            c.store(0, Ordering::Relaxed);
                        }
                        if rest == &Handler::Pause {
                            c.store(1, Ordering::Relaxed);
                        }
                        if rest == &Handler::Shutdown {
                            c.store(2, Ordering::Relaxed);
                        }
                    }
                }
            });

            Some((handle, join_handle))
        }
    }
2334
    /// No-op control handler when the `control` feature is disabled.
    #[cfg(not(feature = "control"))]
    pub fn configure_handler(&self) -> Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)> {
        None
    }
2340
    /// Install chrome request interception on the page using the configured
    /// intercept settings, auth challenge response, and target URL.
    #[cfg(all(feature = "chrome", feature = "chrome_intercept"))]
    pub async fn setup_chrome_interception(
        &self,
        page: &chromiumoxide::Page,
    ) -> Option<tokio::task::JoinHandle<()>> {
        crate::features::chrome::setup_chrome_interception_base(
            page,
            self.configuration.chrome_intercept.enabled,
            &self.configuration.auth_challenge_response,
            self.configuration.chrome_intercept.block_visuals,
            self.url.inner(),
        )
        .await
    }
2356
    /// No-op interception setup when `chrome_intercept` is disabled.
    #[cfg(all(feature = "chrome", not(feature = "chrome_intercept")))]
    pub async fn setup_chrome_interception(
        &self,
        _chrome_page: &chromiumoxide::Page,
    ) -> Option<tokio::task::JoinHandle<()>> {
        None
    }
2365
    /// Build the relative link selectors for the crawl URL, honoring the
    /// configured subdomain/tld scope.
    pub fn setup_selectors(&self) -> RelativeSelectors {
        setup_website_selectors(
            self.get_url().inner(),
            AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
        )
    }
2373
    /// Shared synchronous setup: determine concurrency limits, prepare disk
    /// storage, rebuild headers, initialize the background connect runtime,
    /// take or build the HTTP client, build proxy-rotation clients, and
    /// spawn the control handler.
    pub fn setup_base(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
        self.determine_limits();
        self.setup_disk();
        self.configure_headers();

        crate::utils::connect::init_background_runtime();

        // Reuse a caller-supplied client when present; build otherwise.
        let client = match self.client.take() {
            Some(client) => client,
            _ => self.configure_http_client(),
        };

        #[cfg(not(feature = "decentralized"))]
        {
            self.client_rotator = self.build_rotated_clients();
        }

        (client, self.configure_handler())
    }
2394
    /// Full async crawl preparation: run the base setup, clear prior state
    /// unless resuming (`CrawlStatus::Active`), and configure the robots.txt
    /// parser with the new client.
    pub async fn setup(
        &mut self,
    ) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
        let setup = self.setup_base();
        if self.status != CrawlStatus::Active {
            self.clear_all().await;
        } else {
            // Resuming: skip the initial fetch when extra links are queued.
            self.skip_initial = !self.extra_links.is_empty();
        }
        self.configure_robots_parser(&setup.0).await;
        setup
    }
2408
2409 pub fn setup_crawl(
2411 &self,
2412 ) -> (
2413 std::pin::Pin<Box<tokio::time::Interval>>,
2414 std::pin::Pin<Box<Duration>>,
2415 ) {
2416 let interval = Box::pin(tokio::time::interval(Duration::from_millis(10)));
2417 let throttle = Box::pin(self.get_delay());
2418
2419 (interval, throttle)
2420 }
2421
2422 #[cfg(feature = "glob")]
2424 pub fn get_expanded_links(&self, domain_name: &str) -> Vec<CaseInsensitiveString> {
2425 let mut expanded = crate::features::glob::expand_url(&domain_name);
2426
2427 if expanded.len() == 0 {
2428 if let Some(u) = self.get_absolute_path(Some(domain_name)) {
2429 expanded.push(u.as_str().into());
2430 }
2431 };
2432
2433 expanded
2434 }
2435
    /// Classify the crawl from the first page response.
    ///
    /// - 403 with no links → `Blocked`, additionally recording whether the
    ///   body looks like a JS challenge, the Apache 403 page, or the
    ///   OpenResty 403 page in `website_meta_info`.
    /// - 429 → `RateLimited`; 5xx → `ServerError`.
    /// - Empty body → `ConnectError` for the sentinel "unknown status"
    ///   codes, otherwise `Empty`.
    pub fn set_crawl_initial_status(
        &mut self,
        page: &crate::page::Page,
        links: &HashSet<CaseInsensitiveString>,
    ) {
        use crate::utils::{detect_open_resty_forbidden, APACHE_FORBIDDEN};

        if page.status_code == reqwest::StatusCode::FORBIDDEN && links.is_empty() {
            if is_safe_javascript_challenge(&page) {
                self.website_meta_info = WebsiteMetaInfo::RequiresJavascript;
            } else if page.get_html_bytes_u8() == *APACHE_FORBIDDEN {
                self.website_meta_info = WebsiteMetaInfo::Apache403;
            } else if detect_open_resty_forbidden(page.get_html_bytes_u8()) {
                self.website_meta_info = WebsiteMetaInfo::OpenResty403;
            }
            self.status = CrawlStatus::Blocked;
        } else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
            self.status = CrawlStatus::RateLimited;
        } else if page.status_code.is_server_error() {
            self.status = CrawlStatus::ServerError;
        } else if page.is_empty() {
            if page.status_code == *UNKNOWN_STATUS_ERROR
                || page.status_code == *CHROME_UNKNOWN_STATUS_ERROR
            {
                self.status = CrawlStatus::ConnectError;
            } else {
                self.status = CrawlStatus::Empty;
            }
        }
    }
2467
2468 #[cfg(feature = "cmd")]
2470 pub async fn _crawl_establish_cmd(
2471 &mut self,
2472 cmd: std::path::PathBuf,
2473 cmd_args: Vec<String>,
2474 base: &mut RelativeSelectors,
2475 _ssg_build: bool,
2476 ) -> HashSet<CaseInsensitiveString> {
2477 if self.skip_initial {
2478 return Default::default();
2479 }
2480
2481 if !self
2482 .is_allowed_default(self.get_base_link())
2483 .eq(&ProcessLinkStatus::Allowed)
2484 {
2485 return HashSet::new();
2486 }
2487
2488 let url = self.url.inner();
2489
2490 let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
2491 let mut links_ssg = HashSet::new();
2492 let mut links_pages = if self.configuration.return_page_links {
2493 Some(HashSet::new())
2494 } else {
2495 None
2496 };
2497
2498 let mut page_links_settings =
2499 PageLinkBuildSettings::new(true, self.configuration.full_resources);
2500 page_links_settings.subdomains = self.configuration.subdomains;
2501 page_links_settings.tld = self.configuration.tld;
2502 page_links_settings.normalize = self.configuration.normalize;
2503
2504 let mut domain_parsed = self.domain_parsed.take();
2505
2506 let mut retry_count = self.configuration.retry;
2507 let mut last_err: Option<std::io::Error> = None;
2508
2509 let build_error_page = |status: StatusCode, _err: std::io::Error| {
2510 let mut p = Page::default();
2511 p.url = url.to_string();
2512 p.status_code = status;
2513 #[cfg(feature = "page_error_status_details")]
2514 {
2515 p.error_for_status = Some(Err(_err));
2516 }
2517 p
2518 };
2519
2520 let mut page: Page = loop {
2521 let bytes = match Self::run_via_cmd(&cmd, &cmd_args, url).await {
2522 Ok(b) => {
2523 if b.is_empty() {
2524 last_err = Some(std::io::Error::new(
2525 std::io::ErrorKind::UnexpectedEof,
2526 "cmd returned empty stdout",
2527 ));
2528 None
2529 } else {
2530 Some(b)
2531 }
2532 }
2533 Err(e) => {
2534 last_err = Some(e);
2535 None
2536 }
2537 };
2538
2539 if let Some(bytes) = bytes.as_deref() {
2540 let mut domain_parsed_out = None;
2541
2542 let page = Page::new_page_streaming_from_bytes(
2543 url,
2544 bytes,
2545 base,
2546 &self.configuration.external_domains_caseless,
2547 &page_links_settings,
2548 &mut links,
2549 Some(&mut links_ssg),
2550 &mut domain_parsed,
2551 &mut domain_parsed_out,
2552 &mut links_pages,
2553 )
2554 .await;
2555
2556 if self.domain_parsed.is_none() {
2557 if let Some(mut dp) = domain_parsed.take() {
2558 convert_abs_url(&mut dp);
2559 self.domain_parsed.replace(dp);
2560 } else if let Some(mut dp) = domain_parsed_out.take() {
2561 convert_abs_url(&mut dp);
2562 self.domain_parsed.replace(dp);
2563 }
2564 } else if self.domain_parsed.is_none() {
2565 self.domain_parsed = domain_parsed_out;
2566 }
2567
2568 if page.should_retry && retry_count > 0 {
2569 retry_count -= 1;
2570 if let Some(timeout) = page.get_timeout() {
2571 tokio::time::sleep(timeout).await;
2572 } else {
2573 tokio::time::sleep(std::time::Duration::from_millis(250)).await;
2574 }
2575 continue;
2576 }
2577
2578 break page;
2579 }
2580
2581 if retry_count == 0 {
2582 let err = last_err.take().unwrap_or_else(|| {
2583 std::io::Error::new(
2584 std::io::ErrorKind::Other,
2585 "cmd fetch failed (unknown error)",
2586 )
2587 });
2588 break build_error_page(StatusCode::BAD_GATEWAY, err);
2589 }
2590
2591 retry_count -= 1;
2592 tokio::time::sleep(std::time::Duration::from_millis(250)).await;
2593 };
2594
2595 if page.get_html_bytes_u8().starts_with(b"<?xml") {
2596 page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
2597 .await;
2598 }
2599
2600 emit_log(url);
2601
2602 if let Some(signature) = page.signature {
2603 if !self.is_signature_allowed(signature).await {
2604 return Default::default();
2605 }
2606 self.insert_signature(signature).await;
2607 }
2608
2609 let url_ci = match &self.on_link_find_callback {
2610 Some(cb) => cb(*self.url.clone(), None).0,
2611 _ => *self.url.clone(),
2612 };
2613 self.insert_link(url_ci).await;
2614
2615 if self.configuration.return_page_links {
2616 page.page_links = links_pages
2617 .filter(|pages: &HashSet<CaseInsensitiveString>| !pages.is_empty())
2618 .map(Box::new);
2619 }
2620
2621 links.extend(links_ssg);
2622
2623 self.initial_status_code = page.status_code;
2624 self.initial_html_length = page.get_html_bytes_u8().len();
2625 self.initial_anti_bot_tech = page.anti_bot_tech;
2626 self.initial_page_should_retry = page.should_retry;
2627 self.initial_page_waf_check = page.waf_check;
2628
2629 self.set_crawl_initial_status(&page, &links);
2630
2631 if let Some(ref cb) = self.on_should_crawl_callback {
2632 if !cb.call(&page) {
2633 page.blocked_crawl = true;
2634 channel_send_page(&self.channel, page, &self.channel_guard);
2635 return Default::default();
2636 }
2637 }
2638
2639 channel_send_page(&self.channel, page, &self.channel_guard);
2640
2641 links
2642 }
2643
    /// Establish the initial crawl page over HTTP and collect its links.
    ///
    /// Uses seeded HTML when available (skipping the network fetch), retries
    /// failed requests up to the configured budget — with a bounded backoff
    /// window on gateway timeouts — and records the initial response details
    /// for crawl status reporting. Returns the discovered links, or an empty
    /// set when `skip_initial` is set or the base link is not allowed.
    #[cfg(not(feature = "glob"))]
    pub async fn _crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }

        if self
            .is_allowed_default(self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            let url = self.url.inner();

            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
            let mut links_ssg = HashSet::new();
            // Only collect per-page links when the configuration requests them.
            let mut links_pages = if self.configuration.return_page_links {
                Some(HashSet::new())
            } else {
                None
            };
            let mut page_links_settings =
                PageLinkBuildSettings::new(true, self.configuration.full_resources);

            page_links_settings.subdomains = self.configuration.subdomains;
            page_links_settings.tld = self.configuration.tld;
            page_links_settings.normalize = self.configuration.normalize;

            // Taken here so a `&mut` to it can be handed to the streaming
            // parser alongside `&mut self.domain_parsed`; restored below.
            let mut domain_parsed = self.domain_parsed.take();

            // Prefer seeded HTML over a network fetch when it is available.
            let mut page = if let Some(mut seeded_page) = self.build_seed_page() {
                #[cfg(not(feature = "decentralized"))]
                {
                    // Extract links from the seeded body unless it is binary.
                    let html_bytes = seeded_page.get_html_bytes_u8();
                    if !html_bytes.is_empty() && !auto_encoder::is_binary_file(html_bytes) {
                        let html = seeded_page.get_html();
                        let extracted_links: HashSet<CaseInsensitiveString> = seeded_page
                            .links_stream_base_ssg(base, &html, client, &self.domain_parsed)
                            .await;
                        links.extend(extracted_links);
                    }
                }
                seeded_page
            } else {
                Page::new_page_streaming(
                    url,
                    client,
                    false,
                    base,
                    &self.configuration.external_domains_caseless,
                    &page_links_settings,
                    &mut links,
                    Some(&mut links_ssg),
                    &mut domain_parsed,
                    &mut self.domain_parsed,
                    &mut links_pages,
                )
                .await
            };

            // Sitemap/XML documents need the dedicated XML link extractor.
            if page.get_html_bytes_u8().starts_with(b"<?xml") {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            // Restore the parsed domain taken above if streaming did not set it.
            if self.domain_parsed.is_none() {
                if let Some(mut domain_parsed) = domain_parsed.take() {
                    convert_abs_url(&mut domain_parsed);
                    self.domain_parsed.replace(domain_parsed);
                }
            }

            let mut retry_count = self.configuration.retry;
            let domains_caseless = &self.configuration.external_domains_caseless;

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                // Honor the server-suggested delay when present.
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }

                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Gateway timeouts get a bounded backoff window so a hung
                    // upstream cannot stall the crawl past BACKOFF_MAX_DURATION.
                    // A clone of the parsed domain is mutated inside the async
                    // block and written back after, to avoid borrowing `self`.
                    let mut domain_parsed_clone = self.domain_parsed.clone();

                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        page.clone_from(
                            &Page::new_page_streaming(
                                url,
                                client,
                                false,
                                base,
                                domains_caseless,
                                &page_links_settings,
                                &mut links,
                                Some(&mut links_ssg),
                                &mut domain_parsed,
                                &mut domain_parsed_clone,
                                &mut links_pages,
                            )
                            .await,
                        );
                    })
                    .await
                    {
                        log::info!("backoff gateway timeout exceeded {elasped}");
                    }

                    self.domain_parsed = domain_parsed_clone;
                } else {
                    page.clone_from(
                        &Page::new_page_streaming(
                            url,
                            client,
                            false,
                            base,
                            &self.configuration.external_domains_caseless,
                            &page_links_settings,
                            &mut links,
                            Some(&mut links_ssg),
                            &mut domain_parsed,
                            &mut self.domain_parsed,
                            &mut links_pages,
                        )
                        .await,
                    );
                }
            }

            emit_log(url);

            // Skip pages whose content signature was already visited.
            if let Some(signature) = page.signature {
                if !self.is_signature_allowed(signature).await {
                    return Default::default();
                }
                self.insert_signature(signature).await;
            }

            let url = match &self.on_link_find_callback {
                Some(cb) => cb(*self.url.clone(), None).0,
                _ => *self.url.clone(),
            };

            self.insert_link(url).await;

            if self.configuration.return_page_links {
                page.page_links = links_pages
                    .filter(|pages: &HashSet<CaseInsensitiveString>| !pages.is_empty())
                    .map(Box::new);
            }

            links.extend(links_ssg);

            // Record the initial response details for crawl status reporting.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
2824
2825 #[cfg(feature = "cmd")]
2827 pub async fn run_via_cmd(
2828 cmd: &std::path::Path,
2829 fixed_args: &[String],
2830 url: &str,
2831 ) -> std::io::Result<Vec<u8>> {
2832 use tokio::process::Command;
2833 let mut args: Vec<String> = Vec::with_capacity(fixed_args.len() + 1);
2834 let mut used_placeholder = false;
2835
2836 for a in fixed_args {
2837 if a.contains("{url}") {
2838 used_placeholder = true;
2839 args.push(a.replace("{url}", url));
2840 } else {
2841 args.push(a.clone());
2842 }
2843 }
2844
2845 if !used_placeholder {
2846 args.push(url.to_string());
2847 }
2848
2849 let out = Command::new(cmd)
2850 .args(&args)
2851 .kill_on_drop(true)
2852 .output()
2853 .await?;
2854
2855 if !out.status.success() {
2856 let code = out.status.code().unwrap_or(-1);
2857 let stderr = String::from_utf8_lossy(&out.stderr);
2858
2859 return Err(std::io::Error::new(
2860 std::io::ErrorKind::Other,
2861 format!("cmd exit={code} stderr={stderr}"),
2862 ));
2863 }
2864
2865 Ok(out.stdout)
2866 }
2867
    /// Crawl the site concurrently, fetching every page through an external
    /// command instead of an HTTP client.
    ///
    /// Establishes the initial page via `_crawl_establish_cmd`, then fans out
    /// over discovered links with a semaphore-bounded `JoinSet`, honoring the
    /// throttle, budget, crawl-timeout, and shutdown signals from `handle`.
    /// Pages are streamed to subscribers over the channel as they complete.
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    #[cfg(feature = "cmd")]
    pub async fn crawl_concurrent_cmd(
        &mut self,
        cmd: std::path::PathBuf,
        cmd_args: Vec<String>,
        handle: &Option<Arc<AtomicI8>>,
    ) {
        self.start();
        self.status = CrawlStatus::Active;

        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        // Single-page mode: fetch only the target url, emit it, and return.
        if self.single_page() {
            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
            let mut links_pages: Option<HashSet<CaseInsensitiveString>> =
                if self.configuration.return_page_links {
                    Some(HashSet::new())
                } else {
                    None
                };

            let mut relative_selectors = selector;
            let mut domain_parsed = None;

            let target = self
                .domain_parsed
                .as_ref()
                .map(|u| u.as_str())
                .unwrap_or(self.get_url());

            let bytes = match Self::run_via_cmd(&cmd, &cmd_args, target).await {
                Ok(b) => b,
                Err(e) => {
                    // Command failed outright: emit an error page and bail.
                    let mut page = Page::default();
                    page.url = target.to_string();
                    page.status_code = StatusCode::BAD_GATEWAY;
                    #[cfg(feature = "page_error_status_details")]
                    {
                        page.error_for_status = Some(Err(e));
                    }
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return;
                }
            };

            let page = Page::new_page_streaming_from_bytes(
                target,
                &bytes,
                &mut relative_selectors,
                &self.configuration.external_domains_caseless,
                &PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                &mut links,
                None,
                &self.domain_parsed,
                &mut domain_parsed,
                &mut links_pages,
            )
            .await;

            channel_send_page(&self.channel, page, &self.channel_guard);
            return;
        }

        let on_should_crawl_callback = self.on_should_crawl_callback.clone();
        let return_page_links = self.configuration.return_page_links;
        let full_resources = self.configuration.full_resources;
        let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

        let (mut interval, throttle) = self.setup_crawl();
        let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

        links.extend(
            self._crawl_establish_cmd(cmd.clone(), cmd_args.clone(), &mut selector, false)
                .await,
        );

        self.configuration.configure_allowlist();
        let semaphore = self.setup_semaphore();

        // State shared by every spawned fetch task. Tuple index map:
        // 0 cmd, 1 cmd_args, 2 selectors, 3 channel, 4 external domains,
        // 5 channel guard, 6 retry budget, 7 return_page_links flag,
        // 8 link-build settings, 9 parsed domain, 10 on_link_find callback.
        let shared = Arc::new((
            cmd,
            cmd_args,
            selector,
            self.channel.clone(),
            self.configuration.external_domains_caseless.clone(),
            self.channel_guard.clone(),
            self.configuration.retry,
            return_page_links,
            PageLinkBuildSettings::new_full(
                false,
                full_resources,
                self.configuration.subdomains,
                self.configuration.tld,
                self.configuration.normalize,
            ),
            self.domain_parsed.clone(),
            self.on_link_find_callback.clone(),
        ));

        // Each task yields the links it found plus an optional page signature.
        let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

        let mut exceeded_budget = false;
        let concurrency = throttle.is_zero();

        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

        if !concurrency && !links.is_empty() {
            tokio::time::sleep(*throttle).await;
        }

        // Only track elapsed time when a crawl timeout is configured.
        let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
            Some(Instant::now())
        } else {
            None
        };

        'outer: loop {
            let mut stream =
                tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

            loop {
                if !concurrency {
                    tokio::time::sleep(*throttle).await;
                }

                let semaphore = get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                tokio::select! {
                    // `biased` keeps polling order deterministic: drain new
                    // links first, then harvest completed tasks.
                    biased;

                    Some(link) = stream.next(),
                    if semaphore.available_permits() > 0
                        && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) =>
                    {
                        // Shutdown path: abort in-flight tasks, give their
                        // permits back, and stash unprocessed links.
                        if !self.handle_process(handle, &mut interval, async {
                            emit_log_shutdown(link.inner());
                            let permits = set.len();
                            set.shutdown().await;
                            semaphore.add_permits(permits);
                        }).await {
                            while let Some(links) = stream.next().await {
                                self.extra_links.insert(links);
                            }
                            break 'outer;
                        }

                        let allowed = self.is_allowed(&link);
                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                            exceeded_budget = true;
                            break;
                        }
                        if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                            continue;
                        }

                        emit_log(link.inner());
                        self.insert_link(link.clone()).await;

                        if let Ok(permit) = semaphore.clone().acquire_owned().await {
                            let shared = shared.clone();
                            let on_should_crawl_callback = on_should_crawl_callback.clone();
                            spawn_set("page_fetch_cmd", &mut set, async move {
                                // Let the link-find callback rewrite the url.
                                let link_result = match &shared.10 {
                                    Some(cb) => cb(link, None),
                                    _ => (link, None),
                                };

                                let mut out_links: HashSet<CaseInsensitiveString> = HashSet::new();
                                let mut links_pages = if shared.7 { Some(HashSet::new()) } else { None };

                                let mut relative_selectors = shared.2.clone();
                                let mut r_settings = shared.8;
                                r_settings.ssg_build = true;

                                let target_url = link_result.0.as_ref();

                                let mut retry_count = shared.6;
                                let mut last_err: Option<std::io::Error> = None;

                                // Retry loop: empty stdout counts as failure.
                                let bytes = loop {
                                    match Self::run_via_cmd(&shared.0, &shared.1, target_url).await {
                                        Ok(b) if !b.is_empty() => break Some(b),
                                        Ok(_) => {
                                            last_err = Some(std::io::Error::new(
                                                std::io::ErrorKind::UnexpectedEof,
                                                "cmd returned empty stdout",
                                            ));
                                        }
                                        Err(e) => {
                                            last_err = Some(e);
                                        }
                                    }

                                    if retry_count == 0 { break None; }
                                    retry_count -= 1;

                                    tokio::time::sleep(std::time::Duration::from_millis(250)).await;
                                };

                                let mut domain_parsed = None;

                                let mut page = if let Some(bytes) = bytes {
                                    Page::new_page_streaming_from_bytes(
                                        target_url,
                                        &bytes,
                                        &mut relative_selectors,
                                        &shared.4,
                                        &r_settings,
                                        &mut out_links,
                                        None,
                                        &shared.9,
                                        &mut domain_parsed,
                                        &mut links_pages,
                                    ).await
                                } else {
                                    // All retries exhausted: synthesize an
                                    // error page carrying the last error.
                                    let mut p = Page::default();
                                    p.url = target_url.to_string();
                                    p.status_code = StatusCode::BAD_GATEWAY;
                                    if let Some(e) = last_err {
                                        #[cfg(feature = "page_error_status_details")]
                                        {
                                            p.error_for_status = Some(Err(e));
                                        }
                                    }
                                    p
                                };

                                if shared.7 {
                                    page.page_links = links_pages
                                        .filter(|pages| !pages.is_empty())
                                        .map(Box::new);
                                }

                                if let Some(ref cb) = on_should_crawl_callback {
                                    if !cb.call(&page) {
                                        page.blocked_crawl = true;
                                        channel_send_page(&shared.3, page, &shared.5);
                                        drop(permit);
                                        return Default::default();
                                    }
                                }

                                let signature = page.signature;
                                channel_send_page(&shared.3, page, &shared.5);
                                drop(permit);

                                (out_links, signature)
                            });
                        }

                        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                    },

                    Some(result) = set.join_next(), if !set.is_empty() => {
                        if let Ok(res) = result {
                            // Deduplicate by signature before feeding new links
                            // back into the frontier.
                            match res.1 {
                                Some(signature) => {
                                    if self.is_signature_allowed(signature).await {
                                        self.insert_signature(signature).await;
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                                _ => {
                                    self.links_visited.extend_links(&mut links, res.0);
                                }
                            }
                        } else {
                            break;
                        }
                    }

                    else => break,
                }

                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                if (links.is_empty() && set.is_empty()) || exceeded_budget {
                    if exceeded_budget {
                        // Budget exhausted: preserve pending work in
                        // `extra_links` instead of dropping it.
                        while let Some(links) = stream.next().await {
                            self.extra_links.insert(links);
                        }
                        while let Some(links) = set.join_next().await {
                            if let Ok(links) = links {
                                self.extra_links.extend(links.0);
                            }
                        }
                    }
                    break 'outer;
                }
            }

            self.subscription_guard().await;
            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if links.is_empty() && set.is_empty() {
                break;
            }
        }

        if !links.is_empty() {
            self.extra_links.extend(links);
        }
    }
3187
3188 #[allow(dead_code)]
3190 fn build_seed_page(&self) -> Option<Page> {
3191 if let Some(seeded_html) = self.get_seeded_html() {
3192 let mut page_response = PageResponse::default();
3193 page_response.content = Some(Box::new(seeded_html.as_bytes().to_vec()));
3194 Some(build(&self.url.inner(), page_response))
3195 } else {
3196 None
3197 }
3198 }
3199
    /// Establish the initial crawl page through a headless Chrome page.
    ///
    /// Sets up Chrome events and request interception, fetches the start url
    /// (seeded HTML variant when available), retries on failure with a bounded
    /// backoff on gateway timeouts, re-derives the selectors when the page was
    /// redirected, and records the initial response details. Returns the links
    /// discovered on the page.
    #[cfg(all(
        not(feature = "decentralized"),
        feature = "chrome",
        not(feature = "glob")
    ))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
        chrome_page: &chromiumoxide::Page,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }

        if self
            .is_allowed_default(&self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            // Event setup and interception run concurrently; the interception
            // task handle (if any) is awaited with a timeout further below.
            let (_, intercept_handle) = tokio::join!(
                crate::features::chrome::setup_chrome_events(chrome_page, &self.configuration),
                self.setup_chrome_interception(&chrome_page)
            );

            // Seeded HTML bypasses navigation; otherwise drive Chrome to the url.
            let mut page = if let Some(seeded_html) = self.get_seeded_html() {
                Page::new_seeded(
                    &self.url.inner(),
                    &client,
                    &chrome_page,
                    &self.configuration.wait_for,
                    &self.configuration.screenshot,
                    false,
                    &self.configuration.openai_config,
                    &self.configuration.execution_scripts,
                    &self.configuration.automation_scripts,
                    &self.configuration.viewport,
                    &self.configuration.request_timeout,
                    &self.configuration.track_events,
                    self.configuration.referer.clone(),
                    self.configuration.max_page_bytes,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                    Some(seeded_html.clone()),
                    Some(&self.cookie_jar),
                    &self.configuration.remote_multimodal,
                )
                .await
            } else {
                Page::new(
                    &self.url.inner(),
                    &client,
                    &chrome_page,
                    &self.configuration.wait_for,
                    &self.configuration.screenshot,
                    false,
                    &self.configuration.openai_config,
                    &self.configuration.execution_scripts,
                    &self.configuration.automation_scripts,
                    &self.configuration.viewport,
                    &self.configuration.request_timeout,
                    &self.configuration.track_events,
                    self.configuration.referer.clone(),
                    self.configuration.max_page_bytes,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                    &self.configuration.remote_multimodal,
                )
                .await
            };

            let mut retry_count = self.configuration.retry;

            // A chrome-error redirect with an empty "successful" body while a
            // proxy is configured indicates a broken proxy: force a retry.
            if let Some(final_redirect_destination) = &page.final_redirect_destination {
                if final_redirect_destination == "chrome-error://chromewebdata/"
                    && page.status_code.is_success()
                    && page.is_empty()
                    && self.configuration.proxies.is_some()
                {
                    page.error_status = Some("Invalid proxy configuration.".into());
                    page.should_retry = true;
                    page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                }
            }

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }
                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Bound the gateway-timeout retry with BACKOFF_MAX_DURATION.
                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        let next_page = Page::new(
                            &self.url.inner(),
                            &client,
                            &chrome_page,
                            &self.configuration.wait_for,
                            &self.configuration.screenshot,
                            false,
                            &self.configuration.openai_config,
                            &self.configuration.execution_scripts,
                            &self.configuration.automation_scripts,
                            &self.configuration.viewport,
                            &self.configuration.request_timeout,
                            &self.configuration.track_events,
                            self.configuration.referer.clone(),
                            self.configuration.max_page_bytes,
                            self.configuration.get_cache_options(),
                            &self.configuration.cache_policy,
                            &self.configuration.remote_multimodal,
                        )
                        .await;
                        page.clone_from(&next_page);
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elasped}");
                    }
                } else {
                    let next_page = Page::new(
                        &self.url.inner(),
                        &client,
                        &chrome_page,
                        &self.configuration.wait_for,
                        &self.configuration.screenshot,
                        false,
                        &self.configuration.openai_config,
                        &self.configuration.execution_scripts,
                        &self.configuration.automation_scripts,
                        &self.configuration.viewport,
                        &self.configuration.request_timeout,
                        &self.configuration.track_events,
                        self.configuration.referer.clone(),
                        self.configuration.max_page_bytes,
                        self.configuration.get_cache_options(),
                        &self.configuration.cache_policy,
                        &self.configuration.remote_multimodal,
                    )
                    .await;
                    page.clone_from(&next_page);
                }

                // Re-check the broken-proxy signature after every retry.
                if let Some(final_redirect_destination) = &page.final_redirect_destination {
                    if final_redirect_destination == "chrome-error://chromewebdata/"
                        && page.status_code.is_success()
                        && page.is_empty()
                        && self.configuration.proxies.is_some()
                    {
                        page.error_status = Some("Invalid proxy configuration.".into());
                        page.should_retry = true;
                        page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                    }
                }
            }

            // Give the interception task 10s to finish, then abort it.
            if let Some(h) = intercept_handle {
                let abort_handle = h.abort_handle();
                if let Err(elasped) =
                    tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                {
                    log::warn!("Handler timeout exceeded {elasped}");
                    abort_handle.abort();
                }
            }

            // On redirect, adopt the destination as the new base url and
            // rebuild the selectors to match it.
            if let Some(domain) = &page.final_redirect_destination {
                let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
                let prior_domain = self.domain_parsed.take();
                self.domain_parsed = parse_absolute_url(&domain);
                self.url = domain;

                let s = self.setup_selectors();
                base.0 = s.0;
                base.1 = s.1;

                // NOTE(review): base.2 is taken from the PRIOR domain host,
                // while crawl_establish_chrome_one derives it from the new
                // redirect destination — confirm which is intended.
                if let Some(pdname) = prior_domain {
                    if let Some(dname) = pdname.host_str() {
                        base.2 = dname.into();
                    }
                }
            }

            emit_log(&self.url.inner());

            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            let url = match &self.on_link_find_callback {
                Some(cb) => cb(*self.url.clone(), None).0,
                _ => *self.url.clone(),
            };

            self.insert_link(url).await;

            if self.configuration.return_page_links && page.page_links.is_none() {
                page.page_links = Some(Box::new(Default::default()));
            }

            let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");

            // XML documents use the dedicated XML extractor instead of links_ssg.
            let mut links = if !page.is_empty() && !xml_file {
                page.links_ssg(&base, &client, &self.domain_parsed).await
            } else {
                Default::default()
            };

            if xml_file {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            // Record the initial response details for crawl status reporting.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
3438
    /// Fetch a single page through a headless Chrome page without mutating
    /// the website's crawl state.
    ///
    /// Like `crawl_establish` but takes `&self`: it does not record initial
    /// status fields, insert links/signatures, or replace the website url.
    /// `url` overrides the configured start url when provided. Returns the
    /// links discovered on the page.
    #[cfg(all(not(feature = "decentralized"), feature = "chrome",))]
    pub async fn crawl_establish_chrome_one(
        &self,
        client: &Client,
        base: &mut RelativeSelectors,
        url: &Option<&str>,
        chrome_page: &chromiumoxide::Page,
    ) -> HashSet<CaseInsensitiveString> {
        if self
            .is_allowed_default(&self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            // Event setup and interception run concurrently; the interception
            // task handle (if any) is awaited with a timeout further below.
            let (_, intercept_handle) = tokio::join!(
                crate::features::chrome::setup_chrome_events(chrome_page, &self.configuration),
                self.setup_chrome_interception(&chrome_page)
            );

            let mut page = Page::new(
                url.unwrap_or(&self.url.inner()),
                &client,
                &chrome_page,
                &self.configuration.wait_for,
                &self.configuration.screenshot,
                false,
                &self.configuration.openai_config,
                &self.configuration.execution_scripts,
                &self.configuration.automation_scripts,
                &self.configuration.viewport,
                &self.configuration.request_timeout,
                &self.configuration.track_events,
                self.configuration.referer.clone(),
                self.configuration.max_page_bytes,
                self.configuration.get_cache_options(),
                &self.configuration.cache_policy,
                &self.configuration.remote_multimodal,
            )
            .await;

            let mut retry_count = self.configuration.retry;

            // A chrome-error redirect with an empty "successful" body while a
            // proxy is configured indicates a broken proxy: force a retry.
            if let Some(final_redirect_destination) = &page.final_redirect_destination {
                if final_redirect_destination == "chrome-error://chromewebdata/"
                    && page.status_code.is_success()
                    && page.is_empty()
                    && self.configuration.proxies.is_some()
                {
                    page.error_status = Some("Invalid proxy configuration.".into());
                    page.should_retry = true;
                    page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                }
            }

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }
                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Bound the gateway-timeout retry with BACKOFF_MAX_DURATION.
                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        let next_page = Page::new(
                            &self.url.inner(),
                            &client,
                            &chrome_page,
                            &self.configuration.wait_for,
                            &self.configuration.screenshot,
                            false,
                            &self.configuration.openai_config,
                            &self.configuration.execution_scripts,
                            &self.configuration.automation_scripts,
                            &self.configuration.viewport,
                            &self.configuration.request_timeout,
                            &self.configuration.track_events,
                            self.configuration.referer.clone(),
                            self.configuration.max_page_bytes,
                            self.configuration.get_cache_options(),
                            &self.configuration.cache_policy,
                            &self.configuration.remote_multimodal,
                        )
                        .await;
                        page.clone_from(&next_page);
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elasped}");
                    }
                } else {
                    let next_page = Page::new(
                        &self.url.inner(),
                        &client,
                        &chrome_page,
                        &self.configuration.wait_for,
                        &self.configuration.screenshot,
                        false,
                        &self.configuration.openai_config,
                        &self.configuration.execution_scripts,
                        &self.configuration.automation_scripts,
                        &self.configuration.viewport,
                        &self.configuration.request_timeout,
                        &self.configuration.track_events,
                        self.configuration.referer.clone(),
                        self.configuration.max_page_bytes,
                        self.configuration.get_cache_options(),
                        &self.configuration.cache_policy,
                        &self.configuration.remote_multimodal,
                    )
                    .await;
                    page.clone_from(&next_page);
                }

                // Re-check the broken-proxy signature after every retry.
                if let Some(final_redirect_destination) = &page.final_redirect_destination {
                    if final_redirect_destination == "chrome-error://chromewebdata/"
                        && page.status_code.is_success()
                        && page.is_empty()
                        && self.configuration.proxies.is_some()
                    {
                        page.error_status = Some("Invalid proxy configuration.".into());
                        page.should_retry = true;
                        page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                    }
                }
            }

            // Give the interception task 10s to finish, then abort it.
            if let Some(h) = intercept_handle {
                let abort_handle = h.abort_handle();
                if let Err(elasped) =
                    tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                {
                    log::warn!("Handler timeout exceeded {elasped}");
                    abort_handle.abort();
                }
            }

            // On redirect, rebuild the selectors around the destination host.
            if let Some(domain) = &page.final_redirect_destination {
                let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
                let s = self.setup_selectors();

                base.0 = s.0;
                base.1 = s.1;

                if let Some(pdname) = parse_absolute_url(&domain) {
                    if let Some(dname) = pdname.host_str() {
                        base.2 = dname.into();
                    }
                }
            }

            emit_log(&self.url.inner());

            if self.configuration.return_page_links && page.page_links.is_none() {
                page.page_links = Some(Box::new(Default::default()));
            }

            let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");

            // XML documents use the dedicated XML extractor instead of links_ssg.
            let mut links = if !page.is_empty() && !xml_file {
                page.links_ssg(&base, &client, &self.domain_parsed).await
            } else {
                Default::default()
            };

            if xml_file {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
3621
    /// Fetch a single page through a WebDriver session (thirtyfour) without
    /// mutating the website's crawl state.
    ///
    /// Mirrors `crawl_establish_chrome_one` for the webdriver backend: sets up
    /// driver events, fetches `url` (or the configured start url), retries
    /// with a bounded backoff on gateway timeouts, rebuilds the selectors on
    /// redirect, and returns the links discovered on the page.
    #[cfg(all(feature = "webdriver", not(feature = "decentralized"), not(feature = "chrome")))]
    pub async fn crawl_establish_webdriver_one(
        &self,
        client: &Client,
        base: &mut RelativeSelectors,
        url: &Option<&str>,
        driver: &std::sync::Arc<thirtyfour::WebDriver>,
    ) -> HashSet<CaseInsensitiveString> {
        if self
            .is_allowed_default(&self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            // Per-navigation timeout from the webdriver config, when set.
            let timeout = self
                .configuration
                .webdriver_config
                .as_ref()
                .and_then(|c| c.timeout);

            crate::features::webdriver::setup_driver_events(driver, &self.configuration).await;

            let mut page = Page::new_page_webdriver(
                url.unwrap_or(&self.url.inner()),
                driver,
                timeout,
            )
            .await;

            let mut retry_count = self.configuration.retry;

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                if let Some(timeout_duration) = page.get_timeout() {
                    tokio::time::sleep(timeout_duration).await;
                }
                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Bound the gateway-timeout retry with BACKOFF_MAX_DURATION.
                    if let Err(elapsed) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        let next_page = Page::new_page_webdriver(
                            &self.url.inner(),
                            driver,
                            timeout,
                        )
                        .await;
                        page.clone_from(&next_page);
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elapsed}");
                    }
                } else {
                    let next_page = Page::new_page_webdriver(
                        &self.url.inner(),
                        driver,
                        timeout,
                    )
                    .await;
                    page.clone_from(&next_page);
                }
            }

            // On redirect, rebuild the selectors around the destination host.
            if let Some(domain) = &page.final_redirect_destination {
                let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
                let s = self.setup_selectors();

                base.0 = s.0;
                base.1 = s.1;

                if let Some(pdname) = parse_absolute_url(&domain) {
                    if let Some(dname) = pdname.host_str() {
                        base.2 = dname.into();
                    }
                }
            }

            emit_log(&self.url.inner());

            if self.configuration.return_page_links && page.page_links.is_none() {
                page.page_links = Some(Box::new(Default::default()));
            }

            let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");

            // XML documents use the dedicated XML extractor instead of links_ssg.
            let mut links = if !page.is_empty() && !xml_file {
                page.links_ssg(&base, &client, &self.domain_parsed).await
            } else {
                Default::default()
            };

            if xml_file {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
3731
    /// Establish the crawl over the decentralized worker network (non-glob):
    /// fetch the entry URL once and seed the crawl with the links it returns.
    #[cfg(all(not(feature = "glob"), feature = "decentralized"))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        _: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
        http_worker: bool,
    ) -> HashSet<CaseInsensitiveString> {
        let links: HashSet<CaseInsensitiveString> = if self
            .is_allowed_default(&self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            let link = self.url.inner();

            // NOTE(review): https is rewritten to http when routing through
            // the http worker — presumably the worker handles TLS upstream;
            // confirm against the worker implementation.
            let mut page = Page::new(
                &if http_worker && link.starts_with("https") {
                    link.replacen("https", "http", 1)
                } else {
                    link.to_string()
                },
                &client,
            )
            .await;

            // Record the page signature for duplicate-content tracking.
            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            // Mark the entry URL visited, routing through the find callback
            // when one is configured.
            self.insert_link(match &self.on_link_find_callback {
                Some(cb) => cb(*self.url.to_owned(), None).0,
                _ => *self.url.to_owned(),
            })
            .await;

            // Capture first-response diagnostics used by status reporting.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            if self.configuration.return_page_links {
                page.page_links = Some(page.links.clone().into());
            }

            let links = HashSet::from(page.links.clone());

            self.set_crawl_initial_status(&page, &links);

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        };

        links
    }
3791
    /// Establish the crawl for glob-expanded URLs over the decentralized
    /// worker network: each expanded entry URL is fetched and its links merged.
    #[cfg(all(feature = "glob", feature = "decentralized"))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        _: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
        http_worker: bool,
    ) -> HashSet<CaseInsensitiveString> {
        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
        // Expand the glob pattern in the configured URL into concrete links.
        let expanded = self.get_expanded_links(&self.url.inner().as_str());
        self.configuration.configure_allowlist();

        for link in expanded {
            let allowed = self.is_allowed(&link);

            // Budget exhaustion ends the whole establish loop; a blocked or
            // disk-disallowed link only skips this entry.
            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                break;
            }
            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                continue;
            }

            // NOTE(review): https is rewritten to http for the http worker —
            // presumably the worker handles TLS upstream; confirm.
            let mut page = Page::new(
                &if http_worker && link.as_ref().starts_with("https") {
                    link.inner().replacen("https", "http", 1).to_string()
                } else {
                    link.inner().to_string()
                },
                &client,
            )
            .await;

            // Prefer the final URL reported by the fetch; fall back to the
            // requested link when the page carries no URL.
            let u = page.get_url();
            let u = if u.is_empty() { link } else { u.into() };

            let link_result = match &self.on_link_find_callback {
                Some(cb) => cb(u, None),
                _ => (u, None),
            };

            // Record the page signature for duplicate-content tracking.
            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            self.insert_link(link_result.0).await;

            if self.configuration.return_page_links {
                page.page_links = Some(Default::default());
            }

            channel_send_page(&self.channel, page.clone(), &self.channel_guard);

            let page_links = HashSet::from(page.links);

            links.extend(page_links);
        }

        links
    }
3851
    /// Establish the crawl for glob-expanded URLs, rendering each expanded
    /// entry through the shared chrome page.
    #[cfg(all(feature = "glob", feature = "chrome", not(feature = "decentralized")))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
        page: &chromiumoxide::Page,
    ) -> HashSet<CaseInsensitiveString> {
        // Seeded crawls skip the establish fetch entirely.
        if self.skip_initial {
            return Default::default();
        }
        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
        let expanded = self.get_expanded_links(&self.url.inner().as_str());
        self.configuration.configure_allowlist();

        for link in expanded {
            let allowed = self.is_allowed(&link);

            // Budget exhaustion stops expansion; blocked links are skipped.
            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                break;
            }
            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                continue;
            }

            // Render with the configured chrome behaviors (wait_for,
            // screenshots, scripts, viewport, caching, etc.).
            let mut page = Page::new(
                &link.inner().as_str(),
                &client,
                &page,
                &self.configuration.wait_for,
                &self.configuration.screenshot,
                false,
                &self.configuration.openai_config,
                &self.configuration.execution_scripts,
                &self.configuration.automation_scripts,
                &self.configuration.viewport,
                &self.configuration.request_timeout,
                &self.configuration.track_events,
                self.configuration.referer.clone(),
                self.configuration.max_page_bytes,
                self.configuration.get_cache_options(),
                &self.configuration.cache_policy,
                &self.configuration.remote_multimodal,
            )
            .await;

            // Prefer the final URL from the fetch; fall back to the request.
            let u = page.get_url();
            let u = if u.is_empty() { link } else { u.into() };

            let link_result = match &self.on_link_find_callback {
                Some(cb) => cb(u, None),
                _ => (u, None),
            };

            // Record the page signature for duplicate-content tracking.
            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            self.insert_link(link_result.0).await;

            if self.configuration.return_page_links {
                // Extract links before emitting so the sent page carries its
                // populated page_links container.
                page.page_links = Some(Default::default());
                let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);

                channel_send_page(&self.channel, page.clone(), &self.channel_guard);

                links.extend(next_links);
            } else {
                // Emit first, then extract — subscribers do not need links.
                channel_send_page(&self.channel, page.clone(), &self.channel_guard);
                let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);

                links.extend(next_links);
            }
        }

        links
    }
3930
    /// Establish the crawl for glob-expanded URLs over streaming HTTP.
    #[cfg(feature = "glob")]
    async fn _crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
    ) -> HashSet<CaseInsensitiveString> {
        // Seeded crawls skip the establish fetch entirely.
        if self.skip_initial {
            return Default::default();
        }
        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
        let domain_name = self.url.inner();
        let expanded = self.get_expanded_links(&domain_name.as_str());

        self.configuration.configure_allowlist();

        for url in expanded {
            // The regex feature matches against the full case-insensitive
            // value; otherwise the compact inner string suffices.
            #[cfg(feature = "regex")]
            let url_ref: &CaseInsensitiveString = &url;
            #[cfg(not(feature = "regex"))]
            let url_ref: &CompactString = url.inner();
            if self
                .is_allowed_default(url_ref)
                .eq(&ProcessLinkStatus::Allowed)
            {
                let mut links_ssg = HashSet::new();
                let mut links_pages = if self.configuration.return_page_links {
                    Some(HashSet::new())
                } else {
                    None
                };
                let mut page_links_settings =
                    PageLinkBuildSettings::new(true, self.configuration.full_resources);

                page_links_settings.subdomains = self.configuration.subdomains;
                page_links_settings.tld = self.configuration.tld;
                page_links_settings.normalize = self.configuration.normalize;

                // Temporarily take the parsed domain; the streaming fetch can
                // populate either slot, and it is restored below if unset.
                let mut domain_parsed = self.domain_parsed.take();

                let mut page = Page::new_page_streaming(
                    &url,
                    client,
                    false,
                    base,
                    &self.configuration.external_domains_caseless,
                    &page_links_settings,
                    &mut links,
                    Some(&mut links_ssg),
                    &mut domain_parsed,
                    &mut self.domain_parsed,
                    &mut links_pages,
                )
                .await;

                if self.domain_parsed.is_none() {
                    if let Some(mut domain_parsed) = domain_parsed.take() {
                        convert_abs_url(&mut domain_parsed);
                        self.domain_parsed.replace(domain_parsed);
                    }
                }

                let mut retry_count = self.configuration.retry;
                let domains_caseless = &self.configuration.external_domains_caseless;

                while page.should_retry && retry_count > 0 {
                    retry_count -= 1;
                    // Honor any server-provided delay before the next attempt.
                    if let Some(timeout) = page.get_timeout() {
                        tokio::time::sleep(timeout).await;
                    }

                    if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                        // A clone is retried against so `self.domain_parsed`
                        // is only committed after the bounded backoff attempt.
                        let mut domain_parsed_clone = self.domain_parsed.clone();

                        if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                            page.clone_from(
                                &Page::new_page_streaming(
                                    &url,
                                    client,
                                    false,
                                    base,
                                    domains_caseless,
                                    &page_links_settings,
                                    &mut links,
                                    Some(&mut links_ssg),
                                    &mut domain_parsed,
                                    &mut domain_parsed_clone,
                                    &mut links_pages,
                                )
                                .await,
                            );
                        })
                        .await
                        {
                            log::info!("backoff gateway timeout exceeded {elasped}");
                        }

                        self.domain_parsed = domain_parsed_clone;
                    } else {
                        page.clone_from(
                            &Page::new_page_streaming(
                                &url,
                                client,
                                false,
                                base,
                                &self.configuration.external_domains_caseless,
                                &page_links_settings,
                                &mut links,
                                Some(&mut links_ssg),
                                &mut domain_parsed,
                                &mut self.domain_parsed,
                                &mut links_pages,
                            )
                            .await,
                        );
                    }
                }

                emit_log(&url);

                // Duplicate-content guard. NOTE(review): a disallowed
                // signature returns an empty set, discarding links already
                // gathered from earlier expanded URLs — confirm this is
                // intentional rather than a `continue`.
                if let Some(signature) = page.signature {
                    if !self.is_signature_allowed(signature).await {
                        return Default::default();
                    }
                    self.insert_signature(signature).await;
                }

                self.insert_link(
                    self.on_link_find_callback
                        .as_ref()
                        .map(|cb| cb(*self.url.clone(), None).0)
                        .unwrap_or_else(|| *self.url.clone()),
                )
                .await;

                if self.configuration.return_page_links {
                    page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
                }

                links.extend(links_ssg);

                // Capture first-response diagnostics used by status reporting.
                self.initial_status_code = page.status_code;
                self.initial_html_length = page.get_html_bytes_u8().len();
                self.initial_anti_bot_tech = page.anti_bot_tech;
                self.initial_page_should_retry = page.should_retry;
                self.initial_page_waf_check = page.waf_check;

                self.set_crawl_initial_status(&page, &links);

                // Let the user veto the crawl based on the first response; the
                // page is still emitted so subscribers observe the block.
                if let Some(ref cb) = self.on_should_crawl_callback {
                    if !cb.call(&page) {
                        page.blocked_crawl = true;
                        channel_send_page(&self.channel, page, &self.channel_guard);
                        return Default::default();
                    }
                }

                channel_send_page(&self.channel, page, &self.channel_guard);
            }
        }

        links
    }
4095
    /// Establish the crawl in smart mode: fetch over plain HTTP first and fall
    /// back to chrome rendering during retries.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    pub async fn crawl_establish_smart(
        &mut self,
        client: &Client,
        mut base: &mut RelativeSelectors,
        browser: &crate::features::chrome::OnceBrowser,
    ) -> HashSet<CaseInsensitiveString> {
        // Seeded crawls skip the establish fetch entirely.
        if self.skip_initial {
            return Default::default();
        }

        let links: HashSet<CaseInsensitiveString> = if self
            .is_allowed_default(&self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            let url = self.url.inner();

            // Use a pre-seeded page when one was supplied instead of fetching.
            let mut page = if let Some(seeded_page) = self.build_seed_page() {
                seeded_page
            } else {
                Page::new_page(&url, &client).await
            };

            let mut retry_count = self.configuration.retry;

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                // Honor any server-provided delay before the next attempt.
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }
                let client_error = page.status_code.is_client_error();

                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Bounded backoff: alternate chrome rendering (on
                    // power-of-two retry counts) with plain HTTP refetches.
                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        if retry_count.is_power_of_two() {
                            Website::render_chrome_page(
                                &self.configuration,
                                client,
                                &mut page,
                                url,
                                &self.domain_parsed,
                                browser,
                            )
                            .await;
                        } else {
                            let next_page = Page::new_page(url, &client).await;
                            page.clone_from(&next_page);
                        };
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elasped}");
                    }
                } else {
                    // 4xx responses also route to chrome rendering —
                    // presumably to get past bot checks; confirm intent.
                    if retry_count.is_power_of_two() || client_error {
                        Website::render_chrome_page(
                            &self.configuration,
                            client,
                            &mut page,
                            url,
                            &self.domain_parsed,
                            browser,
                        )
                        .await
                    } else {
                        page.clone_from(&Page::new_page(url, &client).await);
                    }
                }
            }

            // Smart link extraction (may render via chrome); also reports the
            // bytes transferred when rendering occurred.
            let (page_links, bytes_transferred): (HashSet<CaseInsensitiveString>, Option<f64>) =
                page.smart_links(
                    &base,
                    &self.configuration,
                    &self.domain_parsed,
                    &browser,
                    Some(&self.cookie_jar),
                )
                .await;

            // On a redirect, rebind the selectors/url to the final host.
            if let Some(domain) = &page.final_redirect_destination {
                let prior_domain = self.domain_parsed.take();
                crate::utils::modify_selectors(
                    &prior_domain,
                    domain,
                    &mut self.domain_parsed,
                    &mut self.url,
                    &mut base,
                    AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
                );
            }

            emit_log(&self.url.inner());

            // Record the page signature for duplicate-content tracking.
            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            // Mark the entry URL visited, routing through the find callback
            // when one is configured.
            self.insert_link(
                self.on_link_find_callback
                    .as_ref()
                    .map(|cb| cb(*self.url.clone(), None).0)
                    .unwrap_or_else(|| *self.url.clone()),
            )
            .await;

            let links = if !page_links.is_empty() {
                page_links
            } else {
                Default::default()
            };

            page.bytes_transferred = bytes_transferred;

            // Capture first-response diagnostics used by status reporting.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            if self.configuration.return_page_links {
                page.page_links = if links.is_empty() {
                    None
                } else {
                    Some(Box::new(links.clone()))
                };
            }

            // Let the user veto the crawl based on the first response; the
            // page is still emitted so subscribers observe the block.
            if let Some(cb) = &mut self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        };

        links
    }
4244
    /// Render `url` through a lazily-created chrome browser and replace `page`
    /// with the rendered result. Used by the smart-mode retry path.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    pub async fn render_chrome_page(
        config: &Configuration,
        client: &Client,
        page: &mut Page,
        url: &str,
        base: &Option<Box<Url>>,
        browser: &crate::features::chrome::OnceBrowser,
    ) {
        // Boot the browser only the first time rendering is actually needed.
        if let Some(browser_controller) = browser
            .get_or_init(|| crate::website::Website::setup_browser_base(&config, &base, None))
            .await
        {
            if let Ok(chrome_page) = crate::features::chrome::attempt_navigation(
                "about:blank",
                &browser_controller.browser.0,
                &config.request_timeout,
                &browser_controller.browser.2,
                &config.viewport,
            )
            .await
            {
                // Configure page events and request interception concurrently;
                // interception may return a background task handle.
                let (_, intercept_handle) = tokio::join!(
                    crate::features::chrome::setup_chrome_events(&chrome_page, &config),
                    crate::features::chrome::setup_chrome_interception_base(
                        &chrome_page,
                        config.chrome_intercept.enabled,
                        &config.auth_challenge_response,
                        config.chrome_intercept.block_visuals,
                        &url,
                    )
                );

                let next_page = Page::new(
                    &url,
                    &client,
                    &chrome_page,
                    &config.wait_for,
                    &config.screenshot,
                    false,
                    &config.openai_config,
                    &config.execution_scripts,
                    &config.automation_scripts,
                    &config.viewport,
                    &config.request_timeout,
                    &config.track_events,
                    config.referer.clone(),
                    config.max_page_bytes,
                    config.get_cache_options(),
                    &config.cache_policy,
                    &config.remote_multimodal,
                )
                .await;

                page.clone_from(&next_page);

                // Give the interception task a bounded window to drain, then
                // abort it so it cannot outlive this render.
                if let Some(h) = intercept_handle {
                    let abort_handle = h.abort_handle();
                    if let Err(elasped) =
                        tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                    {
                        log::warn!("Handler timeout exceeded {elasped}");
                        abort_handle.abort();
                    }
                }
            }
        }
    }
4314
4315 pub fn set_crawl_status(&mut self) {
4317 if self.status == CrawlStatus::Start || self.status == CrawlStatus::Active {
4318 self.status = if self.domain_parsed.is_none() {
4319 CrawlStatus::Invalid
4320 } else {
4321 CrawlStatus::Idle
4322 };
4323 }
4324 }
4325
4326 pub fn setup_semaphore(&self) -> Arc<Semaphore> {
4328 if self.configuration.shared_queue {
4329 SEM_SHARED.clone()
4330 } else {
4331 Arc::new(Semaphore::const_new(
4332 self.configuration
4333 .concurrency_limit
4334 .unwrap_or(*DEFAULT_PERMITS),
4335 ))
4336 }
4337 }
4338
4339 pub async fn crawl(&mut self) {
4341 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4342 self.start();
4343 let (client, handle) = self.setup().await;
4344 let (handle, join_handle) = match handle {
4345 Some(h) => (Some(h.0), Some(h.1)),
4346 _ => (None, None),
4347 };
4348 self.crawl_concurrent(&client, &handle).await;
4349 self.sitemap_crawl_chain(&client, &handle, false).await;
4350 self.set_crawl_status();
4351 if let Some(h) = join_handle {
4352 h.abort()
4353 }
4354 self.client.replace(client);
4355 }
4356 }
4357
4358 pub async fn crawl_sitemap(&mut self) {
4360 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4361 self.start();
4362 let (client, handle) = self.setup().await;
4363 let (handle, join_handle) = match handle {
4364 Some(h) => (Some(h.0), Some(h.1)),
4365 _ => (None, None),
4366 };
4367 self.sitemap_crawl(&client, &handle, false).await;
4368 self.set_crawl_status();
4369 if let Some(h) = join_handle {
4370 h.abort()
4371 }
4372 self.client.replace(client);
4373 }
4374 }
4375
4376 #[cfg(all(
4378 feature = "sitemap",
4379 feature = "chrome",
4380 not(feature = "decentralized")
4381 ))]
4382 pub async fn crawl_sitemap_chrome(&mut self) {
4383 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4384 self.start();
4385 let (client, handle) = self.setup().await;
4386 let (handle, join_handle) = match handle {
4387 Some(h) => (Some(h.0), Some(h.1)),
4388 _ => (None, None),
4389 };
4390 self.sitemap_crawl_chrome(&client, &handle, false).await;
4391 self.set_crawl_status();
4392 if let Some(h) = join_handle {
4393 h.abort()
4394 }
4395 self.client.replace(client);
4396 }
4397 }
4398
    /// Prepare the crawler for externally driven (send-style) crawls: mark it
    /// active, run start hooks, perform the async setup step, and freeze the
    /// allow-list configuration.
    pub async fn configure_setup(&mut self) {
        self.status = CrawlStatus::Active;
        self.start();
        // The client/handler returned by setup are not needed here.
        self.setup().await;
        self.configuration.configure_allowlist();
        self.send_configured = true;
    }
4407
    /// Prepare the crawler for externally driven crawls using only the
    /// synchronous base setup (per the name, no robots fetch is performed).
    pub fn configure_setup_norobots(&mut self) {
        self.status = CrawlStatus::Active;
        self.start();
        self.setup_base();
        self.configuration.configure_allowlist();
        self.send_configured = true;
    }
4417
4418 #[cfg(not(feature = "decentralized"))]
4419 pub async fn crawl_raw_send(&self, url: Option<&str>) {
4424 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4425 let (client, handle) = (
4426 match &self.client {
4427 Some(c) => c.to_owned(),
4428 _ => self.configure_http_client(),
4429 },
4430 self.configure_handler(),
4431 );
4432 let (handle, join_handle) = match handle {
4433 Some(h) => (Some(h.0), Some(h.1)),
4434 _ => (None, None),
4435 };
4436 self.crawl_concurrent_raw_send(&client, &handle, &url).await;
4437 if let Some(h) = join_handle {
4438 h.abort()
4439 }
4440 }
4441 }
4442
4443 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4444 pub async fn crawl_chrome_send(&self, url: Option<&str>) {
4448 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4449 let (client, handle) = (
4450 match &self.client {
4451 Some(c) => c.to_owned(),
4452 _ => self.configure_http_client(),
4453 },
4454 self.configure_handler(),
4455 );
4456 let (handle, join_handle) = match handle {
4457 Some(h) => (Some(h.0), Some(h.1)),
4458 _ => (None, None),
4459 };
4460 self.crawl_concurrent_send(&client, &handle, &url).await;
4461 if let Some(h) = join_handle {
4462 h.abort()
4463 }
4464 }
4465 }
4466
4467 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4468 pub async fn fetch_chrome(&self, url: Option<&str>) {
4470 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4471 let (client, handle) = (
4472 match &self.client {
4473 Some(c) => c.to_owned(),
4474 _ => self.configure_http_client(),
4475 },
4476 self.configure_handler(),
4477 );
4478 let (_handle, join_handle) = match handle {
4479 Some(h) => (Some(h.0), Some(h.1)),
4480 _ => (None, None),
4481 };
4482 self._fetch_chrome(&client, &url).await;
4483 if let Some(h) = join_handle {
4484 h.abort()
4485 }
4486 }
4487 }
4488
4489 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4490 pub async fn fetch_chrome_persisted(
4492 &self,
4493 url: Option<&str>,
4494 browser: &crate::features::chrome::BrowserController,
4495 ) {
4496 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4497 let (client, handle) = (
4498 match &self.client {
4499 Some(c) => c.to_owned(),
4500 _ => self.configure_http_client(),
4501 },
4502 self.configure_handler(),
4503 );
4504 let (_handle, join_handle) = match handle {
4505 Some(h) => (Some(h.0), Some(h.1)),
4506 _ => (None, None),
4507 };
4508 self._fetch_chrome_persisted(&client, &url, &browser).await;
4509 if let Some(h) = join_handle {
4510 h.abort()
4511 }
4512 }
4513 }
4514
    /// Smart crawl (decentralized + smart build): delegates to `crawl`.
    #[cfg(all(feature = "decentralized", feature = "smart"))]
    pub async fn crawl_smart(&mut self) {
        self.crawl().await;
    }
4520
    /// Smart crawl (decentralized build without `smart`): delegates to `crawl`.
    #[cfg(all(feature = "decentralized", not(feature = "smart")))]
    pub async fn crawl_smart(&mut self) {
        self.crawl().await;
    }
4526
4527 #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
4528 pub async fn crawl_smart(&mut self) {
4530 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4531 self.start();
4532 let (client, handle) = self.setup().await;
4533 let (handle, join_handle) = match handle {
4534 Some(h) => (Some(h.0), Some(h.1)),
4535 _ => (None, None),
4536 };
4537 self.crawl_concurrent_smart(&client, &handle).await;
4538 self.set_crawl_status();
4539 if let Some(h) = join_handle {
4540 h.abort()
4541 }
4542 self.client.replace(client);
4543 }
4544 }
4545
    /// Smart crawl fallback: without the `smart` feature this simply
    /// delegates to `crawl`.
    #[cfg(all(not(feature = "decentralized"), not(feature = "smart")))]
    pub async fn crawl_smart(&mut self) {
        self.crawl().await
    }
4551
4552 pub async fn crawl_raw(&mut self) {
4554 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4555 self.start();
4556 let (client, handle) = self.setup().await;
4557 let (handle, join_handle) = match handle {
4558 Some(h) => (Some(h.0), Some(h.1)),
4559 _ => (None, None),
4560 };
4561 self.crawl_concurrent_raw(&client, &handle).await;
4562 self.sitemap_crawl_chain(&client, &handle, false).await;
4563 self.set_crawl_status();
4564 if let Some(h) = join_handle {
4565 h.abort()
4566 }
4567 self.client.replace(client);
4568 }
4569 }
4570
4571 pub async fn scrape(&mut self) {
4573 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4574 let mut w = self.clone();
4575 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4576
4577 if self.pages.is_none() {
4578 self.pages = Some(Vec::new());
4579 }
4580
4581 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4583
4584 let crawl = async move {
4585 w.crawl().await;
4586 w.unsubscribe();
4587 let _ = done_tx.send(());
4589 };
4590
4591 let sub = async {
4592 loop {
4593 tokio::select! {
4594 biased;
4595 _ = &mut done_rx => {
4597 break;
4598 }
4599 result = rx2.recv() => {
4600 if let Ok(page) = result {
4601 if let Some(sid) = page.signature {
4602 self.insert_signature(sid).await;
4603 }
4604 self.insert_link(page.get_url().into()).await;
4605 if let Some(p) = self.pages.as_mut() {
4606 p.push(page);
4607 }
4608 } else {
4609 break;
4610 }
4611 }
4612 }
4613 }
4614 };
4615
4616 tokio::join!(sub, crawl);
4617 self.unsubscribe();
4619 }
4620 }
4621
4622 pub async fn scrape_raw(&mut self) {
4624 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4625 let mut w = self.clone();
4626 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4627
4628 if self.pages.is_none() {
4629 self.pages = Some(Vec::new());
4630 }
4631
4632 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4633
4634 let crawl = async move {
4635 w.crawl_raw().await;
4636 w.unsubscribe();
4637 let _ = done_tx.send(());
4638 };
4639
4640 let sub = async {
4641 loop {
4642 tokio::select! {
4643 biased;
4644 _ = &mut done_rx => break,
4645 result = rx2.recv() => {
4646 if let Ok(page) = result {
4647 if let Some(sid) = page.signature {
4648 self.insert_signature(sid).await;
4649 }
4650 self.insert_link(page.get_url().into()).await;
4651 if let Some(p) = self.pages.as_mut() {
4652 p.push(page);
4653 }
4654 } else {
4655 break;
4656 }
4657 }
4658 }
4659 }
4660 };
4661
4662 tokio::join!(sub, crawl);
4663 self.unsubscribe();
4664 }
4665 }
4666
4667 pub async fn scrape_smart(&mut self) {
4669 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4670 let mut w = self.clone();
4671 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4672
4673 if self.pages.is_none() {
4674 self.pages = Some(Vec::new());
4675 }
4676
4677 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4678
4679 let crawl = async move {
4680 w.crawl_smart().await;
4681 w.unsubscribe();
4682 let _ = done_tx.send(());
4683 };
4684
4685 let sub = async {
4686 loop {
4687 tokio::select! {
4688 biased;
4689 _ = &mut done_rx => break,
4690 result = rx2.recv() => {
4691 if let Ok(page) = result {
4692 if let Some(sid) = page.signature {
4693 self.insert_signature(sid).await;
4694 }
4695 self.insert_link(page.get_url().into()).await;
4696 if let Some(p) = self.pages.as_mut() {
4697 p.push(page);
4698 }
4699 } else {
4700 break;
4701 }
4702 }
4703 }
4704 }
4705 };
4706
4707 tokio::join!(sub, crawl);
4708 self.unsubscribe();
4709 }
4710 }
4711
4712 pub async fn scrape_sitemap(&mut self) {
4714 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4715 let mut w = self.clone();
4716 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4717
4718 if self.pages.is_none() {
4719 self.pages = Some(Vec::new());
4720 }
4721
4722 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4723
4724 let crawl = async move {
4725 w.crawl_sitemap().await;
4726 w.unsubscribe();
4727 let _ = done_tx.send(());
4728 };
4729
4730 let sub = async {
4731 loop {
4732 tokio::select! {
4733 biased;
4734 _ = &mut done_rx => break,
4735 result = rx2.recv() => {
4736 if let Ok(page) = result {
4737 if let Some(sid) = page.signature {
4738 self.insert_signature(sid).await;
4739 }
4740 self.insert_link(page.get_url().into()).await;
4741 if let Some(p) = self.pages.as_mut() {
4742 p.push(page);
4743 }
4744 } else {
4745 break;
4746 }
4747 }
4748 }
4749 }
4750 };
4751
4752 tokio::join!(sub, crawl);
4753 self.unsubscribe();
4754 }
4755 }
4756
4757 async fn dequeue(
4759 &mut self,
4760 q: &mut Option<tokio::sync::broadcast::Receiver<String>>,
4761 links: &mut HashSet<CaseInsensitiveString>,
4762 exceeded_budget: &mut bool,
4763 ) {
4764 if let Some(q) = q {
4765 while let Ok(link) = q.try_recv() {
4766 let s = link.into();
4767 let allowed = self.is_allowed_budgetless(&s);
4768
4769 if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
4770 *exceeded_budget = true;
4771 break;
4772 }
4773
4774 if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&s).await {
4775 continue;
4776 }
4777
4778 self.links_visited.extend_with_new_links(links, s);
4779 }
4780 }
4781 }
4782
4783 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
4785 async fn crawl_concurrent_raw(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
4786 self.start();
4787 self.status = CrawlStatus::Active;
4788 let client_rotator = self.client_rotator.clone();
4789 let mut selector: (
4790 CompactString,
4791 smallvec::SmallVec<[CompactString; 2]>,
4792 CompactString,
4793 ) = self.setup_selectors();
4794 if self.single_page() {
4795 self._crawl_establish(client, &mut selector, false).await;
4796 } else {
4797 let on_should_crawl_callback = self.on_should_crawl_callback.clone();
4798 let full_resources = self.configuration.full_resources;
4799 let return_page_links = self.configuration.return_page_links;
4800 let only_html = self.configuration.only_html && !full_resources;
4801 let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
4802
4803 let (mut interval, throttle) = self.setup_crawl();
4804
4805 let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();
4806
4807 links.extend(self._crawl_establish(client, &mut selector, false).await);
4808
4809 self.configuration.configure_allowlist();
4810
4811 let semaphore = self.setup_semaphore();
4812
4813 let shared = Arc::new((
4814 client.to_owned(),
4815 selector,
4816 self.channel.clone(),
4817 self.configuration.external_domains_caseless.clone(),
4818 self.channel_guard.clone(),
4819 self.configuration.retry,
4820 self.configuration.full_resources,
4821 PageLinkBuildSettings::new_full(
4822 false,
4823 self.configuration.full_resources,
4824 self.configuration.subdomains,
4825 self.configuration.tld,
4826 self.configuration.normalize,
4827 ),
4828 self.domain_parsed.clone(),
4829 self.on_link_find_callback.clone(),
4830 self.configuration.remote_multimodal.clone(),
4831 ));
4832
4833 let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();
4834
4835 let mut exceeded_budget = false;
4837 let concurrency = throttle.is_zero();
4838
4839 self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
4840
4841 if !concurrency && !links.is_empty() {
4842 tokio::time::sleep(*throttle).await;
4843 }
4844
4845 let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
4846 Some(Instant::now())
4847 } else {
4848 None
4849 };
4850
4851 'outer: loop {
4852 let mut stream =
4853 tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());
4854
4855 loop {
4856 if !concurrency {
4857 tokio::time::sleep(*throttle).await;
4858 }
4859
4860 let semaphore =
4861 get_semaphore(&semaphore, !self.configuration.shared_queue).await;
4862
4863 tokio::select! {
4864 biased;
4865 Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
4866 if !self.handle_process(handle, &mut interval, async {
4867 emit_log_shutdown(link.inner());
4868 let permits = set.len();
4869 set.shutdown().await;
4870 semaphore.add_permits(permits);
4871 }).await {
4872 while let Some(links) = stream.next().await {
4873 self.extra_links.insert(links);
4874 }
4875 break 'outer;
4876 }
4877 let allowed = self.is_allowed(&link);
4878
4879 if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
4880 exceeded_budget = true;
4881 break;
4882 }
4883
4884 if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
4885 continue;
4886 }
4887
4888 emit_log(link.inner());
4889
4890 self.insert_link(link.clone()).await;
4891
4892 if let Ok(permit) = semaphore.clone().acquire_owned().await {
4893 let shared = shared.clone();
4894 let on_should_crawl_callback = on_should_crawl_callback.clone();
4895 let rotator = client_rotator.clone();
4896 spawn_set("page_fetch", &mut set, async move {
4897 let link_result = match &shared.9 {
4898 Some(cb) => cb(link, None),
4899 _ => (link, None),
4900 };
4901
4902 let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
4903 let mut links_pages = if return_page_links {
4904 Some(links.clone())
4905 } else {
4906 None
4907 };
4908 let mut relative_selectors = shared.1.clone();
4909 let mut r_settings = shared.7;
4910 r_settings.ssg_build = true;
4911 let target_url = link_result.0.as_ref();
4912 let external_domains_caseless = &shared.3;
4913 let client = match &rotator {
4914 Some(r) => r.next(),
4915 None => &shared.0,
4916 };
4917
4918 let mut domain_parsed = None;
4919
4920 let mut page = Page::new_page_streaming(
4921 target_url,
4922 client, only_html,
4923 &mut relative_selectors,
4924 external_domains_caseless,
4925 &r_settings,
4926 &mut links,
4927 None,
4928 &shared.8,
4929 &mut domain_parsed,
4930 &mut links_pages).await;
4931
4932 let mut retry_count = shared.5;
4933
4934 while page.should_retry && retry_count > 0 {
4935 retry_count -= 1;
4936
4937 if let Some(timeout) = page.get_timeout() {
4938 tokio::time::sleep(timeout).await;
4939 }
4940
4941 let retry_client = match &rotator {
4942 Some(r) => r.next(),
4943 None => &shared.0,
4944 };
4945
4946 if page.status_code == StatusCode::GATEWAY_TIMEOUT {
4947 if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
4948 let mut domain_parsed = None;
4949 let next_page = Page::new_page_streaming(
4950 target_url,
4951 retry_client, only_html,
4952 &mut relative_selectors.clone(),
4953 external_domains_caseless,
4954 &r_settings,
4955 &mut links,
4956 None,
4957 &shared.8,
4958 &mut domain_parsed,
4959 &mut links_pages).await;
4960
4961 page.clone_from(&next_page);
4962
4963 }).await
4964 {
4965 log::warn!("Handler timeout exceeded {elasped}");
4966 }
4967
4968 } else {
4969 page.clone_from(&Page::new_page_streaming(
4970 target_url,
4971 retry_client,
4972 only_html,
4973 &mut relative_selectors.clone(),
4974 external_domains_caseless,
4975 &r_settings,
4976 &mut links,
4977 None,
4978 &shared.8,
4979 &mut domain_parsed,
4980 &mut links_pages).await);
4981 }
4982 }
4983
4984 if return_page_links {
4985 page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
4986 }
4987
4988 #[cfg(all(feature = "agent", feature = "serde"))]
4990 if shared.10.is_some() {
4991 let html = page.get_html();
4992 if !html.is_empty() {
4993 use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
4994 let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
4995 if let Ok(Some(result)) = run_remote_multimodal_extraction(
4996 &shared.10,
4997 &html,
4998 target_url,
4999 title,
5000 ).await {
5001 match page.remote_multimodal_usage.as_mut() {
5003 Some(v) => v.push(result.usage.clone()),
5004 None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
5005 }
5006 if result.extracted.is_some() || result.screenshot.is_some() {
5008 let automation_result = result.to_automation_results();
5009 match page.extra_remote_multimodal_data.as_mut() {
5010 Some(v) => v.push(automation_result),
5011 None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
5012 }
5013 }
5014 }
5015 }
5016 }
5017
5018 if let Some(ref cb) = on_should_crawl_callback {
5019 if !cb.call(&page) {
5020 page.blocked_crawl = true;
5021 channel_send_page(&shared.2, page, &shared.4);
5022 drop(permit);
5023 return Default::default()
5024 }
5025 }
5026
5027 let signature = page.signature;
5028
5029 channel_send_page(&shared.2, page, &shared.4);
5030
5031 drop(permit);
5032
5033 (links, signature)
5034 });
5035 }
5036
5037 self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
5038 },
5039 Some(result) = set.join_next(), if !set.is_empty() => {
5040 if let Ok(res) = result {
5041 match res.1 {
5042 Some(signature) => {
5043 if self.is_signature_allowed(signature).await {
5044 self.insert_signature(signature).await;
5045 self.links_visited.extend_links(&mut links, res.0);
5046 }
5047 }
5048 _ => {
5049 self.links_visited.extend_links(&mut links, res.0);
5050 }
5051 }
5052 } else {
5053 break;
5054 }
5055 }
5056 else => break,
5057 }
5058
5059 self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
5060
5061 if links.is_empty() && set.is_empty() || exceeded_budget {
5062 if exceeded_budget {
5064 while let Some(links) = stream.next().await {
5065 self.extra_links.insert(links);
5066 }
5067 while let Some(links) = set.join_next().await {
5068 if let Ok(links) = links {
5069 self.extra_links.extend(links.0);
5070 }
5071 }
5072 }
5073 break 'outer;
5074 }
5075 }
5076
5077 self.subscription_guard().await;
5078 self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
5079
5080 if links.is_empty() && set.is_empty() {
5081 break;
5082 }
5083 }
5084
5085 if !links.is_empty() {
5087 self.extra_links.extend(links);
5088 }
5089 }
5090 }
5091
    /// Concurrent crawl that renders every discovered link in a headless
    /// Chrome page.
    ///
    /// Flow: launch a browser via `setup_browser`, validate it by opening
    /// `about:blank`, establish the crawl from the start page, then loop:
    /// drain `links` through a biased `tokio::select!` that (a) spawns one
    /// semaphore-gated navigation task per link into a `JoinSet` and
    /// (b) harvests finished tasks, merging the links they discovered back
    /// into the frontier. Honors the throttle delay, the optional crawl
    /// timeout, the link budget, and the external shutdown `handle`.
    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        use crate::features::chrome::attempt_navigation;
        self.start();

        match self.setup_browser().await {
            Some(mut b) => {
                // Open an initial blank page to verify the browser connection
                // before doing any real work.
                match attempt_navigation(
                    "about:blank",
                    &b.browser.0,
                    &self.configuration.request_timeout,
                    &b.browser.2,
                    &self.configuration.viewport,
                )
                .await
                {
                    Ok(new_page) => {
                        let mut selectors = self.setup_selectors();
                        self.status = CrawlStatus::Active;

                        if self.single_page() {
                            // Single-page mode: fetch only the start url, then tear down.
                            self.crawl_establish(&client, &mut selectors, false, &new_page)
                                .await;
                            drop(new_page);
                            self.subscription_guard().await;
                            b.dispose();
                        } else {
                            let semaphore: Arc<Semaphore> = self.setup_semaphore();
                            let (mut interval, throttle) = self.setup_crawl();

                            // Optional broadcast queue feeding extra links into the crawl.
                            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

                            let base_links = self
                                .crawl_establish(&client, &mut selectors, false, &new_page)
                                .await;

                            drop(new_page);

                            // Seed the frontier with any queued extra links plus the
                            // links found on the start page.
                            let mut links: HashSet<CaseInsensitiveString> =
                                self.drain_extra_links().collect();

                            links.extend(base_links);

                            self.configuration.configure_allowlist();

                            // Each task returns (links discovered, optional page signature).
                            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
                                JoinSet::new();

                            // State shared with every spawned fetch task. Tuple index map:
                            // 0 = client, 1 = selectors, 2 = subscription channel,
                            // 3 = external_domains_caseless, 4 = channel_guard,
                            // 5 = browser handle, 6 = configuration, 7 = start url string,
                            // 8 = browser context (b.browser.2), 9 = domain_parsed,
                            // 10 = on_link_find_callback.
                            let shared = Arc::new((
                                client.to_owned(),
                                selectors,
                                self.channel.clone(),
                                self.configuration.external_domains_caseless.clone(),
                                self.channel_guard.clone(),
                                b.browser.0.clone(),
                                self.configuration.clone(),
                                self.url.inner().to_string(),
                                b.browser.2.clone(),
                                self.domain_parsed.clone(),
                                self.on_link_find_callback.clone(),
                            ));

                            let add_external = shared.3.len() > 0;
                            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
                            let full_resources = self.configuration.full_resources;
                            let return_page_links = self.configuration.return_page_links;
                            let mut exceeded_budget = false;
                            // A zero throttle means run fully concurrent (no sleeps).
                            let concurrency = throttle.is_zero();

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                            if !concurrency && !links.is_empty() {
                                tokio::time::sleep(*throttle).await;
                            }

                            // Only track elapsed time when a crawl timeout is configured.
                            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                                Some(Instant::now())
                            } else {
                                None
                            };

                            'outer: loop {
                                // Snapshot the current frontier into a stream; new links
                                // accumulate in `links` for the next 'outer iteration.
                                let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
                                    links.drain().collect(),
                                );

                                loop {
                                    if !concurrency {
                                        tokio::time::sleep(*throttle).await;
                                    }

                                    let semaphore =
                                        get_semaphore(&semaphore, !self.configuration.shared_queue)
                                            .await;

                                    // biased: prefer pulling new links while permits and
                                    // crawl-time budget remain, otherwise harvest results.
                                    tokio::select! {
                                        biased;
                                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                                            if !self
                                                .handle_process(
                                                    handle,
                                                    &mut interval,
                                                    async {
                                                        // Shutdown path: abort in-flight tasks and
                                                        // return their permits to the semaphore.
                                                        emit_log_shutdown(&link.inner());
                                                        let permits = set.len();
                                                        set.shutdown().await;
                                                        semaphore.add_permits(permits);
                                                    },
                                                )
                                                .await
                                            {
                                                break 'outer;
                                            }

                                            let allowed = self.is_allowed(&link);

                                            if allowed
                                                .eq(&ProcessLinkStatus::BudgetExceeded)
                                            {
                                                exceeded_budget = true;
                                                break;
                                            }
                                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                                continue;
                                            }

                                            emit_log(&link.inner());

                                            self.insert_link(link.clone()).await;

                                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                                let shared = shared.clone();
                                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                                spawn_set("page_fetch", &mut set, async move {
                                                    // Each task opens its own blank page in the shared browser.
                                                    let results = match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8, &shared.6.viewport).await {
                                                        Ok(new_page) => {
                                                            // Configure page events and (optionally) request interception concurrently.
                                                            let (_, intercept_handle) = tokio::join!(
                                                                crate::features::chrome::setup_chrome_events(&new_page, &shared.6),
                                                                crate::features::chrome::setup_chrome_interception_base(
                                                                    &new_page,
                                                                    shared.6.chrome_intercept.enabled,
                                                                    &shared.6.auth_challenge_response,
                                                                    shared.6.chrome_intercept.block_visuals,
                                                                    &shared.7,
                                                                )
                                                            );

                                                            // Give the link-find callback a chance to rewrite the url.
                                                            let link_result =
                                                                match &shared.10 {
                                                                    Some(cb) => cb(link, None),
                                                                    _ => (link, None),
                                                                };

                                                            let target_url = link_result.0.as_ref();

                                                            let mut page = Page::new(
                                                                &target_url,
                                                                &shared.0,
                                                                &new_page,
                                                                &shared.6.wait_for,
                                                                &shared.6.screenshot,
                                                                false,
                                                                &shared.6.openai_config,
                                                                &shared.6.execution_scripts,
                                                                &shared.6.automation_scripts,
                                                                &shared.6.viewport,
                                                                &shared.6.request_timeout,
                                                                &shared.6.track_events,
                                                                shared.6.referer.clone(),
                                                                shared.6.max_page_bytes,
                                                                shared.6.get_cache_options(),
                                                                &shared.6.cache_policy,
                                                                &shared.6.remote_multimodal,
                                                            )
                                                            .await;

                                                            let mut retry_count = shared.6.retry;

                                                            // Retry loop: gateway timeouts get a bounded backoff
                                                            // window, other retryable failures refetch directly.
                                                            while page.should_retry && retry_count > 0 {
                                                                retry_count -= 1;
                                                                if let Some(timeout) = page.get_timeout() {
                                                                    tokio::time::sleep(timeout).await;
                                                                }
                                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                                                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                                        let p = Page::new(
                                                                            &target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        ).await;
                                                                        page.clone_from(&p);

                                                                    }).await {
                                                                        log::info!("{target_url} backoff gateway timeout exceeded {elasped}");
                                                                    }
                                                                } else {
                                                                    page.clone_from(
                                                                        &Page::new(
                                                                            &target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        )
                                                                        .await,
                                                                    );
                                                                }
                                                            }

                                                            // Wait (max 10s) for the interception task, aborting it on timeout.
                                                            if let Some(h) = intercept_handle {
                                                                let abort_handle = h.abort_handle();
                                                                if let Err(elasped) = tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await {
                                                                    log::warn!("Handler timeout exceeded {elasped}");
                                                                    abort_handle.abort();
                                                                }
                                                            }

                                                            if add_external {
                                                                page.set_external(shared.3.clone());
                                                            }

                                                            // Temporarily swap in the crawl's parsed domain for link extraction.
                                                            let prev_domain = page.base;

                                                            page.base = shared.9.as_deref().cloned();

                                                            if return_page_links {
                                                                page.page_links = Some(Default::default());
                                                            }

                                                            let links = if full_resources {
                                                                page.links_full(&shared.1, &shared.9).await
                                                            } else {
                                                                page.links(&shared.1, &shared.9).await
                                                            };

                                                            page.base = prev_domain;

                                                            // Content-hash signature used for duplicate detection.
                                                            if shared.6.normalize {
                                                                page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
                                                            }

                                                            // User veto: still publish the page, but contribute no links.
                                                            if let Some(ref cb) = on_should_crawl_callback {
                                                                if !cb.call(&page) {
                                                                    page.blocked_crawl = true;
                                                                    channel_send_page(&shared.2, page, &shared.4);
                                                                    drop(permit);
                                                                    return Default::default()
                                                                }
                                                            }

                                                            let signature = page.signature;

                                                            channel_send_page(
                                                                &shared.2, page, &shared.4,
                                                            );

                                                            (links, signature)
                                                        }
                                                        _ => Default::default(),
                                                    };


                                                    drop(permit);

                                                    results
                                                });
                                            }

                                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                        }
                                        Some(result) = set.join_next(), if !set.is_empty() => {
                                            if let Ok(res) = result {
                                                match res.1 {
                                                    // Only keep links from pages whose signature is new.
                                                    Some(signature) => {
                                                        if self.is_signature_allowed(signature).await {
                                                            self.insert_signature(signature).await;
                                                            self.links_visited.extend_links(&mut links, res.0);
                                                        }
                                                    }
                                                    _ => {
                                                        self.links_visited.extend_links(&mut links, res.0);
                                                    }
                                                }
                                            } else{
                                                break
                                            }
                                        }
                                        else => break,
                                    };

                                    if links.is_empty() && set.is_empty() || exceeded_budget {
                                        if exceeded_budget {
                                            // Drain remaining tasks before stopping.
                                            while set.join_next().await.is_some() {}
                                        }
                                        break 'outer;
                                    }
                                }

                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                                if links.is_empty() && set.is_empty() {
                                    break;
                                }
                            }

                            self.subscription_guard().await;
                            b.dispose();
                            // Preserve any unvisited frontier for a later crawl.
                            if !links.is_empty() {
                                self.extra_links.extend(links);
                            }
                        }
                    }
                    Err(err) => {
                        b.dispose();
                        log::error!("{}", err)
                    }
                }
            }
            _ => log::error!("Chrome initialization failed."),
        }
    }
5442
    /// HTTP-only (non-Chrome) concurrent crawl for the "send" flow: operates
    /// on a clone of `self` and returns that clone, leaving `self` untouched.
    ///
    /// An optional `url` override re-targets the clone before crawling (kept
    /// url-only when it shares the already-parsed domain). Pages are fetched
    /// with `Page::new_page_streaming`, optionally through a rotating client
    /// pool, with budget / throttle / crawl-timeout / shutdown handling that
    /// mirrors the other concurrent crawl variants.
    #[cfg_attr(
        all(feature = "tracing", not(feature = "decentralized")),
        tracing::instrument(skip_all)
    )]
    async fn crawl_concurrent_raw_send(
        &self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        url: &Option<&str>,
    ) -> Website {
        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        let mut website = self.clone();

        if let Some(u) = url {
            match &website.domain_parsed {
                Some(domain_url) => {
                    // Same domain: keep the existing parsed state, swap only the url.
                    if domain_url.as_str().starts_with(u) {
                        website.set_url_only(u);
                    } else {
                        website.set_url(u);
                    }
                }
                _ => {
                    website.set_url(u);
                }
            }
        }

        if !website.send_configured {
            website.configure_setup().await;
        }

        if self.single_page() {
            website._crawl_establish(client, &mut selector, false).await;
            website
        } else {
            let client_rotator = self.client_rotator.clone();
            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
            let full_resources = self.configuration.full_resources;
            let return_page_links = self.configuration.return_page_links;
            let only_html = self.configuration.only_html && !full_resources;
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

            let (mut interval, throttle) = self.setup_crawl();

            // Seed the frontier with extra links plus those from the start page.
            let mut links: HashSet<CaseInsensitiveString> = website.drain_extra_links().collect();

            links.extend(website._crawl_establish(client, &mut selector, false).await);

            let semaphore = self.setup_semaphore();

            // State shared with every spawned fetch task. Tuple index map:
            // 0 = client, 1 = selector, 2 = subscription channel,
            // 3 = external_domains_caseless, 4 = channel_guard, 5 = retry count,
            // 6 = full_resources, 7 = PageLinkBuildSettings, 8 = domain_parsed,
            // 9 = on_link_find_callback, 10 = remote_multimodal config.
            let shared = Arc::new((
                client.to_owned(),
                selector,
                self.channel.clone(),
                self.configuration.external_domains_caseless.clone(),
                self.channel_guard.clone(),
                self.configuration.retry,
                self.configuration.full_resources,
                PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                self.domain_parsed.clone(),
                self.on_link_find_callback.clone(),
                self.configuration.remote_multimodal.clone(),
            ));

            // Each task returns (links discovered, optional page signature).
            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

            let mut exceeded_budget = false;
            // A zero throttle means run fully concurrent (no sleeps).
            let concurrency = throttle.is_zero();

            website
                .dequeue(&mut q, &mut links, &mut exceeded_budget)
                .await;

            if !concurrency && !links.is_empty() {
                tokio::time::sleep(*throttle).await;
            }

            // Only track elapsed time when a crawl timeout is configured.
            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                Some(Instant::now())
            } else {
                None
            };

            'outer: loop {
                // Snapshot the current frontier; new links accumulate in `links`.
                let mut stream =
                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

                loop {
                    if !concurrency {
                        tokio::time::sleep(*throttle).await;
                    }

                    let semaphore =
                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                    // biased: prefer new links while permits and crawl time remain.
                    tokio::select! {
                        biased;
                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            if !self.handle_process(handle, &mut interval, async {
                                // Shutdown path: abort tasks and return their permits.
                                emit_log_shutdown(link.inner());
                                let permits = set.len();
                                set.shutdown().await;
                                semaphore.add_permits(permits);
                            }).await {
                                break 'outer;
                            }
                            let allowed = website.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                exceeded_budget = true;
                                break;
                            }

                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                continue;
                            }

                            emit_log(link.inner());

                            website.insert_link(link.clone()).await;

                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                let shared = shared.clone();
                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                let rotator = client_rotator.clone();
                                spawn_set("page_fetch", &mut set, async move {
                                    // Give the link-find callback a chance to rewrite the url.
                                    let link_result = match &shared.9 {
                                        Some(cb) => cb(link, None),
                                        _ => (link, None),
                                    };

                                    let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                    let mut links_pages = if return_page_links {
                                        Some(links.clone())
                                    } else {
                                        None
                                    };
                                    let mut relative_selectors = shared.1.clone();
                                    let mut r_settings = shared.7;
                                    r_settings.ssg_build = true;
                                    let target_url = link_result.0.as_ref();
                                    let external_domains_caseless = &shared.3;
                                    // Pick the next client from the rotator when available.
                                    let client = match &rotator {
                                        Some(r) => r.next(),
                                        None => &shared.0,
                                    };

                                    let mut domain_parsed = None;

                                    let mut page = Page::new_page_streaming(
                                        target_url,
                                        client, only_html,
                                        &mut relative_selectors,
                                        external_domains_caseless,
                                        &r_settings,
                                        &mut links,
                                        None,
                                        &shared.8,
                                        &mut domain_parsed,
                                        &mut links_pages).await;

                                    let mut retry_count = shared.5;

                                    // Retry loop: gateway timeouts get a bounded backoff
                                    // window, other retryable failures refetch directly.
                                    while page.should_retry && retry_count > 0 {
                                        retry_count -= 1;

                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }

                                        let retry_client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };

                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                let mut domain_parsed = None;
                                                let next_page = Page::new_page_streaming(
                                                    target_url,
                                                    retry_client, only_html,
                                                    &mut relative_selectors.clone(),
                                                    external_domains_caseless,
                                                    &r_settings,
                                                    &mut links,
                                                    None,
                                                    &shared.8,
                                                    &mut domain_parsed,
                                                    &mut links_pages).await;

                                                page.clone_from(&next_page);

                                            }).await
                                            {
                                                log::warn!("Handler timeout exceeded {elasped}");
                                            }

                                        } else {
                                            page.clone_from(&Page::new_page_streaming(
                                                target_url,
                                                retry_client,
                                                only_html,
                                                &mut relative_selectors.clone(),
                                                external_domains_caseless,
                                                &r_settings,
                                                &mut links,
                                                None,
                                                &shared.8,
                                                &mut domain_parsed,
                                                &mut links_pages).await);
                                        }
                                    }

                                    if return_page_links {
                                        page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
                                    }

                                    // Optional remote multimodal extraction over the fetched HTML,
                                    // accumulating usage and extraction results onto the page.
                                    #[cfg(all(feature = "agent", feature = "serde"))]
                                    if shared.10.is_some() {
                                        let html = page.get_html();
                                        if !html.is_empty() {
                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
                                                &shared.10,
                                                &html,
                                                target_url,
                                                title,
                                            ).await {
                                                match page.remote_multimodal_usage.as_mut() {
                                                    Some(v) => v.push(result.usage.clone()),
                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
                                                }
                                                if result.extracted.is_some() || result.screenshot.is_some() {
                                                    let automation_result = result.to_automation_results();
                                                    match page.extra_remote_multimodal_data.as_mut() {
                                                        Some(v) => v.push(automation_result),
                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    // User veto: still publish the page, but contribute no links.
                                    if let Some(ref cb) = on_should_crawl_callback {
                                        if !cb.call(&page) {
                                            page.blocked_crawl = true;
                                            channel_send_page(&shared.2, page, &shared.4);
                                            drop(permit);
                                            return Default::default()
                                        }
                                    }

                                    let signature = page.signature;

                                    channel_send_page(&shared.2, page, &shared.4);

                                    drop(permit);

                                    (links, signature)
                                });
                            }

                            website.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        },
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res.1 {
                                    // Only keep links from pages whose signature is new.
                                    Some(signature) => {
                                        if website.is_signature_allowed(signature).await {
                                            website.insert_signature(signature).await;
                                            website.links_visited.extend_links(&mut links, res.0);
                                        }
                                    }
                                    _ => {
                                        website.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                            } else {
                                break;
                            }
                        }
                        else => break,
                    }

                    website
                        .dequeue(&mut q, &mut links, &mut exceeded_budget)
                        .await;

                    if links.is_empty() && set.is_empty() || exceeded_budget {
                        if exceeded_budget {
                            // Drain remaining tasks before stopping.
                            while set.join_next().await.is_some() {}
                        }
                        break 'outer;
                    }
                }

                website.subscription_guard().await;
                website
                    .dequeue(&mut q, &mut links, &mut exceeded_budget)
                    .await;

                if links.is_empty() && set.is_empty() {
                    break;
                }
            }
            website
        }
    }
5770
    /// Chrome-rendered variant of the "send" crawl: clones `self` into a new
    /// `Website`, optionally re-targets the clone at `url`, crawls it with a
    /// browser page per link, and returns the clone. Failure to launch or
    /// navigate the browser returns `self.clone()` untouched.
    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent_send(
        &self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        url: &Option<&str>,
    ) -> Website {
        use crate::features::chrome::attempt_navigation;

        match self.setup_browser().await {
            Some(mut b) => {
                // Validate the browser connection with an initial blank page.
                match attempt_navigation(
                    "about:blank",
                    &b.browser.0,
                    &self.configuration.request_timeout,
                    &b.browser.2,
                    &self.configuration.viewport,
                )
                .await
                {
                    Ok(new_page) => {
                        let mut selectors = self.setup_selectors();
                        let mut website = self.to_owned();

                        if let Some(u) = url {
                            match &website.domain_parsed {
                                Some(domain_url) => {
                                    // Same domain: keep parsed state, swap only the url.
                                    if domain_url.as_str().starts_with(u) {
                                        website.set_url_only(u);
                                    } else {
                                        website.set_url(u);
                                    }
                                }
                                _ => {
                                    website.set_url(u);
                                }
                            }
                        }

                        if !website.send_configured {
                            website.configure_setup().await;
                        }

                        let base_links = website
                            .crawl_establish(&client, &mut selectors, false, &new_page)
                            .await;

                        drop(new_page);

                        if self.single_page() {
                            website.subscription_guard().await;
                            b.dispose();
                            website
                        } else {
                            let semaphore: Arc<Semaphore> = self.setup_semaphore();
                            let (mut interval, throttle) = self.setup_crawl();

                            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

                            // Seed the frontier with extra links plus the start page links.
                            let mut links: HashSet<CaseInsensitiveString> =
                                *self.extra_links.clone();

                            links.extend(base_links);

                            // Each task returns (links discovered, optional page signature).
                            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
                                JoinSet::new();

                            // State shared with every spawned fetch task. Tuple index map:
                            // 0 = client, 1 = selectors, 2 = subscription channel,
                            // 3 = external_domains_caseless, 4 = channel_guard,
                            // 5 = browser handle, 6 = configuration, 7 = start url string,
                            // 8 = browser context (b.browser.2), 9 = domain_parsed,
                            // 10 = on_link_find_callback.
                            let shared = Arc::new((
                                client.to_owned(),
                                selectors,
                                self.channel.clone(),
                                self.configuration.external_domains_caseless.clone(),
                                self.channel_guard.clone(),
                                b.browser.0.clone(),
                                self.configuration.clone(),
                                self.url.inner().to_string(),
                                b.browser.2.clone(),
                                self.domain_parsed.clone(),
                                self.on_link_find_callback.clone(),
                            ));

                            let add_external = shared.3.len() > 0;
                            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
                            let full_resources = self.configuration.full_resources;
                            let return_page_links = self.configuration.return_page_links;
                            let mut exceeded_budget = false;
                            // A zero throttle means run fully concurrent (no sleeps).
                            let concurrency = throttle.is_zero();

                            website
                                .dequeue(&mut q, &mut links, &mut exceeded_budget)
                                .await;

                            if !concurrency && !links.is_empty() {
                                tokio::time::sleep(*throttle).await;
                            }

                            // Only track elapsed time when a crawl timeout is configured.
                            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                                Some(Instant::now())
                            } else {
                                None
                            };

                            'outer: loop {
                                // Snapshot the current frontier; new links accumulate in `links`.
                                let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
                                    links.drain().collect(),
                                );

                                loop {
                                    if !concurrency {
                                        tokio::time::sleep(*throttle).await;
                                    }

                                    let semaphore =
                                        get_semaphore(&semaphore, !self.configuration.shared_queue)
                                            .await;

                                    // biased: prefer new links while permits and crawl time remain.
                                    tokio::select! {
                                        biased;
                                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                                            if !self
                                                .handle_process(
                                                    handle,
                                                    &mut interval,
                                                    async {
                                                        // Shutdown path: abort tasks and return their permits.
                                                        emit_log_shutdown(&link.inner());
                                                        let permits = set.len();
                                                        set.shutdown().await;
                                                        semaphore.add_permits(permits);
                                                    },
                                                )
                                                .await
                                            {
                                                break 'outer;
                                            }

                                            let allowed = website.is_allowed(&link);

                                            if allowed
                                                .eq(&ProcessLinkStatus::BudgetExceeded)
                                            {
                                                exceeded_budget = true;
                                                break;
                                            }
                                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                                continue;
                                            }

                                            emit_log(&link.inner());

                                            website.insert_link(link.clone()).await;

                                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                                let shared = shared.clone();
                                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                                spawn_set("page_fetch", &mut set, async move {
                                                    // Each task opens its own blank page in the shared browser.
                                                    let results = match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8, &shared.6.viewport).await {
                                                        Ok(new_page) => {
                                                            // Configure page events and (optionally) request interception concurrently.
                                                            let (_, intercept_handle) = tokio::join!(
                                                                crate::features::chrome::setup_chrome_events(&new_page, &shared.6),
                                                                crate::features::chrome::setup_chrome_interception_base(
                                                                    &new_page,
                                                                    shared.6.chrome_intercept.enabled,
                                                                    &shared.6.auth_challenge_response,
                                                                    shared.6.chrome_intercept.block_visuals,
                                                                    &shared.7,
                                                                )
                                                            );

                                                            // Give the link-find callback a chance to rewrite the url.
                                                            let link_result =
                                                                match &shared.10 {
                                                                    Some(cb) => cb(link, None),
                                                                    _ => (link, None),
                                                                };

                                                            let target_url = link_result.0.as_ref();

                                                            let mut page = Page::new(
                                                                &target_url,
                                                                &shared.0,
                                                                &new_page,
                                                                &shared.6.wait_for,
                                                                &shared.6.screenshot,
                                                                false,
                                                                &shared.6.openai_config,
                                                                &shared.6.execution_scripts,
                                                                &shared.6.automation_scripts,
                                                                &shared.6.viewport,
                                                                &shared.6.request_timeout,
                                                                &shared.6.track_events,
                                                                shared.6.referer.clone(),
                                                                shared.6.max_page_bytes,
                                                                shared.6.get_cache_options(),
                                                                &shared.6.cache_policy,
                                                                &shared.6.remote_multimodal,
                                                            )
                                                            .await;

                                                            let mut retry_count = shared.6.retry;

                                                            // Retry loop: gateway timeouts get a bounded backoff
                                                            // window, other retryable failures refetch directly.
                                                            while page.should_retry && retry_count > 0 {
                                                                retry_count -= 1;
                                                                if let Some(timeout) = page.get_timeout() {
                                                                    tokio::time::sleep(timeout).await;
                                                                }
                                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                                                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                                        let p = Page::new(
                                                                            &target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        ).await;
                                                                        page.clone_from(&p);

                                                                    }).await {
                                                                        log::info!("{target_url} backoff gateway timeout exceeded {elasped}");
                                                                    }
                                                                } else {
                                                                    page.clone_from(
                                                                        &Page::new(
                                                                            &target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        )
                                                                        .await,
                                                                    );
                                                                }
                                                            }

                                                            // Wait (max 10s) for the interception task, aborting it on timeout.
                                                            if let Some(h) = intercept_handle {
                                                                let abort_handle = h.abort_handle();
                                                                if let Err(elasped) = tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await {
                                                                    log::warn!("Handler timeout exceeded {elasped}");
                                                                    abort_handle.abort();
                                                                }
                                                            }

                                                            if add_external {
                                                                page.set_external(shared.3.clone());
                                                            }

                                                            // Temporarily swap in the crawl's parsed domain for link extraction.
                                                            let prev_domain = page.base;

                                                            page.base = shared.9.as_deref().cloned();

                                                            if return_page_links {
                                                                page.page_links = Some(Default::default());
                                                            }

                                                            let links = if full_resources {
                                                                page.links_full(&shared.1, &shared.9).await
                                                            } else {
                                                                page.links(&shared.1, &shared.9).await
                                                            };

                                                            page.base = prev_domain;

                                                            // Content-hash signature used for duplicate detection.
                                                            if shared.6.normalize {
                                                                page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
                                                            }

                                                            // User veto: still publish the page, but contribute no links.
                                                            if let Some(ref cb) = on_should_crawl_callback {
                                                                if !cb.call(&page) {
                                                                    page.blocked_crawl = true;
                                                                    channel_send_page(&shared.2, page, &shared.4);
                                                                    drop(permit);
                                                                    return Default::default()
                                                                }
                                                            }

                                                            let signature = page.signature;

                                                            channel_send_page(
                                                                &shared.2, page, &shared.4,
                                                            );

                                                            (links, signature)
                                                        }
                                                        _ => Default::default(),
                                                    };


                                                    drop(permit);

                                                    results
                                                });
                                            }

                                            website.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                        }
                                        Some(result) = set.join_next(), if !set.is_empty() => {
                                            if let Ok(res) = result {
                                                match res.1 {
                                                    // Only keep links from pages whose signature is new.
                                                    Some(signature) => {
                                                        if website.is_signature_allowed(signature).await {
                                                            website.insert_signature(signature).await;
                                                            website.links_visited.extend_links(&mut links, res.0);
                                                        }
                                                    }
                                                    _ => {
                                                        website.links_visited.extend_links(&mut links, res.0);
                                                    }
                                                }
                                            } else{
                                                break
                                            }
                                        }
                                        else => break,
                                    };

                                    if links.is_empty() && set.is_empty() || exceeded_budget {
                                        if exceeded_budget {
                                            // Drain remaining tasks before stopping.
                                            while set.join_next().await.is_some() {}
                                        }
                                        break 'outer;
                                    }
                                }

                                website
                                    .dequeue(&mut q, &mut links, &mut exceeded_budget)
                                    .await;

                                if links.is_empty() && set.is_empty() {
                                    break;
                                }
                            }

                            website.subscription_guard().await;
                            b.dispose();

                            website
                        }
                    }
                    Err(err) => {
                        b.dispose();
                        log::error!("{}", err);
                        self.clone()
                    }
                }
            }
            _ => {
                log::error!("Chrome initialization failed.");
                self.clone()
            }
        }
    }
6146
6147 #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
6149 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6150 async fn _fetch_chrome(&self, client: &Client, url: &Option<&str>) {
6151 use crate::features::chrome::attempt_navigation;
6152
6153 match self.setup_browser().await {
6154 Some(mut b) => {
6155 match attempt_navigation(
6156 "about:blank",
6157 &b.browser.0,
6158 &self.configuration.request_timeout,
6159 &b.browser.2,
6160 &self.configuration.viewport,
6161 )
6162 .await
6163 {
6164 Ok(new_page) => {
6165 let mut selectors = self.setup_selectors();
6166 self.crawl_establish_chrome_one(&client, &mut selectors, url, &new_page)
6167 .await;
6168 self.subscription_guard().await;
6169 b.dispose();
6170 }
6171 Err(err) => {
6172 b.dispose();
6173 log::error!("{}", err);
6174 }
6175 }
6176 }
6177 _ => {
6178 log::error!("Chrome initialization failed.");
6179 }
6180 }
6181 }
6182
6183 #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
6185 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6186 async fn _fetch_chrome_persisted(
6187 &self,
6188 client: &Client,
6189 url: &Option<&str>,
6190 b: &crate::features::chrome::BrowserController,
6191 ) {
6192 use crate::features::chrome::attempt_navigation;
6193 match attempt_navigation(
6194 "about:blank",
6195 &b.browser.0,
6196 &self.configuration.request_timeout,
6197 &b.browser.2,
6198 &self.configuration.viewport,
6199 )
6200 .await
6201 {
6202 Ok(new_page) => {
6203 let mut selectors = self.setup_selectors();
6204 self.crawl_establish_chrome_one(&client, &mut selectors, url, &new_page)
6205 .await;
6206 self.subscription_guard().await;
6207 }
6208 Err(err) => {
6209 log::error!("{}", err);
6210 }
6211 }
6212 }
6213
    /// Concurrently crawl the website using a WebDriver session.
    ///
    /// Establishes the first page, then drives a `tokio::select!` loop that
    /// interleaves spawning page fetches (bounded by the crawl semaphore) with
    /// harvesting finished tasks. Honors throttle delays, crawl timeouts,
    /// budgets, and the shutdown handle. Leftover links are stashed in
    /// `extra_links` when the loop exits early.
    #[cfg(all(not(feature = "decentralized"), not(feature = "chrome"), feature = "webdriver"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent_webdriver(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        self.start();

        match self.setup_webdriver().await {
            Some(mut controller) => {
                let driver = controller.driver();
                let mut selectors = self.setup_selectors();
                self.status = CrawlStatus::Active;

                if self.single_page() {
                    // Single-page mode: fetch the entry URL only, no loop.
                    self.crawl_establish_webdriver_one(&client, &mut selectors, &None, driver)
                        .await;
                    self.subscription_guard().await;
                    controller.dispose();
                } else {
                    let semaphore: Arc<Semaphore> = self.setup_semaphore();
                    let (mut interval, throttle) = self.setup_crawl();

                    // Optional external queue of extra links to crawl.
                    let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

                    let base_links = self
                        .crawl_establish_webdriver_one(&client, &mut selectors, &None, driver)
                        .await;

                    let mut links: HashSet<CaseInsensitiveString> =
                        self.drain_extra_links().collect();

                    links.extend(base_links);

                    self.configuration.configure_allowlist();

                    // Per-page navigation timeout from the webdriver config, if set.
                    let timeout = self
                        .configuration
                        .webdriver_config
                        .as_ref()
                        .and_then(|c| c.timeout);

                    // Each task returns (discovered links, optional page signature).
                    let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
                        JoinSet::new();

                    // Shared read-only state for spawned tasks. Tuple slots:
                    // 0 client, 1 selectors, 2 channel, 3 external domains,
                    // 4 channel guard, 5 driver, 6 configuration, 7 url string,
                    // 8 domain_parsed, 9 on_link_find_callback, 10 timeout.
                    let shared = Arc::new((
                        client.to_owned(),
                        selectors,
                        self.channel.clone(),
                        self.configuration.external_domains_caseless.clone(),
                        self.channel_guard.clone(),
                        driver.clone(),
                        self.configuration.clone(),
                        self.url.inner().to_string(),
                        self.domain_parsed.clone(),
                        self.on_link_find_callback.clone(),
                        timeout,
                    ));

                    let add_external = shared.3.len() > 0;
                    let on_should_crawl_callback = self.on_should_crawl_callback.clone();
                    let full_resources = self.configuration.full_resources;
                    let return_page_links = self.configuration.return_page_links;
                    let mut exceeded_budget = false;
                    // Zero throttle means run fully concurrent (no sleeps between spawns).
                    let concurrency = throttle.is_zero();

                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                    if !concurrency && !links.is_empty() {
                        tokio::time::sleep(*throttle).await;
                    }

                    // Start the crawl-timeout clock only when a timeout is configured.
                    let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                        Some(Instant::now())
                    } else {
                        None
                    };

                    'outer: loop {
                        // Drain the current frontier into a stream for this pass.
                        let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
                            links.drain().collect(),
                        );

                        loop {
                            if !concurrency {
                                tokio::time::sleep(*throttle).await;
                            }

                            let semaphore =
                                get_semaphore(&semaphore, !self.configuration.shared_queue)
                                    .await;

                            tokio::select! {
                                biased;
                                // Only pull new work while permits are free and the
                                // crawl timeout has not elapsed.
                                Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                                    if !self
                                        .handle_process(
                                            handle,
                                            &mut interval,
                                            async {
                                                // Shutdown path: abort in-flight tasks and
                                                // return their permits to the semaphore.
                                                emit_log_shutdown(&link.inner());
                                                let permits = set.len();
                                                set.shutdown().await;
                                                semaphore.add_permits(permits);
                                            },
                                        )
                                        .await
                                    {
                                        break 'outer;
                                    }

                                    let allowed = self.is_allowed(&link);

                                    if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                        exceeded_budget = true;
                                        break;
                                    }
                                    if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                        continue;
                                    }

                                    emit_log(&link.inner());

                                    // Mark visited before spawning to avoid duplicate fetches.
                                    self.insert_link(link.clone()).await;

                                    if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                        let shared = shared.clone();
                                        let on_should_crawl_callback = on_should_crawl_callback.clone();

                                        spawn_set("page_fetch_webdriver", &mut set, async move {
                                            // Allow the user callback to rewrite the link.
                                            let link_result = match &shared.9 {
                                                Some(cb) => cb(link, None),
                                                _ => (link, None),
                                            };

                                            let target_url = link_result.0.as_ref();

                                            crate::features::webdriver::setup_driver_events(&shared.5, &shared.6).await;

                                            let mut page = Page::new_page_webdriver(
                                                target_url,
                                                &shared.5,
                                                shared.10,
                                            )
                                            .await;

                                            let mut retry_count = shared.6.retry;

                                            // Retry loop; 504s get an extra backoff window
                                            // capped by BACKOFF_MAX_DURATION.
                                            while page.should_retry && retry_count > 0 {
                                                retry_count -= 1;
                                                if let Some(timeout_duration) = page.get_timeout() {
                                                    tokio::time::sleep(timeout_duration).await;
                                                }
                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                                    if let Err(elapsed) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                        let p = Page::new_page_webdriver(
                                                            target_url,
                                                            &shared.5,
                                                            shared.10,
                                                        ).await;
                                                        page.clone_from(&p);
                                                    }).await {
                                                        log::info!("{target_url} backoff gateway timeout exceeded {elapsed}");
                                                    }
                                                } else {
                                                    page.clone_from(
                                                        &Page::new_page_webdriver(
                                                            target_url,
                                                            &shared.5,
                                                            shared.10,
                                                        )
                                                        .await,
                                                    );
                                                }
                                            }

                                            if add_external {
                                                page.set_external(shared.3.clone());
                                            }

                                            // Temporarily swap in the crawl's parsed base
                                            // domain for link extraction, then restore.
                                            let prev_domain = page.base;
                                            page.base = shared.8.as_deref().cloned();

                                            if return_page_links {
                                                page.page_links = Some(Default::default());
                                            }

                                            let links = if full_resources {
                                                page.links_full(&shared.1, &shared.8).await
                                            } else {
                                                page.links(&shared.1, &shared.8).await
                                            };

                                            page.base = prev_domain;

                                            // Content hash used for duplicate-page detection.
                                            if shared.6.normalize {
                                                page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
                                            }

                                            if let Some(ref cb) = on_should_crawl_callback {
                                                if !cb.call(&page) {
                                                    // Blocked by user callback: still publish the
                                                    // page, but contribute no new links.
                                                    page.blocked_crawl = true;
                                                    channel_send_page(&shared.2, page, &shared.4);
                                                    drop(permit);
                                                    return Default::default();
                                                }
                                            }

                                            let signature = page.signature;

                                            channel_send_page(&shared.2, page, &shared.4);

                                            drop(permit);

                                            (links, signature)
                                        });
                                    }

                                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                }
                                Some(result) = set.join_next(), if !set.is_empty() => {
                                    if let Ok(res) = result {
                                        match res.1 {
                                            Some(signature) => {
                                                // Only take links from pages whose content
                                                // signature has not been seen yet.
                                                if self.is_signature_allowed(signature).await {
                                                    self.insert_signature(signature).await;
                                                    self.links_visited.extend_links(&mut links, res.0);
                                                }
                                            }
                                            _ => {
                                                self.links_visited.extend_links(&mut links, res.0);
                                            }
                                        }
                                    } else {
                                        break
                                    }

                                    if links.is_empty() && set.is_empty() || exceeded_budget {
                                        if exceeded_budget {
                                            // Drain in-flight tasks before stopping.
                                            while set.join_next().await.is_some() {}
                                        }
                                        break 'outer;
                                    }
                                }
                                else => break,
                            };

                            if links.is_empty() && set.is_empty() || exceeded_budget {
                                if exceeded_budget {
                                    while set.join_next().await.is_some() {}
                                }
                                break 'outer;
                            }
                        }

                        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                        if links.is_empty() && set.is_empty() {
                            break;
                        }
                    }

                    self.subscription_guard().await;
                    controller.dispose();

                    // Preserve any unprocessed frontier for a later resume.
                    if !links.is_empty() {
                        self.extra_links.extend(links);
                    }
                }
            }
            None => {
                log::error!("WebDriver initialization failed.");
            }
        }
    }
6488
6489 #[cfg(all(not(feature = "decentralized"), not(feature = "chrome"), feature = "webdriver"))]
6491 pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
6492 if self.configuration.webdriver_config.is_some() {
6494 self.crawl_concurrent_webdriver(client, handle).await
6495 } else {
6496 self.crawl_concurrent_raw(client, handle).await
6497 }
6498 }
6499
    /// Entry point for the concurrent crawl when no browser backend
    /// (`chrome`/`webdriver`) is compiled in: always the raw HTTP crawl.
    #[cfg(all(not(feature = "decentralized"), not(feature = "chrome"), not(feature = "webdriver")))]
    pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        self.crawl_concurrent_raw(client, handle).await
    }
6505
    /// Concurrent crawl in decentralized mode: pages are fetched through a
    /// remote SPIDER_WORKER service, and only link sets are returned.
    ///
    /// The frontier is processed in throttled passes; all spawned fetches are
    /// joined at the end of each pass before the next frontier is built.
    #[cfg(feature = "decentralized")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

        self.configuration.configure_allowlist();
        let domain = self.url.inner().as_str();
        let mut interval = Box::pin(tokio::time::interval(Duration::from_millis(10)));
        let throttle = Box::pin(self.get_delay());
        let on_link_find_callback = self.on_link_find_callback.clone();
        // Worker endpoints default to plain http; https targets get rewritten
        // below when the worker is http-only.
        let http_worker = std::env::var("SPIDER_WORKER")
            .unwrap_or_else(|_| "http:".to_string())
            .starts_with("http:");

        // Seed the frontier from the entry page.
        let mut links: HashSet<CaseInsensitiveString> = self
            .crawl_establish(
                &client,
                &mut (domain.into(), Default::default()),
                http_worker,
            )
            .await;

        let mut set: JoinSet<HashSet<CaseInsensitiveString>> = JoinSet::new();
        let mut exceeded_budget = false;

        'outer: loop {
            // One throttled pass over the current frontier.
            let stream =
                tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect())
                    .throttle(*throttle);
            tokio::pin!(stream);

            loop {
                match stream.next().await {
                    Some(link) => {
                        if !self
                            .handle_process(handle, &mut interval, async {
                                emit_log_shutdown(&link.inner());
                                set.shutdown().await;
                            })
                            .await
                        {
                            break 'outer;
                        }

                        let allowed = self.is_allowed(&link);

                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                            exceeded_budget = true;
                            break;
                        }
                        if allowed.eq(&ProcessLinkStatus::Blocked)
                            || !self.is_allowed_disk(&link).await
                        {
                            continue;
                        }

                        emit_log(&link.inner());

                        // Mark visited before the fetch is spawned.
                        self.insert_link(link.clone()).await;

                        // Global semaphore bounds in-flight worker requests.
                        if let Ok(permit) = SEM.acquire().await {
                            let client = client.clone();
                            let on_link_find_callback = on_link_find_callback.clone();

                            spawn_set("page_fetch", &mut set, async move {
                                let link_results = match &on_link_find_callback.clone() {
                                    Some(cb) => cb(link, None),
                                    _ => (link, None),
                                };
                                let link_results = link_results.0.as_ref();
                                // Downgrade https → http when the worker only
                                // speaks plain http.
                                let page = Page::new_links_only(
                                    &if http_worker && link_results.starts_with("https") {
                                        link_results.replacen("https", "http", 1).to_string()
                                    } else {
                                        link_results.to_string()
                                    },
                                    &client,
                                )
                                .await;

                                drop(permit);

                                page.links
                            });

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        }
                    }
                    _ => break,
                }
                if exceeded_budget {
                    break;
                }
            }

            // Join every fetch from this pass and merge the discovered links.
            while let Some(res) = set.join_next().await {
                if let Ok(msg) = res {
                    self.links_visited.extend_links(&mut links, msg);
                }
            }

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if links.is_empty() || exceeded_budget {
                break;
            }
        }

        // Preserve any unprocessed frontier for a later resume.
        if !links.is_empty() {
            self.extra_links.extend(links);
        }
    }
6620
6621 #[cfg(all(feature = "chrome", feature = "real_browser"))]
6622 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6623 pub async fn warm_up_gemini(&mut self) {
6625 use crate::features::chrome::attempt_navigation;
6626
6627 if let Some(mut b) = self.setup_browser().await {
6628 if let Ok(page) = attempt_navigation(
6629 "about:blank",
6630 &b.browser.0,
6631 &self.configuration.request_timeout,
6632 &b.browser.2,
6633 &self.configuration.viewport,
6634 )
6635 .await
6636 {
6637 let _ = crate::features::solvers::warm_gemini_model(&page).await;
6638 b.dispose();
6639 }
6640 }
6641 }
6642
    /// Concurrent "smart" crawl: pages are fetched over plain HTTP first and
    /// escalated to a lazily-started Chrome browser (`OnceBrowser`) when
    /// needed (see `smart_links` / `render_chrome_page`).
    ///
    /// Mirrors the other concurrent crawls: a `tokio::select!` loop spawns
    /// bounded fetch tasks and harvests (links, signature) results, honoring
    /// throttle, crawl timeout, budget, and the shutdown handle.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn crawl_concurrent_smart(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
    ) {
        use tokio::sync::OnceCell;
        self.start();
        self.status = CrawlStatus::Active;
        // Browser is only launched the first time a page needs rendering.
        let browser: OnceBrowser = OnceCell::new();

        let mut selectors: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        if self.single_page() {
            // Single-page mode: fetch the entry URL only.
            self.subscription_guard().await;
            self.crawl_establish_smart(&client, &mut selectors, &browser)
                .await;
        } else {
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

            let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

            let (mut interval, throttle) = self.setup_crawl();
            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
            let return_page_links = self.configuration.return_page_links;

            links.extend(
                self.crawl_establish_smart(&client, &mut selectors, &browser)
                    .await,
            );

            self.configuration.configure_allowlist();

            // Each task returns (discovered links, optional page signature).
            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();
            let semaphore = self.setup_semaphore();

            // Shared read-only state for spawned tasks. Tuple slots:
            // 0 client, 1 selectors, 2 channel, 3 channel guard,
            // 4 configuration, 5 domain_parsed, 6 browser cell,
            // 7 on_link_find_callback, 8 cookie_jar.
            let shared = Arc::new((
                client.to_owned(),
                selectors,
                self.channel.clone(),
                self.channel_guard.clone(),
                self.configuration.clone(),
                self.domain_parsed.clone(),
                browser,
                self.on_link_find_callback.clone(),
                self.cookie_jar.clone(),
            ));

            let add_external = self.configuration.external_domains_caseless.len() > 0;
            let mut exceeded_budget = false;
            // Zero throttle means fully concurrent (no sleeps between spawns).
            let concurrency = throttle.is_zero();

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if !concurrency && !links.is_empty() {
                tokio::time::sleep(*throttle).await;
            }

            // Start the crawl-timeout clock only when a timeout is configured.
            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                Some(Instant::now())
            } else {
                None
            };

            'outer: loop {
                // Drain the current frontier into a stream for this pass.
                let mut stream =
                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

                loop {
                    if !concurrency {
                        tokio::time::sleep(*throttle).await;
                    }

                    let semaphore =
                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                    tokio::select! {
                        biased;
                        // Only pull new work while permits are free and the
                        // crawl timeout has not elapsed.
                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            if !self
                                .handle_process(
                                    handle,
                                    &mut interval,
                                    async {
                                        // Shutdown path: abort in-flight tasks and
                                        // return their permits to the semaphore.
                                        emit_log_shutdown(&link.inner());
                                        let permits = set.len();
                                        set.shutdown().await;
                                        semaphore.add_permits(permits);

                                    },
                                )
                                .await
                            {
                                break 'outer;
                            }

                            let allowed = self.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                exceeded_budget = true;
                                break;
                            }
                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                continue;
                            }

                            emit_log(&link.inner());
                            // Mark visited before spawning to avoid duplicate fetches.
                            self.insert_link(link.clone()).await;

                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                let shared = shared.clone();
                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                spawn_set("page_fetch", &mut set, async move {
                                    // Allow the user callback to rewrite the link.
                                    let link_result = match &shared.7 {
                                        Some(cb) => cb(link, None),
                                        _ => (link, None),
                                    };

                                    let url = link_result.0.as_ref();
                                    let mut page =
                                        Page::new_page(&url, &shared.0).await;

                                    let mut retry_count = shared.4.retry;

                                    // Retry loop: alternate between re-fetching over
                                    // HTTP and rendering with Chrome (on power-of-two
                                    // counts); 504s add a backoff window capped by
                                    // BACKOFF_MAX_DURATION.
                                    while page.should_retry && retry_count > 0 {
                                        retry_count -= 1;

                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }

                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {

                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                if retry_count.is_power_of_two() {
                                                    Website::render_chrome_page(
                                                        &shared.4, &shared.0,
                                                        &mut page, url,
                                                        &shared.5,
                                                        &shared.6,
                                                    )
                                                    .await;
                                                } else {
                                                    let next_page = Page::new_page(url, &shared.0).await;

                                                    page.clone_from(&next_page)
                                                };

                                            }).await
                                            {
                                                log::info!("backoff gateway timeout exceeded {elasped}");
                                            }

                                        } else {

                                            if retry_count.is_power_of_two() {
                                                Website::render_chrome_page(
                                                    &shared.4, &shared.0,
                                                    &mut page, url,
                                                    &shared.5,
                                                    &shared.6,
                                                )
                                                .await;
                                            } else {
                                                page.clone_from(
                                                    &Page::new_page(url, &shared.0)
                                                        .await,
                                                );
                                            }
                                        }
                                    }

                                    if add_external {
                                        page.set_external(
                                            shared
                                                .4
                                                .external_domains_caseless
                                                .clone(),
                                        );
                                    }

                                    // Temporarily swap in the crawl's parsed base
                                    // domain for link extraction, then restore.
                                    let prev_domain = page.base;

                                    page.base = shared.5.as_deref().cloned();

                                    if return_page_links {
                                        page.page_links = Some(Default::default());
                                    }

                                    let (links, bytes_transferred ) = page
                                        .smart_links(
                                            &shared.1, &shared.4, &shared.5, &shared.6, Some(&shared.8)
                                        )
                                        .await;

                                    page.base = prev_domain;
                                    page.bytes_transferred = bytes_transferred;

                                    // Content hash used for duplicate-page detection.
                                    if shared.4.normalize {
                                        page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
                                    }

                                    // Optional remote multimodal extraction pass over the
                                    // rendered HTML; usage + results are attached to the page.
                                    #[cfg(all(feature = "agent", feature = "serde"))]
                                    if shared.4.remote_multimodal.is_some() {
                                        let html = page.get_html();
                                        if !html.is_empty() {
                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
                                                &shared.4.remote_multimodal,
                                                &html,
                                                url,
                                                title,
                                            ).await {
                                                match page.remote_multimodal_usage.as_mut() {
                                                    Some(v) => v.push(result.usage.clone()),
                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
                                                }
                                                if result.extracted.is_some() || result.screenshot.is_some() {
                                                    let automation_result = result.to_automation_results();
                                                    match page.extra_remote_multimodal_data.as_mut() {
                                                        Some(v) => v.push(automation_result),
                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    if let Some(ref cb) = on_should_crawl_callback {
                                        if !cb.call(&page) {
                                            // Blocked by user callback: still publish the
                                            // page, but contribute no new links.
                                            page.blocked_crawl = true;
                                            channel_send_page(&shared.2, page, &shared.3);
                                            drop(permit);
                                            return Default::default()
                                        }
                                    }

                                    let signature = page.signature;

                                    channel_send_page(&shared.2, page, &shared.3);

                                    drop(permit);

                                    (links, signature)
                                });
                            }

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        }
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res.1 {
                                    Some(signature) => {
                                        // Only take links from pages whose content
                                        // signature has not been seen yet.
                                        if self.is_signature_allowed(signature).await {
                                            self.insert_signature(signature).await;
                                            self.links_visited.extend_links(&mut links, res.0);
                                        }
                                    }
                                    _ => {
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                            } else {
                                break
                            }
                        }
                        else => break,
                    }

                    if links.is_empty() && set.is_empty() || exceeded_budget {
                        if exceeded_budget {
                            // Drain in-flight tasks before stopping.
                            while set.join_next().await.is_some() {}
                        }
                        break 'outer;
                    }
                }

                self.subscription_guard().await;
                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                if links.is_empty() && set.is_empty() {
                    break;
                }
            }

            // Preserve any unprocessed frontier for a later resume.
            if !links.is_empty() {
                self.extra_links.extend(links);
            }
        }
    }
6943
    /// No-op stub used when the `sitemap` feature is disabled; keeps the
    /// public API shape identical across feature combinations.
    #[cfg(not(feature = "sitemap"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn sitemap_crawl(
        &mut self,
        _client: &Client,
        _handle: &Option<Arc<AtomicI8>>,
        _scrape: bool,
    ) {
    }
6954
    /// No-op stub used when the `sitemap` feature is disabled; keeps the
    /// public API shape identical across feature combinations.
    #[cfg(not(feature = "sitemap"))]
    pub async fn sitemap_crawl_chain(
        &mut self,
        _client: &Client,
        _handle: &Option<Arc<AtomicI8>>,
        _scrape: bool,
    ) {
    }
6964
6965 #[cfg(feature = "sitemap")]
6967 pub(crate) fn get_sitemap_setup(&self, domain: &str) -> (&str, bool) {
6968 let (sitemap_path, needs_trailing) = match &self.configuration.sitemap_url {
6969 Some(sitemap_path) => {
6970 let sitemap_path = sitemap_path.as_str();
6971 if domain.ends_with('/') && sitemap_path.starts_with('/') {
6972 (&sitemap_path[1..], false)
6973 } else if !domain.ends_with('/')
6974 && !sitemap_path.is_empty()
6975 && !sitemap_path.starts_with('/')
6976 {
6977 (sitemap_path, true)
6978 } else {
6979 (sitemap_path, false)
6980 }
6981 }
6982 _ => ("sitemap.xml", !domain.ends_with("/")),
6983 };
6984
6985 (sitemap_path, needs_trailing)
6986 }
6987
    /// Crawl the site's sitemap(s) over plain HTTP.
    ///
    /// Resolves the sitemap URL, then for each sitemap: fetches it (with a
    /// fallback search via `sitemap_parse` on 404/transport errors), streams
    /// discovered pages through an mpsc channel to a collector task that
    /// extracts links and forwards pages to subscribers. Nested sitemap
    /// indexes are pushed onto `sitemaps` and processed by the outer loop.
    #[cfg(feature = "sitemap")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub(crate) async fn sitemap_crawl_raw(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        let mut exceeded_budget = self.is_over_wild_budget(&self.configuration.budget);

        if !exceeded_budget {
            let selectors = self.setup_selectors();
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
            let domain = self.url.inner().as_str();
            self.domain_parsed = parse_absolute_url(&domain);

            // Pages are only retained when the crawl is starting fresh.
            let persist_links = self.status == CrawlStatus::Start;

            let mut interval: Interval = tokio::time::interval(Duration::from_millis(15));

            let (sitemap_path, needs_trailing) = self.get_sitemap_setup(&domain);

            // Store the fully-qualified sitemap URL back on the configuration.
            self.configuration.sitemap_url = Some(Box::new(
                string_concat!(domain, if needs_trailing { "/" } else { "" }, sitemap_path).into(),
            ));

            self.configuration.configure_allowlist();

            let domain_parsed_ref = self.domain_parsed.as_deref().cloned().map(Box::new);

            // Shared state for the collector task. Tuple slots:
            // 0 channel, 1 channel guard, 2 selectors, 3 parsed domain.
            let shared = Arc::new((
                self.channel.clone(),
                self.channel_guard.clone(),
                selectors,
                domain_parsed_ref,
            ));
            let mut sitemaps = match &self.configuration.sitemap_url {
                Some(sitemap) => Vec::from([sitemap.to_owned()]),
                _ => Default::default(),
            };

            let return_page_links = self.configuration.return_page_links;

            // Dequeue into a clone to satisfy the borrow checker, then sync back.
            let mut extra_links = self.extra_links.clone();
            self.dequeue(&mut q, &mut *extra_links, &mut exceeded_budget)
                .await;
            self.extra_links.clone_from(&extra_links);

            // Temporarily whitelist the sitemap URL; reverted at the end.
            let whitelist_changes = self.configuration.add_sitemap_to_whitelist();

            if whitelist_changes.modified() {
                self.configuration.set_whitelist();
            }

            'outer: loop {
                let stream =
                    tokio_stream::iter::<Vec<Box<CompactString>>>(sitemaps.drain(..).collect());
                tokio::pin!(stream);

                let mut first_request = false;
                let mut attempted_correct = false;

                while let Some(mut sitemap_url) = stream.next().await {
                    if !self.handle_process(handle, &mut interval, async {}).await {
                        break 'outer;
                    }

                    let link = <CompactString as Clone>::clone(&(*sitemap_url)).into();

                    // Sitemap URLs themselves do not count against the budget.
                    let allowed = self.is_allowed_budgetless(&link);

                    if allowed.eq(&ProcessLinkStatus::Blocked) {
                        continue;
                    }

                    self.insert_link(link).await;

                    // Pages discovered while parsing flow through this channel
                    // to the collector task below.
                    let (tx, mut rx) = tokio::sync::mpsc::channel::<Page>(100);

                    let shared = shared.clone();

                    let handles = crate::utils::spawn_task("page_fetch", async move {
                        let mut pages = Vec::new();

                        while let Some(mut page) = rx.recv().await {
                            if page.page_links.is_none() {
                                let links = page.links(&shared.2, &shared.3).await;
                                page.page_links = Some(links.into());
                            }

                            if scrape || persist_links {
                                pages.push(page.clone());
                            };

                            if !return_page_links {
                                page.page_links = None;
                            }

                            if shared.0.is_some() {
                                channel_send_page(&shared.0, page, &shared.1);
                            }
                        }

                        pages
                    });

                    // Retry until one request completes; sitemap_parse may
                    // rewrite sitemap_url to probe alternate locations.
                    while !first_request {
                        match client.get(sitemap_url.as_str()).send().await {
                            Ok(response) => {
                                let limit = *crate::utils::MAX_SIZE_BYTES as u64;

                                // Skip oversized sitemaps outright.
                                if let Some(response_content_length) = response.content_length() {
                                    if limit > 0 && response_content_length >= limit {
                                        first_request = true;
                                        log::info!(
                                            "{} exceeded parse limit: {:?}",
                                            sitemap_url,
                                            limit
                                        );
                                        break;
                                    }
                                }

                                if response.status() == 404 {
                                    // Probe an alternate sitemap path.
                                    if !self
                                        .sitemap_parse(
                                            client,
                                            &mut first_request,
                                            &mut sitemap_url,
                                            &mut attempted_correct,
                                        )
                                        .await
                                    {
                                        break;
                                    }
                                } else {
                                    match response.bytes().await {
                                        Ok(b) => {
                                            first_request = true;
                                            // Parse entries; nested sitemap indexes are
                                            // appended to `sitemaps` for the outer loop.
                                            self.sitemap_parse_crawl(
                                                client,
                                                handle,
                                                b,
                                                &mut interval,
                                                &mut exceeded_budget,
                                                &tx,
                                                &mut sitemaps,
                                                true,
                                            )
                                            .await;
                                        }
                                        Err(err) => {
                                            first_request = true;
                                            log::info!("http parse error: {:?}", err.to_string())
                                        }
                                    };
                                }
                            }
                            Err(err) => {
                                // Stop probing once a corrected path was already tried.
                                if attempted_correct {
                                    first_request = true;
                                    break;
                                }

                                log::info!("attempting to find sitemap path: {}", err.to_string());

                                if !self
                                    .sitemap_parse(
                                        client,
                                        &mut first_request,
                                        &mut sitemap_url,
                                        &mut attempted_correct,
                                    )
                                    .await
                                {
                                    break;
                                }
                            }
                        };
                    }

                    // Close the channel so the collector task can finish.
                    drop(tx);

                    if let Ok(mut handle) = handles.await {
                        for page in handle.iter_mut() {
                            if let Some(mut links) = page.page_links.clone() {
                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                self.extra_links.extend(*links)
                            }
                        }
                        if scrape {
                            if let Some(p) = self.pages.as_mut() {
                                p.extend(handle);
                            }
                        }
                    }

                    if exceeded_budget {
                        break;
                    }
                }

                if sitemaps.len() == 0 || exceeded_budget {
                    break;
                }
            }

            // Revert the temporary sitemap whitelist entry.
            self.configuration
                .remove_sitemap_from_whitelist(whitelist_changes);
        }
    }
7204
7205 #[cfg(all(
7207 feature = "sitemap",
7208 feature = "chrome",
7209 not(feature = "decentralized")
7210 ))]
7211 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
7212 pub(crate) async fn sitemap_crawl_chrome(
7213 &mut self,
7214 client: &Client,
7215 handle: &Option<Arc<AtomicI8>>,
7216 scrape: bool,
7217 ) {
7218 use crate::features::chrome::attempt_navigation;
7219 use sitemap::{
7220 reader::{SiteMapEntity, SiteMapReader},
7221 structs::Location,
7222 };
7223
7224 let mut exceeded_budget = self.is_over_wild_budget(&self.configuration.budget);
7225
7226 if !exceeded_budget {
7227 if let Some(mut b) = self.setup_browser().await {
7228 let selectors = self.setup_selectors();
7229 let semaphore: Arc<Semaphore> = self.setup_semaphore();
7230 let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
7231 let domain = self.url.inner().as_str();
7232 self.domain_parsed = parse_absolute_url(&domain);
7233 let persist_links = self.status == CrawlStatus::Start;
7234
7235 let mut interval = tokio::time::interval(Duration::from_millis(15));
7236
7237 let (sitemap_path, needs_trailing) = self.get_sitemap_setup(&domain);
7238
7239 self.configuration.sitemap_url = Some(Box::new(
7240 string_concat!(domain, if needs_trailing { "/" } else { "" }, sitemap_path)
7241 .into(),
7242 ));
7243
7244 self.configuration.configure_allowlist();
7245
7246 let domain_parsed_ref = self.domain_parsed.as_deref().cloned().map(Box::new);
7247
7248 let shared = Arc::new((
7249 self.channel.clone(),
7250 self.channel_guard.clone(),
7251 b.browser.0.clone(),
7252 self.configuration.clone(),
7253 self.url.inner().to_string(),
7254 b.browser.2.clone(),
7255 selectors.clone(),
7256 domain_parsed_ref,
7257 ));
7258
7259 let mut sitemaps = match &self.configuration.sitemap_url {
7260 Some(sitemap) => Vec::from([sitemap.to_owned()]),
7261 _ => Default::default(),
7262 };
7263
7264 let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
7265 Some(Instant::now())
7266 } else {
7267 None
7268 };
7269
7270 let mut extra_links = self.extra_links.clone();
7271 self.dequeue(&mut q, &mut *extra_links, &mut exceeded_budget)
7272 .await;
7273 self.extra_links.clone_from(&extra_links);
7274 let mut set: JoinSet<Option<Page>> = JoinSet::new();
7275
7276 let whitelist_changes = self.configuration.add_sitemap_to_whitelist();
7277
7278 if whitelist_changes.modified() {
7279 self.configuration.set_whitelist();
7280 }
7281
7282 'outer: loop {
7283 let stream: tokio_stream::Iter<std::vec::IntoIter<Box<CompactString>>> =
7284 tokio_stream::iter::<Vec<Box<CompactString>>>(sitemaps.drain(..).collect());
7285 tokio::pin!(stream);
7286
7287 tokio::select! {
7288 biased;
7289 Some(sitemap_url) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
7290 if !self.handle_process(handle, &mut interval, async {}).await {
7291 break 'outer;
7292 }
7293
7294 let link = <CompactString as Clone>::clone(&(*sitemap_url)).into();
7295
7296 let allowed = self.is_allowed_budgetless(&link);
7297
7298 if allowed.eq(&ProcessLinkStatus::Blocked) {
7299 continue;
7300 }
7301
7302 self.insert_link(link).await;
7303
7304 match attempt_navigation(
7305 "about:blank",
7306 &shared.2,
7307 &self.configuration.request_timeout,
7308 &shared.5,
7309 &self.configuration.viewport,
7310 )
7311 .await {
7312 Ok(new_page) => {
7313 let (_, intercept_handle) = tokio::join!(
7314 crate::features::chrome::setup_chrome_events(
7315 &new_page,
7316 &self.configuration
7317 ),
7318 self.setup_chrome_interception(&new_page)
7319 );
7320
7321 let mut page = Page::new(
7322 &sitemap_url,
7323 &client,
7324 &new_page,
7325 &self.configuration.wait_for,
7326 &self.configuration.screenshot,
7327 false, &self.configuration.openai_config,
7329 &self.configuration.execution_scripts,
7330 &self.configuration.automation_scripts,
7331 &self.configuration.viewport,
7332 &self.configuration.request_timeout,
7333 &self.configuration.track_events,
7334 self.configuration.referer.clone(),
7335 self.configuration.max_page_bytes,
7336 self.configuration.get_cache_options(),
7337 &self.configuration.cache_policy,
7338 &self.configuration.remote_multimodal,
7339 )
7340 .await;
7341
7342 if let Some(h) = intercept_handle {
7343 let abort_handle = h.abort_handle();
7344 if let Err(elasped) =
7345 tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
7346 {
7347 log::warn!("Handler timeout exceeded {elasped}");
7348 abort_handle.abort();
7349 }
7350 }
7351
7352 drop(new_page);
7353
7354 let is_xml_entry = page.get_html_bytes_u8().starts_with(b"<?xml");
7355 let is_xml = is_xml_entry
7356 && !page.get_html_bytes_u8().ends_with(b"</html>");
7357
7358 if is_xml {
7359 let reader = SiteMapReader::new(&*page.get_html_bytes_u8());
7360 let mut stream = tokio_stream::iter(reader);
7361
7362 while let Some(entity) = stream.next().await {
7363 if !self.handle_process(handle, &mut interval, async {}).await {
7364 break;
7365 }
7366 match entity {
7367 SiteMapEntity::Url(url_entry) => match url_entry.loc {
7368 Location::Url(url) => {
7369 let link: CaseInsensitiveString = url.as_str().into();
7370
7371 let allowed = self.is_allowed(&link);
7372
7373 if allowed.eq(&ProcessLinkStatus::Blocked) {
7374 continue;
7375 }
7376 if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
7377 exceeded_budget = true;
7378 break;
7379 }
7380
7381 self.insert_link(link.clone()).await;
7382
7383 let client = client.clone();
7384 let shared = shared.clone();
7385
7386 spawn_set("page_fetch", &mut set, async move {
7387 if let Ok(new_page) = attempt_navigation(
7388 "about:blank",
7389 &shared.2,
7390 &shared.3.request_timeout,
7391 &shared.5,
7392 &shared.3.viewport,
7393 )
7394 .await
7395 {
7396 let (_, intercept_handle) = tokio::join!(
7397 crate::features::chrome::setup_chrome_events(
7398 &new_page, &shared.3,
7399 ),
7400 crate::features::chrome::setup_chrome_interception_base(
7401 &new_page,
7402 shared.3.chrome_intercept.enabled,
7403 &shared.3.auth_challenge_response,
7404 shared.3.chrome_intercept.block_visuals,
7405 &shared.4,
7406 )
7407 );
7408
7409 let mut page = Page::new(
7410 &link.inner(),
7411 &client,
7412 &new_page,
7413 &shared.3.wait_for,
7414 &shared.3.screenshot,
7415 false,
7416 &shared.3.openai_config,
7417 &shared.3.execution_scripts,
7418 &shared.3.automation_scripts,
7419 &shared.3.viewport,
7420 &shared.3.request_timeout,
7421 &shared.3.track_events,
7422 shared.3.referer.clone(),
7423 shared.3.max_page_bytes,
7424 shared.3.get_cache_options(),
7425 &shared.3.cache_policy,
7426 &shared.3.remote_multimodal,
7427 )
7428 .await;
7429
7430 if let Some(intercept_handle) = intercept_handle
7431 {
7432 let abort_handle =
7433 intercept_handle.abort_handle();
7434
7435 if let Err(elasped) = tokio::time::timeout(
7436 tokio::time::Duration::from_secs(10),
7437 async { intercept_handle.await },
7438 )
7439 .await
7440 {
7441 log::warn!("Handler timeout exceeded {elasped}");
7442 abort_handle.abort();
7443 }
7444 }
7445
7446 if page.page_links.is_none() {
7447 let links =
7448 page.links(&shared.6, &shared.7).await;
7449 page.page_links = Some(links.into());
7450 }
7451
7452 Some(page)
7453 } else {
7454 None
7455 }
7456 });
7457 }
7458 Location::None | Location::ParseErr(_) => (),
7459 },
7460 SiteMapEntity::SiteMap(sitemap_entry) => {
7461 match sitemap_entry.loc {
7462 Location::Url(url) => {
7463 sitemaps.push(Box::new(CompactString::new(
7464 &url.as_str(),
7465 )));
7466 }
7467 Location::None | Location::ParseErr(_) => (),
7468 }
7469 }
7470 SiteMapEntity::Err(err) => {
7471 log::info!("incorrect sitemap error: {:?}", err.msg(),)
7472 }
7473 };
7474
7475 if exceeded_budget {
7476 break;
7477 }
7478 }
7479 } else {
7480
7481 if is_xml_entry {
7482 page.modify_xml_html();
7483 }
7484
7485 let links = page.links(&shared.6, &shared.7).await;
7486
7487 let mut stream = tokio_stream::iter(links);
7488
7489 while let Some(link) = stream.next().await {
7490 if !self.handle_process(handle, &mut interval, async {}).await {
7491 break;
7492 }
7493
7494 if link.ends_with(".xml") {
7495 sitemaps.push(Box::new(link.inner().clone()));
7496 continue;
7497 }
7498
7499 let allowed = self.is_allowed(&link);
7500
7501 if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
7502 exceeded_budget = true;
7503 break;
7504 }
7505 if allowed.eq(&ProcessLinkStatus::Blocked) {
7506 continue;
7507 }
7508
7509 self.insert_link(link.clone()).await;
7510
7511 let client = client.clone();
7512 let shared = shared.clone();
7513
7514 spawn_set("page_fetch", &mut set, async move {
7515 match attempt_navigation(
7516 "about:blank",
7517 &shared.2,
7518 &shared.3.request_timeout,
7519 &shared.5,
7520 &shared.3.viewport,
7521 )
7522 .await {
7523 Ok(new_page) => {
7524 let (_, intercept_handle) = tokio::join!(
7525 crate::features::chrome::setup_chrome_events(
7526 &new_page, &shared.3,
7527 ),
7528 crate::features::chrome::setup_chrome_interception_base(
7529 &new_page,
7530 shared.3.chrome_intercept.enabled,
7531 &shared.3.auth_challenge_response,
7532 shared.3.chrome_intercept.block_visuals,
7533 &shared.4,
7534 )
7535 );
7536
7537 let mut page = Page::new(
7538 &link.inner(),
7539 &client,
7540 &new_page,
7541 &shared.3.wait_for,
7542 &shared.3.screenshot,
7543 false,
7544 &shared.3.openai_config,
7545 &shared.3.execution_scripts,
7546 &shared.3.automation_scripts,
7547 &shared.3.viewport,
7548 &shared.3.request_timeout,
7549 &shared.3.track_events,
7550 shared.3.referer.clone(),
7551 shared.3.max_page_bytes,
7552 shared.3.get_cache_options(),
7553 &shared.3.cache_policy,
7554 &shared.3.remote_multimodal,
7555 )
7556 .await;
7557
7558 if let Some(intercept_handle) = intercept_handle {
7559 let abort_handle = intercept_handle.abort_handle();
7560
7561 if let Err(elasped) = tokio::time::timeout(
7562 tokio::time::Duration::from_secs(10),
7563 async { intercept_handle.await },
7564 )
7565 .await
7566 {
7567 log::warn!("Handler timeout exceeded {elasped}");
7568 abort_handle.abort();
7569 }
7570 }
7571
7572 if page.page_links.is_none() {
7573 let links = page.links(&shared.6, &shared.7).await;
7574 page.page_links = Some(links.into());
7575 }
7576
7577 Some(page)
7578 }
7579 Err(err) => {
7580 log::error!("chrome failed to open: {:?}", err);
7581 None
7582 }
7583 }
7584 });
7585
7586 if exceeded_budget {
7587 break;
7588 }
7589 }
7590 }
7591 }
7592 Err(err) => {
7593 log::error!("chrome failed to open: {:?}", err);
7594 }
7595 }
7596
7597
7598 },
7599 Some(result) = set.join_next(), if !set.is_empty() => {
7600 if let Ok(res) = result {
7601 match res {
7602 Some(page) => {
7603 if let Some(signature) = page.signature {
7604 if self.is_signature_allowed(signature).await {
7605 if let Some(mut links) = page.page_links.clone() {
7606 self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
7607 self.extra_links.extend(*links)
7608 }
7609 self.insert_signature(signature).await;
7610
7611 channel_send_page(
7612 &shared.0, page.clone(), &shared.1,
7613 );
7614
7615 if scrape || persist_links {
7616 if let Some(p) = self.pages.as_mut() {
7617 p.push(page);
7618 }
7619 }
7620 }
7621 } else {
7622 if let Some(mut links) = page.page_links.clone() {
7623 self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
7624 self.extra_links.extend(*links)
7625 }
7626 channel_send_page(
7627 &shared.0, page.clone(), &shared.1,
7628 );
7629 if scrape || persist_links {
7630 if let Some(p) = self.pages.as_mut() {
7631 p.push(page);
7632 }
7633 }
7634 }
7635 }
7636 _ => ()
7637 }
7638 } else {
7639 break;
7640 }
7641 }
7642 else => break,
7643 }
7644
7645 if sitemaps.len() == 0 || exceeded_budget {
7646 break;
7647 }
7648 }
7649
7650 while let Some(result) = set.join_next().await {
7651 if let Ok(res) = result {
7652 match res {
7653 Some(page) => {
7654 if let Some(signature) = page.signature {
7655 if self.is_signature_allowed(signature).await {
7656 if let Some(mut links) = page.page_links.clone() {
7657 self.dequeue(&mut q, &mut links, &mut exceeded_budget)
7658 .await;
7659 self.extra_links.extend(*links)
7660 }
7661 self.insert_signature(signature).await;
7662 channel_send_page(&shared.0, page.clone(), &shared.1);
7663 if scrape || persist_links {
7664 if let Some(p) = self.pages.as_mut() {
7665 p.push(page);
7666 }
7667 }
7668 }
7669 } else {
7670 if let Some(mut links) = page.page_links.clone() {
7671 self.dequeue(&mut q, &mut links, &mut exceeded_budget)
7672 .await;
7673 self.extra_links.extend(*links)
7674 }
7675 channel_send_page(&shared.0, page.clone(), &shared.1);
7676 if scrape || persist_links {
7677 if let Some(p) = self.pages.as_mut() {
7678 p.push(page);
7679 }
7680 }
7681 }
7682 }
7683 _ => (),
7684 }
7685 }
7686 }
7687 b.dispose();
7688 self.configuration
7689 .remove_sitemap_from_whitelist(whitelist_changes);
7690 }
7691 }
7692 }
7693
    #[cfg(feature = "sitemap")]
    /// Crawl the website's sitemap, processing each discovered link.
    ///
    /// Thin public entry point that delegates to `sitemap_crawl_raw`.
    /// `handle` is the external pause/shutdown control and `scrape` keeps
    /// the crawled pages in memory for collection.
    pub async fn sitemap_crawl(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        self.sitemap_crawl_raw(client, handle, scrape).await
    }
7704
7705 #[cfg(all(
7707 feature = "sitemap",
7708 any(not(feature = "chrome"), feature = "decentralized")
7709 ))]
7710 async fn sitemap_crawl_chain(
7711 &mut self,
7712 client: &Client,
7713 handle: &Option<Arc<AtomicI8>>,
7714 scrape: bool,
7715 ) {
7716 if !self.configuration.ignore_sitemap {
7717 self.sitemap_crawl_raw(client, handle, scrape).await
7718 }
7719 }
7720
7721 #[cfg(all(
7723 feature = "sitemap",
7724 feature = "chrome",
7725 not(feature = "decentralized")
7726 ))]
7727 pub async fn sitemap_crawl_chain(
7728 &mut self,
7729 client: &Client,
7730 handle: &Option<Arc<AtomicI8>>,
7731 scrape: bool,
7732 ) {
7733 if !self.configuration.ignore_sitemap {
7734 self.sitemap_crawl_chrome(client, handle, scrape).await
7735 }
7736 }
7737
7738 #[cfg(feature = "sitemap")]
7740 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
7741 pub async fn sitemap_parse(
7742 &mut self,
7743 client: &Client,
7744 first_request: &mut bool,
7745 sitemap_url: &mut Box<CompactString>,
7746 attempted_correct: &mut bool,
7747 ) -> bool {
7748 let mut valid = *attempted_correct == false;
7749
7750 if valid {
7751 if let Some(domain) = &self.domain_parsed {
7752 match client.get(domain.as_str()).send().await {
7754 Ok(response) => {
7755 let limit = *crate::utils::MAX_SIZE_BYTES as u64;
7756
7757 if let Some(response_content_length) = response.content_length() {
7758 if limit > 0 && response_content_length >= limit {
7759 log::info!("{} exceeded parse limit: {:?}", domain, limit);
7760 *first_request = true;
7761 *attempted_correct = true;
7762 valid = false;
7763 }
7764 }
7765
7766 if valid {
7767 let cell = tokio::sync::OnceCell::new();
7769
7770 let rewriter_settings = lol_html::Settings {
7771 element_content_handlers: vec![lol_html::element!(
7772 r#"link[rel="sitemap"]"#,
7773 |el| {
7774 if let Some(href) = el.get_attribute("href") {
7775 let _ = cell.set(href);
7776 }
7777 Ok(())
7778 }
7779 )],
7780 adjust_charset_on_meta_tag: false,
7781 ..lol_html::send::Settings::new_for_handler_types()
7782 };
7783
7784 let mut rewriter = lol_html::send::HtmlRewriter::new(
7785 rewriter_settings,
7786 |_c: &[u8]| {},
7787 );
7788
7789 let mut wrote_error = false;
7790 let mut stream = response.bytes_stream();
7791
7792 while let Some(chunk) = stream.next().await {
7793 if let Ok(chunk) = chunk {
7794 if rewriter.write(&chunk).is_err() {
7795 wrote_error = true;
7796 break;
7797 }
7798 }
7799 if cell.initialized() {
7800 break;
7801 }
7802 }
7803
7804 if !wrote_error {
7805 let _ = rewriter.end();
7806 }
7807
7808 if let Some(sitemap) = cell.get() {
7809 if sitemap.is_empty() {
7810 *first_request = true;
7811 }
7812
7813 if let Err(_) = domain.join(sitemap) {
7814 *first_request = true;
7815 }
7816 *sitemap_url = Box::new(sitemap.into());
7818 *attempted_correct = true;
7819 } else {
7820 *first_request = true;
7821 }
7822 }
7823 }
7824 Err(err) => {
7825 *first_request = true;
7826 valid = false;
7827 log::info!("http parse error: {:?}", err.to_string())
7828 }
7829 };
7830 }
7831 }
7832
7833 valid
7834 }
    #[cfg(feature = "sitemap")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    /// Parse a raw sitemap body and optionally crawl each URL entry found.
    ///
    /// Only bodies that start with `<?xml` are processed. Each URL entry must
    /// pass the allow/budget checks before being recorded; when `crawl` is set
    /// the link is fetched on a spawned task (honoring the configured retry
    /// count) and the resulting page is sent over `tx`. Nested sitemap
    /// locations are appended to `sitemaps` for the caller to process next.
    /// `exceeded_budget` is set when the crawl budget is hit mid-parse.
    async fn sitemap_parse_crawl(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        b: bytes::Bytes,
        mut interval: &mut Interval,
        exceeded_budget: &mut bool,
        tx: &tokio::sync::mpsc::Sender<Page>,
        sitemaps: &mut Vec<Box<CompactString>>,
        crawl: bool,
    ) {
        use sitemap::reader::{SiteMapEntity, SiteMapReader};
        use sitemap::structs::Location;

        if !b.is_empty() && b.starts_with(b"<?xml") {
            let mut stream = tokio_stream::iter(SiteMapReader::new(&*b));

            let retry = self.configuration.retry;

            while let Some(entity) = stream.next().await {
                // Bail out if the crawl was paused/shutdown externally.
                if !self.handle_process(handle, &mut interval, async {}).await {
                    break;
                }
                match entity {
                    SiteMapEntity::Url(url_entry) => match url_entry.loc {
                        Location::Url(url) => {
                            let link: CaseInsensitiveString = url.as_str().into();

                            let allowed = self.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::Blocked) {
                                continue;
                            }

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                *exceeded_budget = true;
                                break;
                            }

                            self.insert_link(link.clone()).await;

                            if crawl {
                                let client = client.clone();
                                let tx = tx.clone();

                                crate::utils::spawn_task("page_fetch", async move {
                                    let mut page = Page::new_page(&link.inner(), &client).await;

                                    let mut retry_count = retry;

                                    // Re-fetch while the page signals a retryable
                                    // failure, honoring any backoff hint first.
                                    while page.should_retry && retry_count > 0 {
                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }
                                        page.clone_from(
                                            &Page::new_page(link.inner(), &client).await,
                                        );
                                        retry_count -= 1;
                                    }

                                    // Apply backpressure: only send once the
                                    // channel has capacity.
                                    if let Ok(permit) = tx.reserve().await {
                                        permit.send(page);
                                    }
                                });
                            }
                        }
                        Location::None | Location::ParseErr(_) => (),
                    },
                    SiteMapEntity::SiteMap(sitemap_entry) => match sitemap_entry.loc {
                        Location::Url(url) => {
                            sitemaps.push(Box::new(CompactString::new(&url.as_str())));
                        }
                        Location::None | Location::ParseErr(_) => (),
                    },
                    SiteMapEntity::Err(err) => {
                        log::info!("incorrect sitemap error: {:?}", err.msg())
                    }
                };

                if *exceeded_budget {
                    break;
                }
            }
        }
    }
7923
    #[cfg(feature = "regex")]
    /// The base link of the crawl target (case-insensitive form used with the
    /// `regex` feature).
    pub fn get_base_link(&self) -> &CaseInsensitiveString {
        &self.url
    }
7929
    #[cfg(not(feature = "regex"))]
    /// The base link of the crawl target as the inner compact string.
    pub fn get_base_link(&self) -> &CompactString {
        self.url.inner()
    }
7935
7936 pub async fn subscription_guard(&self) {
7938 if let Some(channel) = &self.channel {
7939 if !channel.1.is_empty() {
7940 if let Some(guard_counter) = &self.channel_guard {
7941 guard_counter.lock().await
7942 }
7943 }
7944 }
7945 }
7946
7947 #[cfg(feature = "chrome")]
7949 pub async fn setup_browser_base(
7950 config: &Configuration,
7951 url_parsed: &Option<Box<Url>>,
7952 jar: Option<&Arc<reqwest::cookie::Jar>>,
7953 ) -> Option<crate::features::chrome::BrowserController> {
7954 match crate::features::chrome::launch_browser_cookies(&config, url_parsed, jar).await {
7955 Some((browser, browser_handle, context_id)) => {
7956 let browser: Arc<chromiumoxide::Browser> = Arc::new(browser);
7957 let b = (browser, Some(browser_handle), context_id);
7958
7959 Some(crate::features::chrome::BrowserController::new(b))
7960 }
7961 _ => None,
7962 }
7963 }
7964
    #[cfg(feature = "chrome")]
    /// Launch a browser for this website using its configuration, parsed URL,
    /// and cookie jar. Delegates to [`Website::setup_browser_base`].
    pub async fn setup_browser(&self) -> Option<crate::features::chrome::BrowserController> {
        Website::setup_browser_base(
            &self.configuration,
            self.get_url_parsed(),
            Some(&self.cookie_jar),
        )
        .await
    }
7975
    #[cfg(feature = "webdriver")]
    /// Launch a WebDriver session from this website's configuration.
    pub async fn setup_webdriver(&self) -> Option<crate::features::webdriver::WebDriverController> {
        crate::features::webdriver::launch_driver(&self.configuration).await
    }
7981
7982 #[cfg(feature = "webdriver")]
7984 pub async fn render_webdriver_page(
7985 &self,
7986 url: &str,
7987 driver: &std::sync::Arc<thirtyfour::WebDriver>,
7988 ) -> Option<String> {
7989 use crate::features::webdriver::{attempt_navigation, get_page_content, setup_driver_events};
7990
7991 let timeout = self
7992 .configuration
7993 .webdriver_config
7994 .as_ref()
7995 .and_then(|c| c.timeout);
7996
7997 if let Err(e) = attempt_navigation(url, driver, &timeout).await {
7999 log::error!("WebDriver navigation failed: {:?}", e);
8000 return None;
8001 }
8002
8003 setup_driver_events(driver, &self.configuration).await;
8005
8006 match get_page_content(driver).await {
8008 Ok(content) => Some(content),
8009 Err(e) => {
8010 log::error!("Failed to get WebDriver page content: {:?}", e);
8011 None
8012 }
8013 }
8014 }
8015
    /// Respect robots.txt rules when crawling (delegates to the configuration).
    pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
        self.configuration
            .with_respect_robots_txt(respect_robots_txt);
        self
    }

    /// Include subdomains when crawling (delegates to the configuration).
    pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
        self.configuration.with_subdomains(subdomains);
        self
    }

    /// Enable CSP bypass (delegates to the configuration).
    pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_csp_bypass(enabled);
        self
    }

    #[cfg(feature = "webdriver")]
    /// Set the WebDriver configuration to use for rendering.
    pub fn with_webdriver(
        &mut self,
        webdriver_config: crate::features::webdriver_common::WebDriverConfig,
    ) -> &mut Self {
        self.configuration
            .with_webdriver_config(Some(webdriver_config));
        self
    }

    #[cfg(not(feature = "webdriver"))]
    /// No-op stub when the `webdriver` feature is disabled.
    pub fn with_webdriver(&mut self, _webdriver_config: ()) -> &mut Self {
        self
    }

    #[cfg(feature = "disk")]
    /// Enable or disable sqlite-backed disk storage; disabling also clears
    /// any existing sqlite handler.
    pub fn with_sqlite(&mut self, sqlite: bool) -> &mut Self {
        if sqlite {
            self.enable_sqlite = true;
        } else {
            self.enable_sqlite = false;
            self.sqlite = None;
        };
        self
    }

    #[cfg(not(feature = "disk"))]
    /// No-op stub when the `disk` feature is disabled.
    pub fn with_sqlite(&mut self, _sqlite: bool) -> &mut Self {
        self
    }
8070
    /// Include top-level-domain matching when crawling (delegates to the configuration).
    pub fn with_tld(&mut self, tld: bool) -> &mut Self {
        self.configuration.with_tld(tld);
        self
    }

    /// Set a maximum wall-clock duration for the entire crawl.
    pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
        self.configuration.with_crawl_timeout(crawl_timeout);
        self
    }

    /// Use HTTP/2 prior knowledge for requests.
    pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
        self.configuration
            .with_http2_prior_knowledge(http2_prior_knowledge);
        self
    }

    /// Set the delay between requests.
    pub fn with_delay(&mut self, delay: u64) -> &mut Self {
        self.configuration.with_delay(delay);
        self
    }

    /// Set the per-request timeout.
    pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
        self.configuration.with_request_timeout(request_timeout);
        self
    }

    /// Accept invalid TLS certificates (dangerous; delegates to the configuration).
    pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
        self.configuration
            .with_danger_accept_invalid_certs(accept_invalid_certs);
        self
    }

    /// Set the user agent for requests.
    pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
        self.configuration.with_user_agent(user_agent);
        self
    }

    /// Preserve the Host header across requests.
    pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
        self.configuration.with_preserve_host_header(preserve);
        self
    }

    #[cfg(feature = "sitemap")]
    /// Set an explicit sitemap URL to crawl.
    pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
        self.configuration.with_sitemap(sitemap_url);
        self
    }
8127
    #[cfg(not(feature = "sitemap"))]
    /// No-op stub when the `sitemap` feature is disabled.
    pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
        self
    }

    /// Set proxies by URL string (delegates to the configuration).
    pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
        self.configuration.with_proxies(proxies);
        self
    }

    /// Set proxies with full request-proxy settings.
    pub fn with_proxies_direct(
        &mut self,
        proxies: Option<Vec<crate::configuration::RequestProxy>>,
    ) -> &mut Self {
        self.configuration.with_proxies_direct(proxies);
        self
    }

    /// Limit concurrent requests (delegates to the configuration).
    pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
        self.configuration.with_concurrency_limit(limit);
        self
    }

    #[cfg(not(feature = "control"))]
    /// No-op stub when the `control` feature is disabled.
    pub fn with_crawl_id(&mut self, _crawl_id: String) -> &mut Self {
        self
    }

    #[cfg(feature = "control")]
    /// Set a crawl id used for external control of the crawl.
    pub fn with_crawl_id(&mut self, crawl_id: String) -> &mut Self {
        self.crawl_id = crawl_id.into();
        self
    }

    /// Set URL patterns to exclude from the crawl.
    pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
    where
        Vec<CompactString>: From<Vec<T>>,
    {
        self.configuration.with_blacklist_url(blacklist_url);
        self
    }

    /// Set the retry count for failed requests.
    pub fn with_retry(&mut self, retry: u8) -> &mut Self {
        self.configuration.with_retry(retry);
        self
    }
8182
    /// Disable the control thread (delegates to the configuration).
    pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
        self.configuration.with_no_control_thread(no_control_thread);
        self
    }

    /// Set URL patterns to restrict the crawl to.
    pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
    where
        Vec<CompactString>: From<Vec<T>>,
    {
        self.configuration.with_whitelist_url(whitelist_url);
        self
    }

    #[cfg(feature = "chrome")]
    /// Set the chrome event tracker configuration.
    pub fn with_event_tracker(
        &mut self,
        track_events: Option<crate::configuration::ChromeEventTracker>,
    ) -> &mut Self {
        self.configuration.with_event_tracker(track_events);
        self
    }

    /// Set the headers to send with each request.
    pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
        self.configuration.with_headers(headers);
        self
    }

    /// Enable header modification (delegates to the configuration).
    pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
        self.configuration.with_modify_headers(modify_headers);
        self
    }

    /// Enable HTTP client header modification (delegates to the configuration).
    pub fn with_modify_http_client_headers(
        &mut self,
        modify_http_client_headers: bool,
    ) -> &mut Self {
        self.configuration
            .with_modify_http_client_headers(modify_http_client_headers);
        self
    }

    /// Set a crawl budget by path prefix (delegates to the configuration).
    pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self {
        self.configuration.with_budget(budget);
        self
    }

    /// Directly set the crawl budget on the configuration.
    pub fn set_crawl_budget(&mut self, budget: Option<HashMap<CaseInsensitiveString, u32>>) {
        self.configuration.budget = budget;
    }
8240
    /// Set the maximum crawl depth (delegates to the configuration).
    pub fn with_depth(&mut self, depth: usize) -> &mut Self {
        self.configuration.with_depth(depth);
        self
    }

    /// Set additional external domains to include in the crawl.
    pub fn with_external_domains<'a, 'b>(
        &mut self,
        external_domains: Option<impl Iterator<Item = String> + 'a>,
    ) -> &mut Self {
        self.configuration.with_external_domains(external_domains);
        self
    }

    /// Set (or clear) the callback invoked when a link is found.
    pub fn with_on_link_find_callback(
        &mut self,
        on_link_find_callback: Option<OnLinkFindCallback>,
    ) -> &mut Self {
        match on_link_find_callback {
            Some(callback) => self.on_link_find_callback = Some(callback),
            _ => self.on_link_find_callback = None,
        };
        self
    }

    /// Set the link-found callback from a closure; it receives the link and an
    /// optional context string and returns the (possibly rewritten) pair.
    pub fn set_on_link_find<F>(&mut self, f: F)
    where
        F: Fn(CaseInsensitiveString, Option<String>) -> (CaseInsensitiveString, Option<String>)
            + Send
            + Sync
            + 'static,
    {
        self.on_link_find_callback = Some(Arc::new(f));
    }

    /// Set (or clear) a plain function pointer deciding whether a page should
    /// be crawled.
    pub fn with_on_should_crawl_callback(
        &mut self,
        on_should_crawl_callback: Option<fn(&Page) -> bool>,
    ) -> &mut Self {
        match on_should_crawl_callback {
            Some(callback) => {
                self.on_should_crawl_callback = Some(OnShouldCrawlCallback::Fn(callback))
            }
            _ => self.on_should_crawl_callback = None,
        };
        self
    }
8292
    /// Set (or clear) a closure deciding whether a page should be crawled.
    /// The closure variant allows capturing state, unlike the fn-pointer form.
    pub fn with_on_should_crawl_callback_closure<F: OnShouldCrawlClosure>(
        &mut self,
        on_should_crawl_closure: Option<F>,
    ) -> &mut Self {
        match on_should_crawl_closure {
            Some(callback) => {
                self.on_should_crawl_callback =
                    Some(OnShouldCrawlCallback::Closure(Arc::new(callback)))
            }
            _ => self.on_should_crawl_callback = None,
        };
        self
    }

    /// Set the cookie string to use for requests.
    pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
        self.configuration.with_cookies(cookie_str);
        self
    }

    /// Configure a cron schedule for recurring crawls.
    pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
        self.configuration.with_cron(cron_str, cron_type);
        self
    }

    /// Set the locale to use (delegates to the configuration).
    pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
        self.configuration.with_locale(locale);
        self
    }

    /// Enable stealth mode (delegates to the configuration).
    pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
        self.configuration.with_stealth(stealth_mode);
        self
    }

    #[cfg(feature = "chrome")]
    /// Set a fine-grained stealth tier for fingerprinting avoidance.
    pub fn with_stealth_advanced(
        &mut self,
        stealth_mode: spider_fingerprint::configs::Tier,
    ) -> &mut Self {
        self.configuration.with_stealth_advanced(stealth_mode);
        self
    }

    /// Set the basic cache policy for requests.
    pub fn with_cache_policy(
        &mut self,
        cache_policy: Option<crate::utils::BasicCachePolicy>,
    ) -> &mut Self {
        self.configuration.with_cache_policy(cache_policy);

        self
    }

    /// Set the OpenAI configuration (delegates to the configuration).
    pub fn with_openai(&mut self, openai_configs: Option<configuration::GPTConfigs>) -> &mut Self {
        self.configuration.with_openai(openai_configs);
        self
    }
8359
    #[cfg(feature = "chrome")]
    /// Set the remote multimodal automation configuration.
    pub fn with_remote_multimodal(
        &mut self,
        cfg: Option<crate::features::automation::RemoteMultimodalConfigs>,
    ) -> &mut Self {
        self.configuration.with_remote_multimodal(cfg);
        self
    }

    /// Set the Gemini configuration (delegates to the configuration).
    pub fn with_gemini(
        &mut self,
        gemini_configs: Option<configuration::GeminiConfigs>,
    ) -> &mut Self {
        self.configuration.with_gemini(gemini_configs);
        self
    }

    /// Enable request caching (delegates to the configuration).
    pub fn with_caching(&mut self, cache: bool) -> &mut Self {
        self.configuration.with_caching(cache);
        self
    }

    /// Enable service workers (delegates to the configuration).
    pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_service_worker_enabled(enabled);
        self
    }

    /// Enable automatic geolocation (delegates to the configuration).
    pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_auto_geolocation(enabled);
        self
    }

    #[cfg(feature = "chrome")]
    /// Set an advanced fingerprint configuration.
    pub fn with_fingerprint_advanced(
        &mut self,
        fingerprint: crate::configuration::Fingerprint,
    ) -> &mut Self {
        self.configuration.with_fingerprint_advanced(fingerprint);
        self
    }

    /// Enable fingerprinting (delegates to the configuration).
    pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
        self.configuration.with_fingerprint(fingerprint);
        self
    }
8483
    /// Set the browser viewport (delegates to the configuration).
    pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
        self.configuration.with_viewport(viewport);
        self
    }

    /// Wait for an idle network before considering a page loaded.
    pub fn with_wait_for_idle_network(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_idle_network(wait_for_idle_network);
        self
    }

    /// Wait for a fully idle network (variant 0; delegates to the configuration).
    pub fn with_wait_for_idle_network0(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_idle_network0(wait_for_idle_network);
        self
    }

    /// Wait for an almost-idle network (variant 0; delegates to the configuration).
    pub fn with_wait_for_almost_idle_network0(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_almost_idle_network0(wait_for_idle_network);
        self
    }

    /// Wait for a CSS selector to appear before considering a page loaded.
    pub fn with_wait_for_selector(
        &mut self,
        wait_for_selector: Option<crate::configuration::WaitForSelector>,
    ) -> &mut Self {
        self.configuration.with_wait_for_selector(wait_for_selector);
        self
    }

    /// Wait for the DOM to go idle before considering a page loaded.
    pub fn with_wait_for_idle_dom(
        &mut self,
        wait_for_selector: Option<crate::configuration::WaitForSelector>,
    ) -> &mut Self {
        self.configuration.with_wait_for_idle_dom(wait_for_selector);
        self
    }

    /// Wait for a fixed delay before considering a page loaded.
    pub fn with_wait_for_delay(
        &mut self,
        wait_for_delay: Option<crate::configuration::WaitForDelay>,
    ) -> &mut Self {
        self.configuration.with_wait_for_delay(wait_for_delay);
        self
    }
8546
    /// Set the default HTTP connect timeout (delegates to the configuration).
    pub fn with_default_http_connect_timeout(
        &mut self,
        default_http_connect_timeout: Option<Duration>,
    ) -> &mut Self {
        self.configuration
            .with_default_http_connect_timeout(default_http_connect_timeout);

        self
    }

    /// Set the default HTTP read timeout (delegates to the configuration).
    pub fn with_default_http_read_timeout(
        &mut self,
        default_http_read_timeout: Option<Duration>,
    ) -> &mut Self {
        self.configuration
            .with_default_http_read_timeout(default_http_read_timeout);

        self
    }

    /// Set the maximum number of redirects to follow.
    pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
        self.configuration.with_redirect_limit(redirect_limit);
        self
    }

    /// Set the redirect policy (delegates to the configuration).
    pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
        self.configuration.with_redirect_policy(policy);
        self
    }
8580
    /// Set chrome request interception, scoped to this website's parsed domain.
    pub fn with_chrome_intercept(
        &mut self,
        chrome_intercept: RequestInterceptConfiguration,
    ) -> &mut Self {
        self.configuration
            .with_chrome_intercept(chrome_intercept, &self.domain_parsed);
        self
    }

    /// Set the referer header value (delegates to the configuration).
    pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
        self.configuration.with_referer(referer);
        self
    }

    /// Alternate spelling of [`Website::with_referer`].
    pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
        self.configuration.with_referrer(referer);
        self
    }

    /// Request full resources instead of only HTML (delegates to the configuration).
    pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
        self.configuration.with_full_resources(full_resources);
        self
    }
8608
8609 pub fn with_dismiss_dialogs(&mut self, full_resources: bool) -> &mut Self {
8611 self.configuration.with_dismiss_dialogs(full_resources);
8612 self
8613 }
8614
    #[cfg(feature = "wreq")]
    /// Set the browser emulation profile for the wreq client.
    pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
        self.configuration.with_emulation(emulation);
        self
    }

    /// Ignore the sitemap when crawling (delegates to the configuration).
    pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
        self.configuration.with_ignore_sitemap(ignore_sitemap);
        self
    }

    /// Set the timezone id (delegates to the configuration).
    pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
        self.configuration.with_timezone_id(timezone_id);
        self
    }
8633
    /// Set a script evaluated on each new browser document.
    pub fn with_evaluate_on_new_document(
        &mut self,
        evaluate_on_new_document: Option<Box<String>>,
    ) -> &mut Self {
        self.configuration
            .with_evaluate_on_new_document(evaluate_on_new_document);

        self
    }

    /// Set the maximum number of pages to crawl.
    pub fn with_limit(&mut self, limit: u32) -> &mut Self {
        self.configuration.with_limit(limit);
        self
    }

    /// Set the screenshot configuration (delegates to the configuration).
    pub fn with_screenshot(
        &mut self,
        screenshot_config: Option<configuration::ScreenShotConfig>,
    ) -> &mut Self {
        self.configuration.with_screenshot(screenshot_config);
        self
    }

    /// Use a shared queue across crawls (delegates to the configuration).
    pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
        self.configuration.with_shared_queue(shared_queue);
        self
    }

    /// Set the response for HTTP auth challenges.
    pub fn with_auth_challenge_response(
        &mut self,
        auth_challenge_response: Option<configuration::AuthChallengeResponse>,
    ) -> &mut Self {
        self.configuration
            .with_auth_challenge_response(auth_challenge_response);
        self
    }

    /// Return the links found on each page with the page output.
    pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
        self.configuration.with_return_page_links(return_page_links);
        self
    }

    /// Connect to a remote chrome instance by URL.
    pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
        self.configuration
            .with_chrome_connection(chrome_connection_url);
        self
    }
8688
8689 pub fn with_execution_scripts(
8691 &mut self,
8692 execution_scripts: Option<ExecutionScriptsMap>,
8693 ) -> &mut Self {
8694 self.configuration.with_execution_scripts(execution_scripts);
8695 self
8696 }
8697
8698 pub fn with_automation_scripts(
8700 &mut self,
8701 automation_scripts: Option<AutomationScriptsMap>,
8702 ) -> &mut Self {
8703 self.configuration
8704 .with_automation_scripts(automation_scripts);
8705 self
8706 }
8707
8708 pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
8710 self.configuration.with_network_interface(network_interface);
8711 self
8712 }
8713
8714 pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
8716 self.configuration.with_local_address(local_address);
8717 self
8718 }
8719
    /// Block asset requests.
    /// NOTE(review): the parameter is named `only_html` — looks copy-pasted;
    /// confirm it should be `block_assets`.
    pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
        self.configuration.with_block_assets(only_html);
        self
    }

    /// Normalize discovered URLs.
    pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
        self.configuration.with_normalize(normalize);
        self
    }

    /// Share crawl state across instances.
    pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
        self.configuration.with_shared_state(shared);
        self
    }

    /// Cap the size of a single page response, in bytes.
    pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
        self.configuration.with_max_page_bytes(max_page_bytes);
        self
    }

    /// Set the maximum bytes allowed.
    pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
        self.configuration.with_max_bytes_allowed(max_bytes_allowed);
        self
    }

    /// Replace the entire configuration.
    pub fn with_config(&mut self, config: Configuration) -> &mut Self {
        self.configuration = config.into();
        self
    }
8755
    /// Route requests through Spider Cloud using the given API key.
    #[cfg(feature = "spider_cloud")]
    pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
        self.configuration.with_spider_cloud(api_key);
        self
    }

    /// No-op without the `spider_cloud` feature.
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
        self
    }

    /// Apply a full Spider Cloud configuration.
    #[cfg(feature = "spider_cloud")]
    pub fn with_spider_cloud_config(
        &mut self,
        config: crate::configuration::SpiderCloudConfig,
    ) -> &mut Self {
        self.configuration.with_spider_cloud_config(config);
        self
    }

    /// No-op without the `spider_cloud` feature.
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
        self
    }
8784
8785 pub fn build(&self) -> Result<Self, Self> {
8787 if self.domain_parsed.is_none() {
8788 Err(self.to_owned())
8789 } else {
8790 Ok(self.to_owned())
8791 }
8792 }
8793
    /// Remove all configured HTTP headers, keeping the header container itself.
    pub fn clear_headers(&mut self) {
        if let Some(headers) = self.configuration.headers.as_mut() {
            headers.0.clear();
        }
    }
8800
    /// Finalize crawl limits: resolve the budget (including wildcard entries)
    /// and translate the configured depth into a depth distance relative to
    /// the start URL's path segments.
    pub fn determine_limits(&mut self) {
        self.configuration.configure_budget();
        if self.configuration.inner_budget.is_some() {
            // A wildcard entry applies the budget to every path.
            let wild_card_budget = match &self.configuration.inner_budget {
                Some(budget) => budget.contains_key(&*WILD_CARD_PATH),
                _ => false,
            };
            self.configuration.wild_card_budgeting = wild_card_budget;
        }
        if self.configuration.depth > 0 && self.domain_parsed.is_some() {
            if let Some(domain) = &self.domain_parsed {
                if let Some(segments) = domain.path_segments() {
                    let segments_cnt = segments.count();

                    // When the start URL is already nested deeper than the
                    // configured depth, extend the distance so depth is
                    // measured from the start path rather than the root.
                    if segments_cnt > self.configuration.depth {
                        self.configuration.depth_distance = self.configuration.depth
                            + self.configuration.depth.abs_diff(segments_cnt);
                    } else {
                        self.configuration.depth_distance = self.configuration.depth;
                    }
                }
            }
        }
    }
8826
8827 #[cfg(not(feature = "sync"))]
8828 pub fn subscribe(&mut self, capacity: usize) -> Option<broadcast::Receiver<Page>> {
8857 None
8858 }
8859
8860 #[cfg(feature = "sync")]
8889 pub fn subscribe(&mut self, capacity: usize) -> Option<broadcast::Receiver<Page>> {
8890 let channel = self.channel.get_or_insert_with(|| {
8891 let (tx, rx) = broadcast::channel(
8892 (if capacity == 0 {
8893 *DEFAULT_PERMITS
8894 } else {
8895 capacity
8896 })
8897 .max(1),
8898 );
8899 (tx, Arc::new(rx))
8900 });
8901
8902 let rx2 = channel.0.subscribe();
8903
8904 Some(rx2)
8905 }
8906
8907 #[cfg(feature = "sync")]
8909 pub fn queue(&mut self, capacity: usize) -> Option<broadcast::Sender<String>> {
8910 let channel = self.channel_queue.get_or_insert_with(|| {
8911 let (tx, rx) = broadcast::channel(capacity);
8912 (tx, Arc::new(rx))
8913 });
8914
8915 Some(channel.0.to_owned())
8916 }
8917
    /// Queue channel is unavailable without the `sync` feature; always `None`.
    /// NOTE(review): this return type differs from the `sync` variant
    /// (`broadcast::Sender<String>`) — confirm which shape callers expect.
    #[cfg(not(feature = "sync"))]
    pub fn queue(
        &mut self,
        _capacity: usize,
    ) -> Option<Arc<(broadcast::Sender<Page>, broadcast::Receiver<Page>)>> {
        None
    }
8926
    /// Unsubscribing is a no-op without the `sync` feature.
    #[cfg(not(feature = "sync"))]
    pub fn unsubscribe(&mut self) {}

    /// Drop the page broadcast channel, ending all subscriptions.
    #[cfg(feature = "sync")]
    pub fn unsubscribe(&mut self) {
        self.channel.take();
    }

    /// Borrow the page broadcast channel, if one was created.
    pub fn get_channel(
        &self,
    ) -> &Option<(broadcast::Sender<Page>, Arc<broadcast::Receiver<Page>>)> {
        &self.channel
    }

    /// Borrow the channel guard, if one was created.
    pub fn get_channel_guard(&self) -> &Option<ChannelGuard> {
        &self.channel_guard
    }
8948
    /// Channel guards require the `sync` feature; always `None` here.
    #[cfg(not(feature = "sync"))]
    pub fn subscribe_guard(&mut self) -> Option<ChannelGuard> {
        None
    }

    /// Get a clone of the channel guard, creating it on first use.
    #[cfg(feature = "sync")]
    pub fn subscribe_guard(&mut self) -> Option<ChannelGuard> {
        let channel_guard = self.channel_guard.get_or_insert_with(ChannelGuard::new);
        Some(channel_guard.clone())
    }
9032
    /// Launch a cron runner scheduling a clone of this website as a job.
    #[cfg(feature = "cron")]
    pub async fn run_cron(&self) -> Runner {
        async_job::Runner::new()
            .add(Box::new(self.clone()))
            .run()
            .await
    }

    /// Crawl ids require the `control` feature; always `None` here.
    #[cfg(not(feature = "control"))]
    pub fn get_crawl_id(&self) -> Option<&Box<String>> {
        None
    }

    /// The crawl id, or `None` when it is empty.
    #[cfg(feature = "control")]
    pub fn get_crawl_id(&self) -> Option<&Box<String>> {
        if self.crawl_id.is_empty() {
            None
        } else {
            Some(&self.crawl_id)
        }
    }

    /// Attach arbitrary extra information to the website.
    #[cfg(feature = "extra_information")]
    pub fn set_extra_info(&mut self, info: Option<String>) {
        self.extra_info = info.map(|f| f.into());
    }

    /// Read back any attached extra information.
    #[cfg(feature = "extra_information")]
    pub fn get_extra_info(&self) -> Option<&Box<String>> {
        self.extra_info.as_ref()
    }

    /// Set seed HTML for the crawl.
    pub fn set_seeded_html(&mut self, html: Option<String>) {
        self.seed_html = html;
    }

    /// The seeded HTML, if any was provided.
    pub fn get_seeded_html(&self) -> &Option<String> {
        &self.seed_html
    }
9079
    /// Apply a prompt-derived `PromptConfiguration` onto this website.
    ///
    /// Only fields present (`Some`) in `config` are applied; everything else
    /// is left untouched. Chrome-related fields apply only when the `chrome`
    /// feature is compiled in.
    #[cfg(feature = "serde")]
    pub fn apply_prompt_configuration(
        &mut self,
        config: &crate::features::automation::PromptConfiguration,
    ) -> &mut Self {
        // Core crawl behavior toggles.
        if let Some(v) = config.respect_robots_txt {
            self.configuration.respect_robots_txt = v;
        }
        if let Some(v) = config.subdomains {
            self.configuration.subdomains = v;
        }
        if let Some(v) = config.tld {
            self.configuration.tld = v;
        }
        if let Some(v) = config.depth {
            self.configuration.depth = v;
        }
        if let Some(v) = config.delay {
            self.configuration.delay = v;
        }
        if let Some(ms) = config.request_timeout_ms {
            self.configuration.request_timeout =
                Some(Box::new(std::time::Duration::from_millis(ms)));
        }
        if let Some(ms) = config.crawl_timeout_ms {
            self.configuration.crawl_timeout = Some(std::time::Duration::from_millis(ms));
        }

        // URL allow/deny lists and extra domains to include.
        if let Some(ref urls) = config.blacklist_url {
            self.configuration.blacklist_url =
                Some(urls.iter().map(|s| s.as_str().into()).collect());
        }
        if let Some(ref urls) = config.whitelist_url {
            self.configuration.whitelist_url =
                Some(urls.iter().map(|s| s.as_str().into()).collect());
        }
        if let Some(ref domains) = config.external_domains {
            for domain in domains {
                self.configuration
                    .external_domains_caseless
                    .insert(case_insensitive_string::CaseInsensitiveString::new(domain));
            }
        }

        // HTTP client settings.
        if let Some(ref ua) = config.user_agent {
            self.configuration.user_agent = Some(Box::new(ua.as_str().into()));
        }
        if let Some(v) = config.http2_prior_knowledge {
            self.configuration.http2_prior_knowledge = v;
        }
        if let Some(v) = config.accept_invalid_certs {
            self.configuration.accept_invalid_certs = v;
        }

        if let Some(v) = config.redirect_limit {
            self.configuration.redirect_limit = Box::new(v);
        }
        if let Some(ref budget_map) = config.budget {
            let mut budget = hashbrown::HashMap::new();
            for (k, v) in budget_map {
                budget.insert(
                    case_insensitive_string::CaseInsensitiveString::new(k),
                    *v,
                );
            }
            self.configuration.budget = Some(budget);
        }
        if let Some(v) = config.max_page_bytes {
            self.configuration.max_page_bytes = Some(v);
        }

        if let Some(v) = config.full_resources {
            self.configuration.full_resources = v;
        }
        if let Some(v) = config.only_html {
            self.configuration.only_html = v;
        }
        if let Some(v) = config.return_page_links {
            self.configuration.return_page_links = v;
        }

        // NOTE(review): this branch is empty — `use_chrome` is currently a
        // no-op; confirm whether chrome should be toggled here.
        #[cfg(feature = "chrome")]
        if let Some(true) = config.use_chrome {
        }
        if let Some(ref mode) = config.stealth_mode {
            // Unknown mode strings fall back to no stealth.
            self.configuration.stealth_mode = match mode.to_lowercase().as_str() {
                "basic" => spider_fingerprint::configs::Tier::Basic,
                "low" => spider_fingerprint::configs::Tier::Low,
                "mid" => spider_fingerprint::configs::Tier::Mid,
                "full" => spider_fingerprint::configs::Tier::Full,
                _ => spider_fingerprint::configs::Tier::None,
            };
        }
        if config.viewport_width.is_some() || config.viewport_height.is_some() {
            // Missing dimensions fall back to an 800x600 viewport.
            let width = config.viewport_width.unwrap_or(800);
            let height = config.viewport_height.unwrap_or(600);
            self.configuration.viewport = Some(crate::configuration::Viewport::new(width, height));
        }
        #[cfg(feature = "chrome")]
        {
            let mut wait_for = self
                .configuration
                .wait_for
                .take()
                .unwrap_or_default();

            if let Some(true) = config.wait_for_idle_network {
                wait_for.idle_network = Some(
                    crate::features::chrome_common::WaitForIdleNetwork::new(Some(
                        std::time::Duration::from_secs(30),
                    )),
                );
            }
            if let Some(ms) = config.wait_for_delay_ms {
                wait_for.delay = Some(crate::features::chrome_common::WaitForDelay::new(Some(
                    std::time::Duration::from_millis(ms),
                )));
            }
            if let Some(ref selector) = config.wait_for_selector {
                wait_for.selector = Some(crate::features::chrome_common::WaitForSelector::new(
                    Some(std::time::Duration::from_secs(30)),
                    selector.clone(),
                ));
            }

            // Only store wait settings when at least one was requested.
            if wait_for.idle_network.is_some()
                || wait_for.delay.is_some()
                || wait_for.selector.is_some()
            {
                self.configuration.wait_for = Some(wait_for);
            }
        }
        #[cfg(feature = "chrome")]
        if let Some(ref js) = config.evaluate_on_new_document {
            self.configuration.evaluate_on_new_document = Some(Box::new(js.clone()));
        }

        if let Some(v) = config.shared_queue {
            self.configuration.shared_queue = v;
        }
        if let Some(v) = config.retry {
            self.configuration.retry = v;
        }

        self
    }
9254
    /// Ask a remote model endpoint to derive a crawler configuration from
    /// `prompt` and apply it to this website.
    ///
    /// # Errors
    /// Returns an `EngineError` when the remote configuration call fails.
    #[cfg(all(feature = "agent", feature = "serde"))]
    pub async fn configure_from_prompt(
        &mut self,
        api_url: &str,
        model_name: &str,
        api_key: Option<&str>,
        prompt: &str,
    ) -> Result<&mut Self, crate::features::automation::EngineError> {
        let config = crate::features::automation::configure_crawler_from_prompt(
            api_url, model_name, api_key, prompt,
        )
        .await?;
        Ok(self.apply_prompt_configuration(&config))
    }
9293}
9294
9295pub fn channel_send_page(
9297 channel: &Option<(
9298 tokio::sync::broadcast::Sender<Page>,
9299 std::sync::Arc<tokio::sync::broadcast::Receiver<Page>>,
9300 )>,
9301 page: Page,
9302 channel_guard: &Option<ChannelGuard>,
9303) {
9304 if let Some(c) = channel {
9305 if c.0.send(page).is_ok() {
9306 if let Some(guard) = channel_guard {
9307 ChannelGuard::inc_guard(&guard.0 .1)
9308 }
9309 }
9310 }
9311}
9312
/// Shared guard for page subscriptions.
/// NOTE(review): tuple fields inferred from usage — (active flag,
/// sent counter, processed counter); confirm the intended semantics.
#[derive(Debug, Clone)]
pub struct ChannelGuard(Arc<(AtomicBool, AtomicUsize, AtomicUsize)>);
9316
impl ChannelGuard {
    /// New guard with the active flag set and both counters at zero.
    pub(crate) fn new() -> ChannelGuard {
        ChannelGuard(Arc::new((
            AtomicBool::new(true),
            AtomicUsize::new(0),
            AtomicUsize::new(0),
        )))
    }
    /// While the guard is active, spin (yielding to the runtime) until the
    /// processed counter (field 2) reaches the snapshot taken from field 1,
    /// then reset it to 0.
    /// NOTE(review): the CAS target is a one-time snapshot of field 1 —
    /// confirm the intended rendezvous semantics; this looks order-sensitive.
    pub(crate) async fn lock(&self) {
        if self.0 .0.load(Ordering::Relaxed) {
            let old = self.0 .1.load(Ordering::Relaxed);

            while self
                .0
                .2
                .compare_exchange_weak(old, 0, Ordering::Acquire, Ordering::Relaxed)
                .is_err()
            {
                tokio::task::yield_now().await;
            }
            std::sync::atomic::fence(Ordering::Acquire);
        }
    }

    /// Enable or disable the guard.
    pub fn guard(&mut self, guard: bool) {
        self.0 .0.store(guard, Ordering::Release);
    }

    /// Increment the processed counter (field 2).
    pub fn inc(&mut self) {
        self.0 .2.fetch_add(1, std::sync::atomic::Ordering::Release);
    }

    /// Increment an external counter; used by `channel_send_page` after a
    /// successful broadcast.
    pub(crate) fn inc_guard(guard: &AtomicUsize) {
        guard.fetch_add(1, std::sync::atomic::Ordering::Release);
    }
}
9359
// Dropping any clone clears the shared active flag for all holders.
impl Drop for ChannelGuard {
    fn drop(&mut self) {
        self.0 .0.store(false, Ordering::Release);
    }
}
9365
/// Launch a cron runner for the given website (owning variant of
/// `Website::run_cron`).
#[cfg(feature = "cron")]
pub async fn run_cron(website: Website) -> Runner {
    async_job::Runner::new().add(Box::new(website)).run().await
}
9371
#[cfg(feature = "cron")]
#[async_trait]
impl Job for Website {
    /// Parse the configured cron expression; logs and returns `None` when invalid.
    fn schedule(&self) -> Option<async_job::Schedule> {
        match self.configuration.cron_str.parse() {
            Ok(schedule) => Some(schedule),
            Err(e) => {
                log::error!("{:?}", e);
                None
            }
        }
    }
    /// Run one scheduled pass: crawl or scrape per the configured cron type.
    async fn handle(&mut self) {
        log::info!(
            "CRON: {} - cron job running {}",
            self.get_url().as_ref(),
            self.now()
        );
        if self.configuration.cron_type == CronType::Crawl {
            self.crawl().await;
        } else {
            self.scrape().await;
        }
    }
}
9397
// Human-readable summary: URL, crawl id, and debug-formatted configuration.
impl std::fmt::Display for Website {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(
            f,
            "Website:\n URL: {}\n ID: {:?}\n Configuration: {:?}",
            self.get_url(),
            self.get_crawl_id(),
            self.configuration
        )
    }
}
9409
// Marker impl: `Display` + `Debug` above satisfy the `Error` contract.
impl std::error::Error for Website {}
9411
/// Integration test (network): a plain crawl visits the licenses page.
#[tokio::test]
#[cfg(not(feature = "decentralized"))]
async fn crawl() {
    let url = "https://choosealicense.com";
    let mut website: Website = Website::new(url);
    website.crawl().await;
    assert!(
        website
            .links_visited
            .contains(&"https://choosealicense.com/licenses/".into()),
        "{:?}",
        website.links_visited
    );
}
9426
/// Integration test (network): pages from a cron-scheduled crawl reach
/// subscribers within a 10 second window.
#[tokio::test]
#[cfg(feature = "cron")]
async fn crawl_cron() {
    let url = "https://choosealicense.com";
    let mut website: Website = Website::new(&url)
        .with_cron("1/5 * * * * *", Default::default())
        .build()
        .unwrap();
    let mut rx2 = website.subscribe(16).unwrap();

    // Collect pages on a background task until the channel closes or the
    // task is aborted below.
    let join_handle = tokio::spawn(async move {
        let mut links_visited = HashSet::new();
        while let Ok(res) = rx2.recv().await {
            let url = res.get_url();
            links_visited.insert(CaseInsensitiveString::new(url));
        }
        assert!(
            links_visited.contains(&CaseInsensitiveString::from(
                "https://choosealicense.com/licenses/"
            )),
            "{:?}",
            links_visited
        );
    });

    let mut runner = website.run_cron().await;
    log::debug!("Starting the Runner for 10 seconds");
    tokio::time::sleep(Duration::from_secs(10)).await;
    runner.stop().await;
    join_handle.abort();
    let _ = join_handle.await;
}

/// Integration test (network): same as `crawl_cron`, but via the
/// free-function `run_cron` that takes ownership of the website.
#[tokio::test]
#[cfg(feature = "cron")]
async fn crawl_cron_own() {
    let url = "https://choosealicense.com";
    let mut website: Website = Website::new(&url)
        .with_cron("1/5 * * * * *", Default::default())
        .build()
        .unwrap();
    let mut rx2 = website.subscribe(16).unwrap();

    // Collect pages on a background task until the channel closes.
    let join_handle = tokio::spawn(async move {
        let mut links_visited = HashSet::new();
        while let Ok(res) = rx2.recv().await {
            let url = res.get_url();
            links_visited.insert(CaseInsensitiveString::new(url));
        }
        assert!(
            links_visited.contains(&CaseInsensitiveString::from(
                "https://choosealicense.com/licenses/"
            )),
            "{:?}",
            links_visited
        );
    });

    let mut runner = run_cron(website).await;
    log::debug!("Starting the Runner for 10 seconds");
    tokio::time::sleep(Duration::from_secs(10)).await;
    let _ = tokio::join!(runner.stop(), join_handle);
}
9492
/// Integration test (network): scraping visits pages and retains their HTML.
#[tokio::test]
#[cfg(not(feature = "decentralized"))]
async fn scrape() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website.scrape().await;
    assert!(
        website
            .links_visited
            .contains(&"https://choosealicense.com/licenses/".into()),
        "{:?}",
        website.links_visited
    );

    assert!(!website.get_pages().unwrap()[0].get_html().is_empty());
}
9508
9509#[tokio::test]
9510#[cfg(not(feature = "decentralized"))]
9511async fn crawl_invalid() {
9512 let mut website: Website = Website::new("https://w.com");
9513 website.crawl().await;
9514 assert!(website.links_visited.len() <= 1); }
9516
/// Integration test (decentralized): an unreachable host records only the
/// normalized seed URL.
#[tokio::test]
#[cfg(feature = "decentralized")]
async fn crawl_invalid() {
    let domain = "https://w.com";
    let mut website: Website = Website::new(domain);
    website.crawl().await;
    let mut expected: Box<HashSet<CaseInsensitiveString>> = Box::new(HashSet::new());
    expected.insert(format!("{}/", domain.to_string()).into());
    assert_eq!(website.links_visited.get_links(), *expected);
}
9528
/// Integration test (network): blacklisted URLs are never visited.
#[tokio::test]
async fn not_crawl_blacklist() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website.configuration.blacklist_url = Some(Vec::from([CompactString::from(
        "https://choosealicense.com/licenses/",
    )]));

    website.crawl().await;
    assert!(
        !website
            .links_visited
            .contains(&"https://choosealicense.com/licenses/".into()),
        "{:?}",
        website.links_visited
    );
}

/// Integration test (network, regex): a pattern matching the whole domain
/// blocks every page.
#[tokio::test]
#[cfg(feature = "regex")]
async fn not_crawl_blacklist_regex() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website.with_blacklist_url(Some(Vec::from(["choosealicense.com".into()])));
    website.crawl().await;
    assert_eq!(website.links_visited.len(), 0);
}
9554
/// The generated user agent must never be empty.
/// Idiom fix: `assert!(!x)` instead of `assert_eq!(x, false)`
/// (clippy::bool_assert_comparison).
#[test]
#[cfg(feature = "ua_generator")]
fn randomize_website_agent() {
    assert!(!get_ua(false).is_empty());
}
9560
9561#[tokio::test]
9562#[cfg(not(feature = "decentralized"))]
9563async fn test_respect_robots_txt() {
9564 let mut website: Website = Website::new("https://stackoverflow.com");
9565 website.configuration.respect_robots_txt = true;
9566 website.configuration.user_agent = Some(Box::new("*".into()));
9567
9568 let (client, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
9569 website.setup().await;
9570
9571 website.configure_robots_parser(&client).await;
9572
9573 assert_eq!(website.configuration.delay, 0);
9574
9575 assert!(!&website
9576 .is_allowed(&"https://stackoverflow.com/posts/".into())
9577 .eq(&ProcessLinkStatus::Allowed));
9578
9579 let mut website_second: Website = Website::new("https://www.mongodb.com");
9581 website_second.configuration.respect_robots_txt = true;
9582 website_second.configuration.user_agent = Some(Box::new("bingbot".into()));
9583
9584 let (client_second, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
9585 website_second.setup().await;
9586 website_second.configure_robots_parser(&client_second).await;
9587
9588 assert!(!&website
9589 .is_allowed(&"https://www.mongodb.com/community/forums/auth/".into())
9590 .eq(&ProcessLinkStatus::Allowed));
9591
9592 }
9594
/// Integration test (network): enabling subdomains still visits
/// same-domain pages.
#[tokio::test]
#[cfg(not(feature = "decentralized"))]
async fn test_crawl_subdomains() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website.configuration.subdomains = true;
    website.crawl().await;
    assert!(
        website
            .links_visited
            .contains(&"https://choosealicense.com/licenses/".into()),
        "{:?}",
        website.links_visited
    );
}

/// Builder methods must produce the same configuration as setting the
/// fields directly.
#[tokio::test]
#[cfg(all(
    not(feature = "regex"),
    not(feature = "openai"),
    not(feature = "gemini")
))]
async fn test_with_configuration() {
    let mut website = Website::new("https://choosealicense.com");

    website
        .with_respect_robots_txt(true)
        .with_subdomains(true)
        .with_tld(false)
        .with_delay(0)
        .with_request_timeout(None)
        .with_http2_prior_knowledge(false)
        .with_user_agent(Some(crate::page::TEST_AGENT_NAME))
        .with_headers(None)
        .with_proxies(None);

    let mut configuration = Box::new(configuration::Configuration::new());

    configuration.respect_robots_txt = true;
    configuration.subdomains = true;
    configuration.tld = false;
    configuration.delay = 0;
    configuration.request_timeout = None;
    configuration.http2_prior_knowledge = false;
    configuration.user_agent = Some(Box::new(CompactString::new(crate::page::TEST_AGENT_NAME)));
    configuration.headers = None;
    configuration.proxies = None;

    assert!(
        website.configuration == configuration,
        "Left\n{:?}\n\nRight\n{:?}",
        website.configuration,
        configuration
    );
}
9649
/// Integration test (network, glob): expanded glob URLs crawl correctly.
#[tokio::test]
#[cfg(all(feature = "glob", not(feature = "decentralized")))]
async fn test_crawl_glob() {
    let mut website: Website =
        Website::new("https://choosealicense.com/licenses/{mit,apache-2.0,mpl-2.0}/");
    website.crawl().await;

    // Either scheme may be recorded depending on redirects.
    assert!(
        website
            .links_visited
            .contains(&"https://choosealicense.com/licenses/".into())
            || website
                .links_visited
                .contains(&"http://choosealicense.com/licenses/".into()),
        "{:?}",
        website.links_visited
    );
}

/// Integration test (network): TLD crawling still visits same-domain pages.
#[tokio::test]
#[cfg(not(feature = "decentralized"))]
async fn test_crawl_tld() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website.configuration.tld = true;
    website.crawl().await;

    assert!(
        website
            .links_visited
            .contains(&"https://choosealicense.com/licenses/".into()),
        "{:?}",
        website.links_visited
    );
}

/// Integration test (network, sync): every crawled page is delivered to the
/// subscriber exactly once.
#[tokio::test]
#[cfg(all(feature = "sync", not(feature = "decentralized")))]
async fn test_crawl_subscription() {
    let mut website: Website = Website::new("https://choosealicense.com");
    let mut rx2 = website.subscribe(100).unwrap();

    let join_handle = tokio::spawn(async move {
        let mut count = 0;

        while let Ok(_) = rx2.recv().await {
            count += 1;
        }
        count
    });

    website.crawl().await;
    website.unsubscribe();
    let website_links = website.get_links().len();
    let count = join_handle.await.unwrap();

    // NOTE(review): the failure message prints `true`, which is unhelpful —
    // consider printing `count` and `website_links` instead.
    assert!(count == website_links, "{:?}", true);
}
9709
/// Integration test (network, socks): crawling through a local SOCKS5 proxy
/// still reaches the licenses page.
/// Idiom fix: the manual found-flag loop is replaced with `Iterator::any`.
#[tokio::test]
#[cfg(all(feature = "socks", not(feature = "decentralized")))]
async fn test_crawl_proxy() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website
        .configuration
        .proxies
        .get_or_insert(Default::default())
        .push("socks5://127.0.0.1:1080".into());

    website.crawl().await;

    let license_found = website
        .get_links()
        .into_iter()
        .any(|link| link.as_ref().contains("/licenses/"));

    assert!(license_found, "{:?}", website.links_visited);
}
9734
/// Integration test (local server): visited links contain no duplicates.
#[tokio::test]
async fn test_link_duplicates() {
    // Helper: true when the iterator yields no repeated items.
    fn has_unique_elements<T>(iter: T) -> bool
    where
        T: IntoIterator,
        T::Item: Eq + std::hash::Hash,
    {
        let mut uniq = HashSet::new();
        iter.into_iter().all(move |x| uniq.insert(x))
    }

    let mut website: Website = Website::new("http://0.0.0.0:8000");
    website.crawl().await;

    assert!(has_unique_elements(website.links_visited.get_links()));
}

/// Integration test (network): a budget of 1 restricts the crawl to at most
/// one page.
#[tokio::test]
async fn test_crawl_budget() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website.with_budget(Some(HashMap::from([("*", 1), ("/licenses", 1)])));
    website.crawl().await;

    assert!(website.links_visited.len() <= 1);
}

/// Integration test (network, control): pausing stalls the crawl for at
/// least the pause duration, and the crawl still completes after resume.
#[tokio::test]
#[cfg(feature = "control")]
#[ignore]
async fn test_crawl_pause_resume() {
    use crate::utils::{pause, resume};

    let domain = "https://choosealicense.com/";
    let mut website: Website = Website::new(&domain);

    let start = tokio::time::Instant::now();

    // Pause immediately, then resume after 5 seconds.
    tokio::spawn(async move {
        pause(domain).await;
        tokio::time::sleep(Duration::from_millis(5000)).await;
        resume(domain).await;
    });

    website.crawl().await;

    let duration = start.elapsed();

    assert!(duration.as_secs() >= 5, "{:?}", duration);

    assert!(
        website
            .links_visited
            .contains(&"https://choosealicense.com/licenses/".into()),
        "{:?}",
        website.links_visited
    );
}

/// Integration test (network, control): shutdown stops the crawl almost
/// immediately.
#[cfg(feature = "control")]
#[ignore]
#[tokio::test]
async fn test_crawl_shutdown() {
    use crate::utils::shutdown;

    let domain = "https://spider.cloud/";
    let mut website: Website = Website::new(&domain);

    tokio::spawn(async move {
        shutdown(domain).await;
    });

    website.crawl().await;
    let links_visited_count = website.links_visited.len();

    assert!(links_visited_count <= 1, "{:?}", links_visited_count);
}

/// Integration test (network, cache): a cached re-crawl is substantially
/// faster than the fresh crawl. Timing-based, so inherently flaky.
#[tokio::test]
#[cfg(all(feature = "cache_request", not(feature = "decentralized")))]
async fn test_cache() {
    let domain = "https://choosealicense.com/";
    let mut website: Website = Website::new(&domain);
    website.configuration.cache = true;

    let fresh_start = tokio::time::Instant::now();
    website.crawl().await;
    let fresh_duration = fresh_start.elapsed();

    let cached_start = tokio::time::Instant::now();
    website.crawl().await;
    let cached_duration = cached_start.elapsed();

    // Expect at least a 5x speedup from the request cache.
    assert!(
        fresh_duration.as_millis() > cached_duration.as_millis() * 5,
        "{:?}",
        cached_duration
    );
}
9836
#[cfg(test)]
mod tests {
    use super::*;

    /// The rotator's raw index advances once per `next` call.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_client_rotator_round_robin() {
        let clients: Vec<Client> = (0..3)
            .map(|_| {
                #[cfg(not(feature = "cache_request"))]
                {
                    // SAFETY: default client construction is assumed infallible here.
                    unsafe { crate::ClientBuilder::new().build().unwrap_unchecked() }
                }
                #[cfg(feature = "cache_request")]
                {
                    reqwest_middleware::ClientBuilder::new(unsafe {
                        reqwest::ClientBuilder::new().build().unwrap_unchecked()
                    })
                    .build()
                }
            })
            .collect();

        let rotator = ClientRotator::new(clients);
        assert_eq!(rotator.len(), 3);
        assert!(!rotator.is_empty());

        // Four advances on a pool of three: the stored index keeps counting.
        // NOTE(review): wrapping is presumably applied on access — confirm.
        let _ = rotator.next();
        let _ = rotator.next();
        let _ = rotator.next();
        let _ = rotator.next();
        let current_idx = rotator.index.load(Ordering::Relaxed);
        assert_eq!(current_idx, 4);
    }

    /// Multiple proxies produce a rotator covering each proxy.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_with_multiple_proxies() {
        let mut website = Website::new("http://example.com");
        website.configuration.with_proxies(Some(vec![
            "http://proxy1.example.com:8080".to_string(),
            "http://proxy2.example.com:8080".to_string(),
            "http://proxy3.example.com:8080".to_string(),
        ]));

        let rotator = website.build_rotated_clients();
        assert!(rotator.is_some(), "Should build rotator with 3 proxies");
        let rotator = rotator.unwrap();
        assert_eq!(rotator.len(), 3);
    }

    /// A single proxy needs no rotation.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_single_proxy_returns_none() {
        let mut website = Website::new("http://example.com");
        website.configuration.with_proxies(Some(vec![
            "http://proxy1.example.com:8080".to_string(),
        ]));

        let rotator = website.build_rotated_clients();
        assert!(
            rotator.is_none(),
            "Should not build rotator with only 1 proxy"
        );
    }

    /// No proxies configured: no rotator at all.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_no_proxies_returns_none() {
        let website = Website::new("http://example.com");
        let rotator = website.build_rotated_clients();
        assert!(
            rotator.is_none(),
            "Should not build rotator with no proxies"
        );
    }
}