1use crate::black_list::contains;
2use crate::client::redirect::Policy;
3use crate::compact_str::CompactString;
4use crate::configuration::{
5 self, get_ua, AutomationScriptsMap, Configuration, ExecutionScriptsMap, RedirectPolicy,
6 SerializableHeaderMap,
7};
8use crate::{page::build, utils::PageResponse};
9
10use crate::features::chrome_common::RequestInterceptConfiguration;
11#[cfg(feature = "disk")]
12use crate::features::disk::DatabaseHandler;
13use crate::packages::robotparser::parser::RobotFileParser;
14use crate::page::{
15 AntiBotTech, Page, PageLinkBuildSettings, CHROME_UNKNOWN_STATUS_ERROR, UNKNOWN_STATUS_ERROR,
16};
17use crate::utils::abs::{convert_abs_url, parse_absolute_url};
18use crate::utils::interner::ListBucket;
19use crate::utils::{
20 crawl_duration_expired, emit_log, emit_log_shutdown, get_path_from_url, get_semaphore,
21 networking_capable, prepare_url, setup_website_selectors, spawn_set, AllowedDomainTypes,
22};
23use crate::{CaseInsensitiveString, Client, ClientBuilder, RelativeSelectors};
24#[cfg(feature = "cron")]
25use async_job::{async_trait, Job, Runner};
26use hashbrown::{HashMap, HashSet};
27use reqwest::header::REFERER;
28use reqwest::StatusCode;
29use std::fmt;
30use std::net::IpAddr;
31use std::sync::atomic::{AtomicBool, AtomicI8, AtomicUsize, Ordering};
32use std::sync::Arc;
33use std::time::{Duration, Instant};
34use tokio::{
35 sync::{broadcast, Semaphore},
36 task::JoinSet,
37 time::Interval,
38};
39use tokio_stream::StreamExt;
40use url::Url;
41
42#[cfg(feature = "cache_request")]
43use http_cache_reqwest::{Cache, CacheMode, HttpCache, HttpCacheOptions};
44
45#[cfg(feature = "cache_request")]
46pub use http_global_cache::CACACHE_MANAGER;
47
/// Upper bound applied to retry/backoff sleeps between failed requests.
const BACKOFF_MAX_DURATION: tokio::time::Duration = tokio::time::Duration::from_secs(60);
50
51pub fn calc_limits(multiplier: usize) -> usize {
53 let logical = num_cpus::get();
54 let physical = num_cpus::get_physical();
55
56 let sem_limit = if logical > physical {
57 (logical) / (physical)
58 } else {
59 logical
60 };
61
62 let (sem_limit, sem_max) = if logical == physical {
63 (sem_limit * physical, 30 * multiplier)
64 } else {
65 (sem_limit * 2, 20 * multiplier)
66 };
67
68 sem_limit.max(sem_max)
69}
70
/// Exact phrases that identify a benign "enable JavaScript" interstitial page.
static JS_SAFE_CHALLENGE_PATTERNS: &[&str] = &[
    r#"Enable JavaScript and cookies to continue"#,
    r#"To continue, please enable JavaScript in your browser settings"#,
    r#"Please enable JavaScript to view the page content"#,
];
77
78pub fn is_safe_javascript_challenge(page: &Page) -> bool {
80 let page = page.get_html_bytes_u8();
81
82 let page_size = page.len();
83
84 if page_size == 0 || page_size > 10_000 {
85 return false;
86 }
87
88 AC_JS_CHALLENGE.find(page).is_some()
89}
90
/// Bind the client to a specific network interface on platforms where the
/// HTTP stack exposes `ClientBuilder::interface`.
#[cfg(all(
    any(
        target_os = "android",
        target_os = "fuchsia",
        target_os = "illumos",
        target_os = "ios",
        target_os = "linux",
        target_os = "macos",
        target_os = "solaris",
        target_os = "tvos",
        target_os = "visionos",
        target_os = "watchos",
    ),
    any(not(feature = "wreq"), feature = "cache_request")
))]
pub fn set_interface(client: ClientBuilder, network_interface: &str) -> ClientBuilder {
    client.interface(network_interface)
}
110
/// No-op fallback when interface binding is unsupported on the target
/// platform/feature combination; returns the builder unchanged.
#[cfg(not(any(
    all(feature = "wreq", not(feature = "cache_request")),
    target_os = "android",
    target_os = "fuchsia",
    target_os = "illumos",
    target_os = "ios",
    target_os = "linux",
    target_os = "macos",
    target_os = "solaris",
    target_os = "tvos",
    target_os = "visionos",
    target_os = "watchos",
)))]
pub fn set_interface(client: ClientBuilder, _interface: &str) -> ClientBuilder {
    client
}
128
lazy_static! {
    /// Matcher for the known JavaScript-challenge phrases.
    static ref AC_JS_CHALLENGE: aho_corasick::AhoCorasick = aho_corasick::AhoCorasick::new(JS_SAFE_CHALLENGE_PATTERNS).expect("safe challenges");
    /// Default number of concurrency permits derived from CPU topology.
    pub static ref DEFAULT_PERMITS: usize = calc_limits(1);
    /// Shared crawl semaphore; `SEMAPHORE_MULTIPLIER` scales the permit count.
    pub(crate) static ref SEM_SHARED: Arc<Semaphore> = {
        let base_limit = match std::env::var("SEMAPHORE_MULTIPLIER") {
            Ok(multiplier) => match multiplier.parse::<isize>() {
                // wrapping_mul + max(1) guard against overflow and
                // zero/negative multipliers.
                Ok(parsed_value) => (*DEFAULT_PERMITS as isize)
                    .wrapping_mul(parsed_value)
                    .max(1) as usize,
                Err(_) => *DEFAULT_PERMITS,
            },
            _ => *DEFAULT_PERMITS,
        };
        Arc::new(Semaphore::const_new(base_limit))
    };
    /// Max visited links held in memory before spilling to disk
    /// (env override: `LINKS_VISITED_MEMORY_LIMIT`).
    pub(crate) static ref LINKS_VISITED_MEMORY_LIMIT: usize = {
        const DEFAULT_LIMIT: usize = 15_000;

        match std::env::var("LINKS_VISITED_MEMORY_LIMIT") {
            Ok(limit) => limit.parse::<usize>().unwrap_or(DEFAULT_LIMIT),
            _ => DEFAULT_LIMIT
        }
    };
    /// Wildcard key ("*") used for site-wide crawl budgets.
    static ref WILD_CARD_PATH: CaseInsensitiveString = CaseInsensitiveString::from("*");
}
157
#[cfg(not(feature = "decentralized"))]
lazy_static! {
    /// Crawl semaphore; `SEMAPHORE_MULTIPLIER` scales the base permit count.
    static ref SEM: Semaphore = {
        let base_limit = calc_limits(1);

        let base_limit = match std::env::var("SEMAPHORE_MULTIPLIER") {
            Ok(multiplier) => match multiplier.parse::<isize>() {
                // max(1) guards against zero/negative multipliers.
                Ok(parsed_value) => (base_limit as isize * parsed_value).max(1) as usize,
                Err(_) => base_limit,
            },
            _ => base_limit,
        };

        Semaphore::const_new(base_limit)
    };
}
175
#[cfg(feature = "decentralized")]
lazy_static! {
    /// Unique worker endpoints from `SPIDER_WORKER_SCRAPER` and
    /// `SPIDER_WORKER` (comma separated, with localhost defaults).
    static ref WORKERS: HashSet<String> = {
        let mut set: HashSet<_> = HashSet::new();

        for worker in std::env::var("SPIDER_WORKER_SCRAPER")
            .unwrap_or_else(|_| "http://127.0.0.1:3031".to_string())
            .split(",")
        {
            set.insert(worker.to_string());
        }

        for worker in std::env::var("SPIDER_WORKER")
            .unwrap_or_else(|_| "http://127.0.0.1:3030".to_string())
            .split(",")
        {
            set.insert(worker.to_string());
        }

        set
    };
    /// Semaphore sized to the worker pool.
    static ref SEM: Semaphore = {
        let sem_limit = calc_limits(3);
        Semaphore::const_new(sem_limit * WORKERS.len())
    };
}
203
/// The overall progress/outcome state of a crawl run.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CrawlStatus {
    /// Crawl has not started yet.
    #[default]
    Start,
    /// Crawl finished or is waiting for work.
    Idle,
    /// Crawl is in progress.
    Active,
    /// Crawl was blocked by the target.
    Blocked,
    /// Crawl was blocked by the built-in firewall/blocklist check.
    FirewallBlocked,
    /// Target responded with a server error.
    ServerError,
    /// Could not connect to the target.
    ConnectError,
    /// Target rate-limited the crawl.
    RateLimited,
    /// Target returned no content.
    Empty,
    /// Target url was invalid.
    Invalid,
    /// Crawl was shut down via the control handle.
    #[cfg(feature = "control")]
    Shutdown,
    /// Crawl was paused via the control handle.
    #[cfg(feature = "control")]
    Paused,
}
238
/// Decision for a single discovered link.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProcessLinkStatus {
    /// The link may be crawled.
    #[default]
    Allowed,
    /// The link is blocked (visited, deny-listed, robots, or depth).
    Blocked,
    /// The crawl budget for this link's path is exhausted.
    BudgetExceeded,
}
251
/// The kind of job to run on a cron schedule.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CronType {
    /// Crawl collecting links only.
    #[default]
    Crawl,
    /// Scrape collecting page content as well.
    Scrape,
}
262
/// Meta information detected about a website during the crawl.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, strum::EnumString, strum::Display)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum WebsiteMetaInfo {
    /// The site requires JavaScript to render content.
    RequiresJavascript,
    /// Apache served a 403 page.
    Apache403,
    /// OpenResty served a 403 page.
    OpenResty403,
    /// No special meta information detected.
    #[default]
    None,
}
277
/// Hook invoked for each discovered link; may rewrite the link and its
/// optional associated value before processing.
pub type OnLinkFindCallback = Arc<
    dyn Fn(CaseInsensitiveString, Option<String>) -> (CaseInsensitiveString, Option<String>)
        + Send
        + Sync,
>;
284
/// Closure form of the should-crawl predicate; blanket-implemented for all
/// `Fn(&Page) -> bool + Send + Sync + 'static` closures.
pub trait OnShouldCrawlClosure: Fn(&Page) -> bool + Send + Sync + 'static {}
impl<F: Fn(&Page) -> bool + Send + Sync + 'static> OnShouldCrawlClosure for F {}
288
/// Predicate deciding whether a fetched page should be processed further.
#[derive(Clone)]
pub enum OnShouldCrawlCallback {
    /// Plain function pointer.
    Fn(fn(&Page) -> bool),
    /// Captured closure behind an `Arc`.
    Closure(Arc<dyn OnShouldCrawlClosure>),
}
298impl OnShouldCrawlCallback {
299 fn call(&self, page: &Page) -> bool {
300 match self {
301 Self::Fn(func) => func(page),
302 Self::Closure(closure) => closure(page),
303 }
304 }
305}
306
/// Round-robin rotation over a pool of HTTP clients.
#[derive(Clone)]
pub struct ClientRotator {
    // The pool of clients rotated through.
    clients: Vec<Client>,
    // Monotonic counter used to select the next client.
    index: Arc<AtomicUsize>,
}
314
315impl ClientRotator {
316 pub fn new(clients: Vec<Client>) -> Self {
318 Self {
319 clients,
320 index: Arc::new(AtomicUsize::new(0)),
321 }
322 }
323
324 pub fn next(&self) -> &Client {
326 let idx = self.index.fetch_add(1, Ordering::Relaxed) % self.clients.len();
327 &self.clients[idx]
328 }
329
330 pub fn len(&self) -> usize {
332 self.clients.len()
333 }
334
335 pub fn is_empty(&self) -> bool {
337 self.clients.is_empty()
338 }
339
340 #[cfg(feature = "hedge")]
343 pub fn next_pair(&self) -> (&Client, Option<&Client>) {
344 let len = self.clients.len();
345 if len <= 1 {
346 return (&self.clients[0], None);
347 }
348 let idx = self.index.fetch_add(2, Ordering::Relaxed);
349 let primary_idx = idx % len;
350 let hedge_idx = (idx + 1) % len;
351 (&self.clients[primary_idx], Some(&self.clients[hedge_idx]))
352 }
353}
354
/// Represents a website to crawl, gathering all links and/or page content.
#[derive(Clone, Default)]
pub struct Website {
    /// Configuration properties for the crawl.
    pub configuration: Box<Configuration>,
    /// Callback fired on each link found; may rewrite the link.
    pub on_link_find_callback: Option<OnLinkFindCallback>,
    /// Predicate deciding whether a fetched page should be processed further.
    pub on_should_crawl_callback: Option<OnShouldCrawlCallback>,
    /// Identifier scoping this crawl (prefixes the target id).
    pub crawl_id: Box<String>,
    /// Extra free-form information attached to the crawl.
    #[cfg(feature = "extra_information")]
    pub extra_info: Option<Box<String>>,
    // Pre-supplied HTML for the seed page — presumably used instead of
    // fetching it; TODO confirm against the crawl entry points.
    seed_html: Option<String>,
    // All links visited, kept in memory.
    links_visited: Box<ListBucket>,
    // Page content signatures (hashes) seen so far.
    signatures: Box<HashSet<u64>>,
    // Links discovered outside the normal crawl flow.
    extra_links: Box<HashSet<CaseInsensitiveString>>,
    // Pages gathered when scraping (None when crawling link-only).
    pages: Option<Vec<Page>>,
    // robots.txt parser, populated when `respect_robots_txt` is set.
    robot_file_parser: Option<Box<RobotFileParser>>,
    // The target url.
    url: Box<CaseInsensitiveString>,
    // Absolute parsed form of `url`, when it parses.
    domain_parsed: Option<Box<Url>>,
    // Broadcast channel for subscribing to crawled pages.
    channel: Option<(broadcast::Sender<Page>, Arc<broadcast::Receiver<Page>>)>,
    // Guard coordinating channel lifetime/shutdown.
    channel_guard: Option<ChannelGuard>,
    // Broadcast channel for queueing extra urls into the crawl.
    channel_queue: Option<(broadcast::Sender<String>, Arc<broadcast::Receiver<String>>)>,
    // Current crawl status.
    status: CrawlStatus,
    // Status code returned by the initial page.
    initial_status_code: StatusCode,
    // Anti-bot technology detected on the initial page.
    initial_anti_bot_tech: AntiBotTech,
    // HTML length of the initial page.
    initial_html_length: usize,
    // Whether the initial page tripped the WAF heuristics.
    initial_page_waf_check: bool,
    // Whether the initial page indicated a retry should occur.
    initial_page_should_retry: bool,
    // Request a shutdown of the crawl at the next checkpoint.
    shutdown: bool,
    // The HTTP client used for requests.
    client: Option<Client>,
    // Optional rotation over multiple clients.
    client_rotator: Option<Arc<ClientRotator>>,
    // SQLite-backed storage for visited links/signatures.
    #[cfg(feature = "disk")]
    sqlite: Option<Box<DatabaseHandler>>,
    // Whether disk storage is enabled for this crawl.
    #[cfg(feature = "disk")]
    enable_sqlite: bool,
    // Whether the subscription sender has been configured.
    send_configured: bool,
    // Meta information derived from the crawl (e.g. requires JavaScript).
    website_meta_info: WebsiteMetaInfo,
    // Skip fetching the initial page.
    skip_initial: bool,
    /// Shared cookie jar for the crawl.
    #[cfg(feature = "cookies")]
    pub cookie_jar: Arc<crate::client::cookie::Jar>,
}
435
436impl fmt::Debug for Website {
437 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
438 let domain_str = self.domain_parsed.as_ref().map(|u| u.as_str().to_owned());
439 let pages_len = self.pages.as_ref().map(|p| p.len()).unwrap_or(0);
440
441 let mut ds = f.debug_struct("Website");
442
443 ds.field("url", &self.url.as_ref())
444 .field("crawl_id", &self.crawl_id)
445 .field("domain_parsed", &domain_str)
446 .field(
448 "on_link_find_callback",
449 &self.on_link_find_callback.is_some(),
450 )
451 .field(
452 "on_should_crawl_callback",
453 &self.on_should_crawl_callback.is_some(),
454 )
455 .field("status", &self.status)
457 .field("shutdown", &self.shutdown)
458 .field("extra_links_len", &self.extra_links.len())
459 .field("signatures_len", &self.signatures.len())
460 .field("pages_len", &pages_len)
461 .field("channel_present", &self.channel.is_some())
463 .field("channel_queue_present", &self.channel_queue.is_some())
464 .field("client_present", &self.client.is_some())
465 .field("initial_status_code", &self.initial_status_code)
467 .field("initial_html_length", &self.initial_html_length)
468 .field("initial_anti_bot_tech", &self.initial_anti_bot_tech)
469 .field("initial_page_waf_check", &self.initial_page_waf_check)
470 .field("initial_page_should_retry", &self.initial_page_should_retry)
471 .field("send_configured", &self.send_configured)
473 .field("website_meta_info", &self.website_meta_info)
474 .field("skip_initial", &self.skip_initial);
475
476 #[cfg(feature = "disk")]
477 {
478 ds.field("sqlite_present", &self.sqlite.is_some())
479 .field("enable_sqlite", &self.enable_sqlite);
480 }
481
482 ds.finish()
483 }
484}
485
486impl Website {
487 fn _new(url: &str, check_firewall: bool) -> Self {
489 let url = url.trim();
490 let url: Box<CaseInsensitiveString> = if networking_capable(url) {
491 CaseInsensitiveString::new(&url).into()
492 } else {
493 CaseInsensitiveString::new(&prepare_url(url)).into()
494 };
495
496 let domain_parsed: Option<Box<Url>> = parse_absolute_url(&url);
497 let mut status = CrawlStatus::Start;
498
499 if let Some(u) = &domain_parsed {
500 if check_firewall && crate::utils::abs::block_website(u) {
501 status = CrawlStatus::FirewallBlocked;
502 }
503 }
504
505 Self {
506 configuration: Configuration::new().into(),
507 status,
508 domain_parsed,
509 url,
510 #[cfg(feature = "disk")]
511 enable_sqlite: true,
512 ..Default::default()
513 }
514 }
515
    /// Initialize a Website with a start link to crawl.
    pub fn new(url: &str) -> Self {
        Website::_new(url, true)
    }

    /// Initialize a Website with a start link, optionally skipping the
    /// built-in firewall/blocklist check on the domain.
    pub fn new_with_firewall(url: &str, check_firewall: bool) -> Self {
        Website::_new(url, check_firewall)
    }
525
    /// Create a fresh database handler scoped to this website's target id.
    #[cfg(feature = "disk")]
    pub fn setup_database_handler(&self) -> Box<DatabaseHandler> {
        Box::new(DatabaseHandler::new(&Some(self.target_id())))
    }

    /// Attach an externally created (shared) database handler.
    #[cfg(feature = "disk")]
    pub fn setup_shared_db(&mut self, db: Box<DatabaseHandler>) {
        self.sqlite = Some(db)
    }

    /// Lazily create the sqlite handler if one is not already attached.
    #[cfg(feature = "disk")]
    pub fn setup_sqlite(&mut self) {
        if self.sqlite.is_none() {
            self.sqlite = Some(self.setup_database_handler())
        }
    }
545
546 pub fn set_url(&mut self, url: &str) -> &mut Self {
548 let url = if url.starts_with(' ') || url.ends_with(' ') {
549 url.trim()
550 } else {
551 url
552 };
553
554 let domain: Box<CaseInsensitiveString> = if networking_capable(url) {
555 CaseInsensitiveString::new(&url).into()
556 } else {
557 CaseInsensitiveString::new(&prepare_url(url)).into()
558 };
559
560 self.domain_parsed = parse_absolute_url(&domain);
561 self.url = domain;
562 self
563 }
564
    /// Set the url without re-parsing the absolute domain.
    pub fn set_url_only(&mut self, url: &str) -> &mut Self {
        self.url = CaseInsensitiveString::new(&url).into();
        self
    }

    /// Unique identifier for this crawl target (crawl id + url).
    pub fn target_id(&self) -> String {
        string_concat!(self.crawl_id, self.url.inner())
    }
575
576 pub fn single_page(&self) -> bool {
578 match &self.configuration.inner_budget {
579 Some(b) => match b.get(&*WILD_CARD_PATH) {
580 Some(b) => b.eq(&1),
581 _ => false,
582 },
583 _ => false,
584 }
585 }
586
587 #[cfg(feature = "disk")]
589 pub fn setup_disk(&mut self) {
590 if self.enable_sqlite && self.sqlite.is_none() {
591 self.setup_sqlite();
592 }
593 if self.configuration.shared {
595 if let Some(sqlite) = self.sqlite.as_mut() {
596 sqlite.seeded = true;
597 }
599 }
600 }
601
602 #[cfg(feature = "disk")]
603 pub fn set_disk_persistance(&mut self, persist: bool) -> &mut Self {
605 if self.enable_sqlite && self.sqlite.is_some() {
606 if let Some(sqlite) = self.sqlite.as_mut() {
607 sqlite.persist = persist;
608 }
609 }
610 self
611 }
612
    /// No-op when the `disk` feature is not enabled.
    #[cfg(not(feature = "disk"))]
    pub fn setup_disk(&mut self) {}

    /// The robots.txt parser, if one was configured.
    pub fn get_robots_parser(&self) -> &Option<Box<RobotFileParser>> {
        &self.robot_file_parser
    }
621
    /// Whether the site appears to require JavaScript to render content.
    pub fn get_requires_javascript(&self) -> bool {
        self.website_meta_info == WebsiteMetaInfo::RequiresJavascript
    }

    /// Meta information gathered about the website during the crawl.
    pub fn get_website_meta_info(&self) -> &WebsiteMetaInfo {
        &self.website_meta_info
    }
631
    /// Whether the url has NOT been recorded in the disk store
    /// (true = new/allowed).
    #[cfg(feature = "disk")]
    pub async fn is_allowed_disk(&self, url_to_check: &str) -> bool {
        match &self.sqlite {
            Some(sqlite) => {
                // Until the handler is seeded the disk store is not
                // authoritative; treat everything as new.
                if !sqlite.ready() {
                    true
                } else {
                    let db_pool = sqlite.get_db_pool().await;
                    // `allowed` actually holds the *exists* result; the
                    // allow decision is its inverse.
                    let allowed = sqlite.url_exists(db_pool, url_to_check).await;

                    !allowed
                }
            }
            _ => true,
        }
    }

    /// Always allowed when disk storage is disabled.
    #[cfg(not(feature = "disk"))]
    pub async fn is_allowed_disk(&self, _url_to_check: &str) -> bool {
        true
    }
655
    /// Whether the signature has NOT been recorded in the disk store
    /// (true = new/allowed).
    #[cfg(feature = "disk")]
    pub async fn is_allowed_signature_disk(&self, signature_to_check: u64) -> bool {
        match &self.sqlite {
            Some(sqlite) => {
                // Until the handler is seeded the disk store is not
                // authoritative; treat everything as new.
                if !sqlite.ready() {
                    true
                } else {
                    let db_pool = sqlite.get_db_pool().await;

                    !sqlite.signature_exists(db_pool, signature_to_check).await
                }
            }
            _ => true,
        }
    }

    /// Always allowed when disk storage is disabled.
    #[cfg(not(feature = "disk"))]
    pub async fn is_allowed_signature_disk(&self, _signature_to_check: u64) -> bool {
        true
    }
678
679 pub async fn is_signature_allowed(&self, signature: u64) -> bool {
681 !self.signatures.contains(&signature) || self.is_allowed_signature_disk(signature).await
682 }
683
    /// Clear all persisted urls/signatures from the disk store.
    #[cfg(feature = "disk")]
    pub async fn clear_disk(&self) {
        if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                // Best-effort: failures clearing the table are ignored.
                let _ = DatabaseHandler::clear_table(sqlite.get_db_pool().await).await;
            }
        }
    }

    /// No-op when the `disk` feature is not enabled.
    #[cfg(not(feature = "disk"))]
    pub async fn clear_disk(&self) {}

    /// Whether a shared disk handler is in use for this crawl.
    #[cfg(feature = "disk")]
    pub(crate) fn shared_disk_enabled(&self) -> bool {
        self.configuration.shared && self.sqlite.is_some()
    }
703
    /// Persist a visited url to the disk store.
    #[cfg(feature = "disk")]
    pub async fn insert_url_disk(&self, new_url: &str) {
        if let Some(sqlite) = &self.sqlite {
            sqlite.insert_url(sqlite.get_db_pool().await, new_url).await
        }
    }

    /// Persist a page signature to the disk store.
    #[cfg(feature = "disk")]
    pub async fn insert_signature_disk(&self, signature: u64) {
        if let Some(sqlite) = &self.sqlite {
            sqlite
                .insert_signature(sqlite.get_db_pool().await, signature)
                .await
        }
    }
721
    /// Track a visited link, spilling to disk under memory pressure or when
    /// the in-memory limit is exceeded.
    #[cfg(feature = "disk")]
    pub async fn insert_link(&mut self, new_url: CaseInsensitiveString) {
        // Global memory pressure level — 1 and 2 appear to mean
        // elevated/critical; TODO confirm against detect_system.
        let mem_load = crate::utils::detect_system::get_global_memory_state().await;
        let beyond_memory_limits = self.links_visited.len() >= *LINKS_VISITED_MEMORY_LIMIT;
        let seed_check = mem_load == 2 || mem_load == 1 || beyond_memory_limits;

        // First spill: seed the disk store with the current in-memory set.
        if seed_check {
            let mut seeded = false;
            if let Some(sqlite) = &self.sqlite {
                if !sqlite.ready() {
                    let _ = self.seed().await;
                    seeded = true;
                }
            }
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.set_seeded(seeded);
            }
        }

        if mem_load == 2 || beyond_memory_limits || self.shared_disk_enabled() {
            // Critical pressure, over the limit, or shared store: go to disk.
            self.insert_url_disk(&new_url).await
        } else if mem_load == 1 {
            // Elevated pressure: keep only a small working set in memory.
            if self.links_visited.len() <= 100 {
                self.links_visited.insert(new_url);
            } else {
                self.insert_url_disk(&new_url).await
            }
        } else {
            self.links_visited.insert(new_url);
        }
    }

    /// Track a visited link in memory.
    #[cfg(not(feature = "disk"))]
    pub async fn insert_link(&mut self, link: CaseInsensitiveString) {
        self.links_visited.insert(link);
    }
760
    /// Track a page signature, spilling to disk under memory pressure or
    /// when the in-memory limit is exceeded (mirrors `insert_link`).
    #[cfg(feature = "disk")]
    pub async fn insert_signature(&mut self, new_signature: u64) {
        // Global memory pressure level — 1 and 2 appear to mean
        // elevated/critical; TODO confirm against detect_system.
        let mem_load = crate::utils::detect_system::get_global_memory_state().await;
        let beyond_memory_limits = self.signatures.len() >= *LINKS_VISITED_MEMORY_LIMIT;
        let seed_check = mem_load == 2 || mem_load == 1 || beyond_memory_limits;

        // First spill: seed the disk store with the current in-memory set.
        if seed_check {
            let mut seeded = false;
            if let Some(sqlite) = &self.sqlite {
                if !sqlite.ready() {
                    let _ = self.seed().await;
                    seeded = true;
                }
            }
            if let Some(sqlite) = self.sqlite.as_mut() {
                sqlite.set_seeded(seeded);
            }
        }

        if mem_load == 2 || beyond_memory_limits || self.shared_disk_enabled() {
            // Critical pressure, over the limit, or shared store: go to disk.
            self.insert_signature_disk(new_signature).await
        } else if mem_load == 1 {
            // Elevated pressure: keep only a small working set in memory.
            if self.signatures.len() <= 100 {
                self.signatures.insert(new_signature);
            } else {
                self.insert_signature_disk(new_signature).await
            }
        } else {
            self.signatures.insert(new_signature);
        }
    }

    /// Track a page signature in memory.
    #[cfg(not(feature = "disk"))]
    pub async fn insert_signature(&mut self, new_signature: u64) {
        self.signatures.insert(new_signature);
    }
799
    /// Seed the disk store from the in-memory visited set.
    #[cfg(feature = "disk")]
    pub async fn seed(&mut self) -> Result<(), sqlx::Error> {
        let links = self.get_links();

        if let Some(sqlite) = &self.sqlite {
            if let Ok(links) = sqlite.seed(sqlite.get_db_pool().await, links).await {
                // Replace the in-memory set with whatever the handler
                // returned — presumably the subset to keep hot; confirm in
                // DatabaseHandler::seed.
                self.links_visited.clear();

                for link in links {
                    self.links_visited.insert(link);
                }

                if let Some(sqlite) = self.sqlite.as_mut() {
                    sqlite.seeded = true;
                }
            }
        }

        Ok(())
    }
821
    /// Cooperative pause/shutdown gate used between crawl steps.
    ///
    /// Returns `true` to keep crawling, or `false` after awaiting the
    /// provided `shutdown` future.
    async fn handle_process<T>(
        &self,
        handle: &Option<Arc<AtomicI8>>,
        interval: &mut Interval,
        shutdown: T,
    ) -> bool
    where
        T: std::future::Future<Output = ()>,
    {
        if self.shutdown {
            (shutdown).await;
            false
        } else {
            match handle.as_ref() {
                Some(handle) => {
                    // State 1: paused — poll the interval until it changes.
                    while handle.load(Ordering::Relaxed) == 1 {
                        interval.tick().await;
                    }
                    // State 2: shutdown requested.
                    if handle.load(Ordering::Relaxed) == 2 {
                        (shutdown).await;
                        false
                    } else {
                        true
                    }
                }
                _ => true,
            }
        }
    }
852
853 #[inline]
862 #[cfg(not(feature = "regex"))]
863 pub fn is_allowed(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
864 let status = self.is_allowed_budgetless(link);
865
866 if status.eq(&ProcessLinkStatus::Allowed) && self.is_over_budget(link) {
867 return ProcessLinkStatus::BudgetExceeded;
868 }
869
870 status
871 }
872
873 #[inline]
882 #[cfg(feature = "regex")]
883 pub fn is_allowed(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
884 let status = self.is_allowed_budgetless(link);
885
886 if status.eq(&ProcessLinkStatus::Allowed) && self.is_over_budget(link) {
887 return ProcessLinkStatus::BudgetExceeded;
888 }
889 status
890 }
891
    /// Whether the link may be crawled, ignoring crawl budgets
    /// (visited-set, allow/deny lists, robots, and depth checks only).
    #[inline]
    #[cfg(not(feature = "regex"))]
    pub fn is_allowed_budgetless(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
        if self.links_visited.contains(link) {
            ProcessLinkStatus::Blocked
        } else {
            let status = self.is_allowed_default(link.inner());

            // Depth violations are reported as Blocked, not BudgetExceeded.
            if status.eq(&ProcessLinkStatus::Allowed) && self.is_over_depth(link) {
                return ProcessLinkStatus::Blocked;
            }

            status
        }
    }

    /// Whether the link may be crawled, ignoring crawl budgets (regex build).
    #[inline]
    #[cfg(feature = "regex")]
    pub fn is_allowed_budgetless(&mut self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
        if self.links_visited.contains(link) {
            ProcessLinkStatus::Blocked
        } else {
            let status = self.is_allowed_default(link);
            // Depth violations are reported as Blocked, not BudgetExceeded.
            if status.eq(&ProcessLinkStatus::Allowed) && self.is_over_depth(link) {
                return ProcessLinkStatus::Blocked;
            }
            status
        }
    }
935
    /// Whether the link passes the whitelist, blacklist, and robots.txt
    /// checks (regex build).
    #[inline]
    #[cfg(feature = "regex")]
    pub fn is_allowed_default(&self, link: &CaseInsensitiveString) -> ProcessLinkStatus {
        let blacklist = self.configuration.get_blacklist_compiled();
        let whitelist = self.configuration.get_whitelist_compiled();

        // A non-empty whitelist blocks anything it does not match.
        let blocked_whitelist = !whitelist.is_empty() && !contains(whitelist, link.inner());
        let blocked_blacklist = !blacklist.is_empty() && contains(blacklist, link.inner());

        if blocked_whitelist || blocked_blacklist || !self.is_allowed_robots(link.as_ref()) {
            ProcessLinkStatus::Blocked
        } else {
            ProcessLinkStatus::Allowed
        }
    }

    /// Whether the link passes the whitelist, blacklist, and robots.txt
    /// checks.
    #[inline]
    #[cfg(not(feature = "regex"))]
    pub fn is_allowed_default(&self, link: &CompactString) -> ProcessLinkStatus {
        let whitelist = self.configuration.get_whitelist_compiled();
        let blacklist = self.configuration.get_blacklist_compiled();

        // A non-empty whitelist blocks anything it does not match.
        let blocked_whitelist = !whitelist.is_empty() && !contains(whitelist, link);
        let blocked_blacklist = !blacklist.is_empty() && contains(blacklist, link);

        if blocked_whitelist || blocked_blacklist || !self.is_allowed_robots(link) {
            ProcessLinkStatus::Blocked
        } else {
            ProcessLinkStatus::Allowed
        }
    }
977
    /// Whether robots.txt permits fetching the link. Always true when
    /// `respect_robots_txt` is off or no parser has been configured.
    pub fn is_allowed_robots(&self, link: &str) -> bool {
        if self.configuration.respect_robots_txt {
            if let Some(r) = &self.robot_file_parser {
                return r.can_fetch(
                    // Fall back to the wildcard agent when none is set.
                    match &self.configuration.user_agent {
                        Some(ua) => ua,
                        _ => "*",
                    },
                    link,
                );
            }
        }

        true
    }
996
997 pub(crate) fn is_over_inner_depth_budget(&mut self, link: &CaseInsensitiveString) -> bool {
999 let mut over = false;
1000
1001 if let Some(segments) = get_path_from_url(link)
1002 .strip_prefix('/')
1003 .map(|remainder| remainder.split('/'))
1004 {
1005 let mut depth: usize = 0;
1006
1007 for _ in segments {
1008 depth = depth.saturating_add(1);
1009 if depth > self.configuration.depth_distance {
1010 over = true;
1011 break;
1012 }
1013 }
1014 }
1015
1016 over
1017 }
1018
1019 #[cfg(feature = "sitemap")]
1021 pub(crate) fn is_over_wild_budget(
1022 &self,
1023 budget: &Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
1024 ) -> bool {
1025 let exceeded_wild_budget = if self.configuration.wild_card_budgeting {
1026 match budget {
1027 Some(budget) => match budget.get(&*WILD_CARD_PATH) {
1028 Some(budget) => budget.abs_diff(0) == 1,
1029 _ => false,
1030 },
1031 _ => false,
1032 }
1033 } else {
1034 false
1035 };
1036 exceeded_wild_budget
1037 }
1038
    /// Whether the link exceeds its crawl budget. Decrements the matching
    /// budget counters as a side effect, so each call "spends" budget.
    pub(crate) fn is_over_inner_budget(&mut self, link: &CaseInsensitiveString) -> bool {
        match self.configuration.inner_budget.as_mut() {
            Some(budget) => {
                // Wildcard ("*") budget applies to every page site-wide.
                let exceeded_wild_budget = if self.configuration.wild_card_budgeting {
                    match budget.get_mut(&*WILD_CARD_PATH) {
                        Some(budget) => {
                            // A remaining count of exactly 1 marks exhaustion;
                            // otherwise spend one unit.
                            if budget.abs_diff(0) == 1 {
                                true
                            } else {
                                *budget -= 1;
                                false
                            }
                        }
                        _ => false,
                    }
                } else {
                    false
                };

                // Skip the per-path walk when only the wildcard entry exists.
                let skip_paths = self.configuration.wild_card_budgeting && budget.len() == 1;
                let has_depth_control = self.configuration.depth_distance > 0;

                if !skip_paths && !exceeded_wild_budget {
                    let path_segments = get_path_from_url(link)
                        .strip_prefix('/')
                        .map(|remainder| remainder.split('/'));

                    match path_segments {
                        Some(segments) => {
                            // Accumulate segments so deeper paths match their
                            // corresponding budget entries as we descend.
                            let mut joint_segment = CaseInsensitiveString::default();
                            let mut over = false;
                            let mut depth: usize = 0;

                            for seg in segments {
                                if has_depth_control {
                                    depth = depth.saturating_add(1);
                                    if depth > self.configuration.depth_distance {
                                        over = true;
                                        break;
                                    }
                                }

                                joint_segment.push_str(seg);

                                if budget.contains_key(&joint_segment) {
                                    if let Some(budget) = budget.get_mut(&joint_segment) {
                                        // Exhausted path budget blocks the link;
                                        // otherwise spend one unit and continue.
                                        if budget.abs_diff(0) == 0 || *budget == 0 {
                                            over = true;
                                            break;
                                        } else {
                                            *budget -= 1;
                                            continue;
                                        }
                                    }
                                }
                            }

                            over
                        }
                        _ => false,
                    }
                } else {
                    exceeded_wild_budget
                }
            }
            _ => false,
        }
    }
1111
    /// Whether the link exceeds the configured depth limit (0 disables).
    pub(crate) fn is_over_depth(&mut self, link: &CaseInsensitiveString) -> bool {
        self.configuration.depth_distance > 0 && self.is_over_inner_depth_budget(link)
    }

    /// Whether the link exceeds its crawl budget (spends budget on check).
    pub(crate) fn is_over_budget(&mut self, link: &CaseInsensitiveString) -> bool {
        self.is_over_inner_budget(link)
    }

    /// Give back one unit of wildcard budget (saturating).
    #[cfg(all(feature = "agent", feature = "serde"))]
    pub(crate) fn restore_wildcard_budget(&mut self) {
        if self.configuration.wild_card_budgeting {
            if let Some(budget) = self.configuration.inner_budget.as_mut() {
                if let Some(counter) = budget.get_mut(&*WILD_CARD_PATH) {
                    *counter = counter.saturating_add(1);
                }
            }
        }
    }
1133
    /// Number of visited links held in memory.
    pub fn size(&self) -> usize {
        self.links_visited.len()
    }

    /// Total visited links (memory only in this build).
    #[cfg(not(feature = "disk"))]
    pub async fn get_size(&self) -> usize {
        self.links_visited.len()
    }
1144
    /// Total visited links across the disk store and memory.
    #[cfg(feature = "disk")]
    pub async fn get_size(&self) -> usize {
        let disk_count = if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                let disk_count = DatabaseHandler::count_records(sqlite.get_db_pool().await).await;

                disk_count.unwrap_or_default() as usize
            } else {
                0
            }
        } else {
            0
        };

        let mut mem_count = self.links_visited.len();

        // NOTE(review): subtracting the limit assumes the first
        // LINKS_VISITED_MEMORY_LIMIT in-memory entries were seeded to disk
        // and would otherwise be double-counted — confirm this matches the
        // spill/seed behavior in insert_link.
        if mem_count >= *LINKS_VISITED_MEMORY_LIMIT {
            mem_count -= *LINKS_VISITED_MEMORY_LIMIT;
        }

        disk_count + mem_count
    }
1168
    /// Drain the extra links discovered outside the normal crawl flow.
    pub fn drain_extra_links(&mut self) -> hashbrown::hash_set::Drain<'_, CaseInsensitiveString> {
        self.extra_links.drain()
    }
1173
    /// Set the status code returned by the initial page.
    pub fn set_initial_status_code(&mut self, initial_status_code: StatusCode) {
        self.initial_status_code = initial_status_code;
    }

    /// Status code returned by the initial page.
    pub fn get_initial_status_code(&self) -> &StatusCode {
        &self.initial_status_code
    }

    /// Set the HTML length of the initial page.
    pub fn set_initial_html_length(&mut self, initial_html_length: usize) {
        self.initial_html_length = initial_html_length;
    }

    /// HTML length of the initial page.
    pub fn get_initial_html_length(&self) -> usize {
        self.initial_html_length
    }

    /// Set the anti-bot technology detected on the initial page.
    pub fn set_initial_anti_bot_tech(&mut self, initial_anti_bot_tech: AntiBotTech) {
        self.initial_anti_bot_tech = initial_anti_bot_tech;
    }

    /// Anti-bot technology detected on the initial page.
    pub fn get_initial_anti_bot_tech(&self) -> &AntiBotTech {
        &self.initial_anti_bot_tech
    }

    /// Set whether the initial page tripped the WAF heuristics.
    pub fn set_initial_page_waf_check(&mut self, initial_page_waf_check: bool) {
        self.initial_page_waf_check = initial_page_waf_check;
    }

    /// Whether the initial page tripped the WAF heuristics.
    pub fn get_initial_page_waf_check(&self) -> bool {
        self.initial_page_waf_check
    }

    /// Set whether the initial page indicated a retry should occur.
    pub fn set_initial_page_should_retry(&mut self, initial_page_should_retry: bool) {
        self.initial_page_should_retry = initial_page_should_retry;
    }

    /// Whether the initial page indicated a retry should occur.
    pub fn get_initial_page_should_retry(&self) -> bool {
        self.initial_page_should_retry
    }
1223
    /// Drain the visited-link symbols (interner-backed builds).
    #[cfg(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    ))]
    pub fn drain_links(
        &mut self,
    ) -> hashbrown::hash_set::Drain<'_, string_interner::symbol::SymbolUsize> {
        self.links_visited.drain()
    }

    /// Drain the visited links.
    #[cfg(not(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    )))]
    pub fn drain_links(&mut self) -> hashbrown::hash_set::Drain<'_, CaseInsensitiveString> {
        self.links_visited.drain()
    }

    /// Drain the page signatures (interner-backed builds).
    #[cfg(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    ))]
    pub fn drain_signatures(&mut self) -> hashbrown::hash_set::Drain<'_, u64> {
        self.signatures.drain()
    }

    /// Drain the page signatures.
    #[cfg(not(any(
        feature = "string_interner_bucket_backend",
        feature = "string_interner_string_backend",
        feature = "string_interner_buffer_backend",
    )))]
    pub fn drain_signatures(&mut self) -> hashbrown::hash_set::Drain<'_, u64> {
        self.signatures.drain()
    }
1265
    /// Extend the extra links set and return a reference to it.
    pub fn set_extra_links(
        &mut self,
        extra_links: HashSet<CaseInsensitiveString>,
    ) -> &HashSet<CaseInsensitiveString> {
        self.extra_links.extend(extra_links);
        &self.extra_links
    }

    /// Extra links discovered outside the normal crawl flow.
    pub fn get_extra_links(&self) -> &HashSet<CaseInsensitiveString> {
        &self.extra_links
    }
1279
    /// Clear in-memory state and the disk store.
    pub async fn clear_all(&mut self) {
        self.clear();
        self.clear_disk().await;
    }

    /// Clear the in-memory links, signatures, pages, and extra links.
    pub fn clear(&mut self) {
        self.links_visited.clear();
        self.signatures.clear();
        self.pages.take();
        self.extra_links.clear();
    }
1293
    /// The configured HTTP client, if one was built.
    pub fn get_client(&self) -> &Option<Client> {
        &self.client
    }

    /// Pages gathered when scraping (None when crawling link-only).
    pub fn get_pages(&self) -> Option<&Vec<Page>> {
        self.pages.as_ref()
    }
1303
    /// Links persisted to disk (empty when the `disk` feature is off).
    #[cfg(not(feature = "disk"))]
    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
        Default::default()
    }

    /// Links persisted to disk; empty when no pool is initialized or the
    /// query fails.
    #[cfg(feature = "disk")]
    pub async fn get_links_disk(&self) -> HashSet<CaseInsensitiveString> {
        if let Some(sqlite) = &self.sqlite {
            if sqlite.pool_inited() {
                if let Ok(links) =
                    DatabaseHandler::get_all_resources(sqlite.get_db_pool().await).await
                {
                    links
                } else {
                    Default::default()
                }
            } else {
                Default::default()
            }
        } else {
            Default::default()
        }
    }
1329
1330 #[cfg(feature = "disk")]
1332 pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
1333 let mut l = self.get_links_disk().await;
1334 let m = self.links_visited.get_links();
1335
1336 l.extend(m);
1337
1338 l
1339 }
1340
    #[cfg(not(feature = "disk"))]
    /// All visited links. Without the `disk` feature this is exactly the
    /// in-memory visited set.
    pub async fn get_all_links_visited(&self) -> HashSet<CaseInsensitiveString> {
        self.get_links()
    }
1346
    /// Get the in-memory visited links as an owned set.
    pub fn get_links(&self) -> HashSet<CaseInsensitiveString> {
        self.links_visited.get_links()
    }
1351
    /// Get the parsed crawl origin URL, if the target URL parsed successfully.
    pub fn get_url_parsed(&self) -> &Option<Box<Url>> {
        &self.domain_parsed
    }
1356
    /// Get the target URL as originally supplied.
    pub fn get_url(&self) -> &CaseInsensitiveString {
        &self.url
    }
1361
    /// Get the configured inter-request delay (stored in milliseconds).
    pub fn get_delay(&self) -> Duration {
        Duration::from_millis(self.configuration.delay)
    }
1366
    /// Get the current crawl status.
    pub fn get_status(&self) -> &CrawlStatus {
        &self.status
    }
1371
    /// Set the crawl status, returning a reference to the new value.
    pub fn set_status(&mut self, status: CrawlStatus) -> &CrawlStatus {
        self.status = status;
        &self.status
    }
1377
    /// Reset the crawl status back to `Start`.
    pub fn reset_status(&mut self) -> &CrawlStatus {
        self.status = CrawlStatus::Start;
        &self.status
    }
1383
    /// Keep visited/extra links between runs by marking the crawl `Active`,
    /// which makes `setup` skip the `clear_all` step.
    pub fn persist_links(&mut self) -> &mut Self {
        self.status = CrawlStatus::Active;
        self
    }
1390
1391 pub fn get_absolute_path(&self, domain: Option<&str>) -> Option<Url> {
1393 if domain.is_some() {
1394 url::Url::parse(domain.unwrap_or_default())
1395 .ok()
1396 .map(|mut url| {
1397 if let Ok(mut path) = url.path_segments_mut() {
1398 path.clear();
1399 }
1400 url
1401 })
1402 } else if let Some(mut d) = self.domain_parsed.as_deref().cloned() {
1403 if let Ok(mut path) = d.path_segments_mut() {
1404 path.clear();
1405 }
1406 Some(d)
1407 } else {
1408 None
1409 }
1410 }
1411
    /// Signal the crawl to shut down.
    pub fn stop(&mut self) {
        self.shutdown = true;
    }
1416
    /// Clear the shutdown signal so crawling can proceed.
    pub fn start(&mut self) {
        self.shutdown = false;
    }
1421
    /// Fetch and apply the site's robots.txt rules when
    /// `respect_robots_txt` is enabled, adopting the crawl-delay directive
    /// (capped at 60s) for the configured user agent.
    pub async fn configure_robots_parser(&mut self, client: &Client) {
        if self.configuration.respect_robots_txt {
            let robot_file_parser = self
                .robot_file_parser
                .get_or_insert_with(RobotFileParser::new);

            // NOTE(review): `mtime() <= 4000` presumably acts as a
            // "not yet fetched" sentinel so robots.txt is only read once
            // per parser — confirm against RobotFileParser.
            if robot_file_parser.mtime() <= 4000 {
                let host_str = match &self.domain_parsed {
                    Some(domain) => domain.as_str(),
                    _ => self.url.inner(),
                };

                if !host_str.is_empty() {
                    // The parser is always handed a trailing-slash root.
                    if host_str.ends_with('/') {
                        robot_file_parser.read(client, host_str).await;
                    } else {
                        robot_file_parser
                            .read(client, &string_concat!(host_str, "/"))
                            .await;
                    }
                }
                // Honor crawl-delay, capped at 60_000 ms.
                if let Some(delay) =
                    robot_file_parser.get_crawl_delay(&self.configuration.user_agent)
                {
                    self.configuration.delay = delay.as_millis().min(60000) as u64;
                }
            }
        }
    }
1452
    /// Build a strict redirect policy that only follows redirects staying
    /// within the crawl target (same host, or same subdomain/TLD scope when
    /// those options are enabled), while allowing a small number of initial
    /// redirects so the entry URL can settle.
    pub fn setup_strict_policy(&self) -> Policy {
        use crate::client::redirect::Attempt;
        use crate::page::domain_name;
        use std::sync::atomic::AtomicU8;

        let default_policy = Policy::default();

        match self.domain_parsed.as_deref().cloned() {
            Some(host_s) => {
                // Allow one extra initial redirect when robots.txt is
                // fetched first.
                let initial_redirect_limit = if self.configuration.respect_robots_txt {
                    2
                } else {
                    1
                };
                let subdomains = self.configuration.subdomains;
                let tld = self.configuration.tld;
                // Only compute the registrable domain when TLD matching is on.
                let host_domain_name = if tld {
                    domain_name(&host_s).to_string()
                } else {
                    Default::default()
                };
                let redirect_limit = *self.configuration.redirect_limit;

                let custom_policy = {
                    // Counts off-target redirects taken at the start of the
                    // crawl; shared across attempts via the closure.
                    let initial_redirect = Arc::new(AtomicU8::new(0));

                    move |attempt: Attempt| {
                        // In-scope targets follow the default policy.
                        if tld && domain_name(attempt.url()) == host_domain_name
                            || subdomains
                                && attempt
                                    .url()
                                    .host_str()
                                    .unwrap_or_default()
                                    .ends_with(host_s.host_str().unwrap_or_default())
                            || attempt.url().host() == host_s.host()
                        {
                            default_policy.redirect(attempt)
                        } else if attempt.previous().len() > redirect_limit {
                            attempt.error("too many redirects")
                        } else if attempt.status().is_redirection()
                            && (0..initial_redirect_limit)
                                .contains(&initial_redirect.load(Ordering::Relaxed))
                        {
                            // Off-target, but still within the initial budget.
                            initial_redirect.fetch_add(1, Ordering::Relaxed);
                            default_policy.redirect(attempt)
                        } else {
                            attempt.stop()
                        }
                    }
                };
                Policy::custom(custom_policy)
            }
            // No parsed origin to scope against: fall back to the default.
            _ => default_policy,
        }
    }
1509
    /// Map the configured [`RedirectPolicy`] to a concrete client policy.
    pub fn setup_redirect_policy(&self) -> Policy {
        match self.configuration.redirect_policy {
            RedirectPolicy::Loose => Policy::limited(*self.configuration.redirect_limit),
            RedirectPolicy::None => Policy::none(),
            RedirectPolicy::Strict => self.setup_strict_policy(),
        }
    }
1518
    /// Expand the configured headers with generated defaults when
    /// `modify_headers` is on, extracting any `Referer` header into
    /// `configuration.referer` and storing the merged map back into the
    /// configuration.
    pub fn configure_headers(&mut self) {
        let mut headers: reqwest::header::HeaderMap = reqwest::header::HeaderMap::new();

        // Fall back to a generated UA when none is configured.
        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        if self.configuration.modify_headers {
            crate::utils::header_utils::extend_headers(
                &mut headers,
                user_agent,
                &self.configuration.headers,
                &None,
                &self.configuration.viewport,
                &self.domain_parsed,
            );

            if !headers.is_empty() {
                // Referer is tracked separately on the configuration, so
                // pull it out of the header map (only if not already set).
                if let Some(referer) = headers.remove(REFERER) {
                    if let Ok(v) = referer.to_str() {
                        if self.configuration.referer.is_none() && !v.is_empty() {
                            self.configuration.referer = Some(v.into())
                        }
                    }
                }
                self.configuration
                    .headers
                    .replace(Box::new(SerializableHeaderMap::from(headers)));
            }
        }
    }
1554
    #[cfg(all(
        any(not(feature = "wreq"), feature = "cache_request"),
        not(feature = "decentralized")
    ))]
    /// Build the base reqwest client builder: redirect policy, lenient
    /// HTTP/0.9–1.x parsing, timeouts (doubled when proxying), network
    /// interface / local address binding, keepalive for proxies, the
    /// user agent (only when headers don't already carry one), optional
    /// HTTP/2 prior knowledge, and the configured default headers.
    pub fn configure_base_client(&self) -> ClientBuilder {
        let policy = self.setup_redirect_policy();

        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        // Only apply the UA when the configured headers don't already set
        // one (checked under both canonical and literal names).
        let missing_agent = match &self.configuration.headers {
            Some(headers) => {
                !headers.contains_key(crate::client::header::USER_AGENT)
                    && !headers.contains_key("User-Agent")
            }
            _ => true,
        };

        // Proxied connections get doubled timeouts.
        let timeout_mult = if self.configuration.proxies.is_some() {
            2
        } else {
            1
        };

        let client = reqwest::Client::builder()
            .redirect(policy)
            .http09_responses()
            .http1_ignore_invalid_headers_in_responses(true)
            // Auto-referer only when no explicit referer is configured.
            .referer(self.configuration.referer.is_none())
            .connect_timeout(
                self.configuration
                    .default_http_connect_timeout
                    .unwrap_or(Duration::from_secs(24 * timeout_mult)),
            )
            .read_timeout(
                self.configuration
                    .default_http_read_timeout
                    .unwrap_or(Duration::from_secs(42 * timeout_mult)),
            )
            .http1_title_case_headers()
            .http1_allow_obsolete_multiline_headers_in_responses(true)
            .http1_allow_spaces_after_header_name_in_responses(true)
            .danger_accept_invalid_certs(self.configuration.accept_invalid_certs);

        let client = if let Some(network_interface) = &self.configuration.network_interface {
            set_interface(client, network_interface)
        } else {
            client
        };

        let client = if let Some(local_address) = &self.configuration.local_address {
            client.local_address(*local_address)
        } else {
            client
        };

        // Keep proxy tunnels alive between requests.
        let client = if self.configuration.proxies.is_none() {
            client
        } else {
            client.tcp_keepalive(Duration::from_secs(30))
        };

        let client = if missing_agent {
            client.user_agent(user_agent)
        } else {
            client
        };

        let client = if self.configuration.http2_prior_knowledge {
            client.http2_prior_knowledge()
        } else {
            client
        };

        crate::utils::header_utils::setup_default_headers(client, &self.configuration)
    }
1639
    #[cfg(all(
        feature = "wreq",
        not(feature = "decentralized"),
        not(feature = "cache_request")
    ))]
    /// Build the base client builder for the `wreq` backend: redirect
    /// policy, timeouts (doubled when proxying), local address binding,
    /// proxy keepalive, user agent when headers don't already carry one,
    /// optional browser emulation, and configured default headers.
    pub fn configure_base_client(&self) -> ClientBuilder {
        let policy = self.setup_redirect_policy();

        let user_agent = match &self.configuration.user_agent {
            Some(ua) => ua.as_str(),
            _ => get_ua(self.configuration.only_chrome_agent()),
        };

        // Only apply the UA when headers don't already set one.
        let missing_agent = match &self.configuration.headers {
            Some(headers) => {
                !headers.contains_key(crate::client::header::USER_AGENT)
                    && !headers.contains_key("User-Agent")
            }
            _ => true,
        };

        // Proxied connections get doubled timeouts.
        let timeout_mult = if self.configuration.proxies.is_some() {
            2
        } else {
            1
        };

        let client = Client::builder()
            .redirect(policy)
            .referer(self.configuration.referer.is_none())
            .connect_timeout(
                self.configuration
                    .default_http_connect_timeout
                    .unwrap_or(Duration::from_secs(24 * timeout_mult)),
            )
            .read_timeout(
                self.configuration
                    .default_http_read_timeout
                    .unwrap_or(Duration::from_secs(42 * timeout_mult)),
            );

        let client = if let Some(local_address) = &self.configuration.local_address {
            client.local_address(*local_address)
        } else {
            client
        };

        let client = if self.configuration.proxies.is_none() {
            client
        } else {
            client.tcp_keepalive(Duration::from_secs(30))
        };

        let client = if missing_agent {
            client.user_agent(user_agent)
        } else {
            client
        };

        // wreq-specific: emulate a full browser fingerprint when configured.
        let client = if let Some(emulation) = self.configuration.emulation {
            client.emulation(emulation)
        } else {
            client
        };

        crate::utils::header_utils::setup_default_headers(client, &self.configuration)
    }
1708
1709 #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
1711 pub fn configure_http_client_builder(&self) -> ClientBuilder {
1712 let client = self.configure_base_client();
1713
1714 let mut client = match &self.configuration.request_timeout {
1715 Some(t) => client.timeout(**t),
1716 _ => client,
1717 };
1718
1719 let client = match &self.configuration.proxies {
1720 Some(proxies) => {
1721 let linux = cfg!(target_os = "linux");
1722 let ignore_plain_socks = proxies.len() >= 2 && linux;
1723 let replace_plain_socks = proxies.len() == 1 && linux;
1724
1725 for proxie in proxies.iter() {
1726 if proxie.ignore == crate::configuration::ProxyIgnore::Http {
1727 continue;
1728 }
1729
1730 let proxie = &proxie.addr;
1731 let socks = proxie.starts_with("socks://");
1732
1733 if ignore_plain_socks && socks {
1735 continue;
1736 }
1737
1738 if replace_plain_socks && socks {
1740 if let Ok(proxy) =
1741 crate::client::Proxy::all(proxie.replacen("socks://", "http://", 1))
1742 {
1743 client = client.proxy(proxy);
1744 }
1745 } else if let Ok(proxy) = crate::client::Proxy::all(proxie) {
1746 client = client.proxy(proxy);
1747 }
1748 }
1749
1750 client
1751 }
1752 _ => client,
1753 };
1754
1755 #[cfg(feature = "spider_cloud")]
1757 let client = if let Some(ref sc) = self.configuration.spider_cloud {
1758 if sc.uses_proxy() {
1759 match (
1760 crate::client::Proxy::all(&sc.proxy_url),
1761 reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
1762 ) {
1763 (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
1764 _ => client,
1765 }
1766 } else {
1767 client
1768 }
1769 } else {
1770 client
1771 };
1772
1773 let client = if crate::utils::connect::background_connect_threading() {
1774 client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
1775 } else {
1776 client
1777 };
1778
1779 let client = match self.configuration.concurrency_limit {
1780 Some(limit) => {
1781 client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
1782 }
1783 _ => client,
1784 };
1785
1786 self.configure_http_client_cookies(client)
1787 }
1788
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    /// Finish the HTTP client builder with request caching: timeout,
    /// proxies (with platform-specific socks handling), optional
    /// spider-cloud auth, cookies, connect threading, concurrency limit,
    /// then wrap in middleware that serves from the global cache when
    /// `configuration.cache` is enabled.
    pub fn configure_http_client_builder(&self) -> reqwest_middleware::ClientBuilder {
        use crate::utils::create_cache_key;
        let client = self.configure_base_client();

        let mut client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let client = match &self.configuration.proxies {
            Some(proxies) => {
                let linux = cfg!(target_os = "linux");
                // On linux: with multiple proxies, skip plain socks entries;
                // with a single socks proxy, rewrite it to http instead.
                let ignore_plain_socks = proxies.len() >= 2 && linux;
                let replace_plain_socks = proxies.len() == 1 && linux;

                for proxie in proxies.iter() {
                    if proxie.ignore == crate::configuration::ProxyIgnore::Http {
                        continue;
                    }
                    let proxie = &proxie.addr;

                    let socks = proxie.starts_with("socks://");

                    if ignore_plain_socks && socks {
                        continue;
                    }

                    if replace_plain_socks && socks {
                        if let Ok(proxy) =
                            crate::client::Proxy::all(proxie.replacen("socks://", "http://", 1))
                        {
                            client = client.proxy(proxy);
                        }
                    } else if let Ok(proxy) = crate::client::Proxy::all(proxie) {
                        client = client.proxy(proxy);
                    }
                }

                client
            }
            _ => client,
        };

        #[cfg(feature = "spider_cloud")]
        // Spider-cloud proxy with bearer auth, applied only when both the
        // proxy URL and the auth header are valid.
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = self.configure_http_client_cookies(client);

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        // NOTE(review): unwrap_unchecked assumes the reqwest builder cannot
        // fail with the options applied above — confirm build invariants.
        let client =
            reqwest_middleware::ClientBuilder::new(unsafe { client.build().unwrap_unchecked() });

        if self.configuration.cache {
            let mut cache_options = HttpCacheOptions::default();

            // Cache key includes the method and, when present, the
            // authorization token so responses are not shared across users.
            cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
                let mut auth_token = None;
                if let Some(auth) = req.headers.get("authorization") {
                    if let Ok(token) = auth.to_str() {
                        if !token.is_empty() {
                            auth_token = Some(token);
                        }
                    }
                }
                create_cache_key(req, Some(req.method.as_str()), auth_token)
            }));
            client.with(Cache(HttpCache {
                mode: CacheMode::Default,
                manager: CACACHE_MANAGER.clone(),
                options: cache_options,
            }))
        } else {
            client
        }
    }
1895
    #[cfg(all(not(feature = "decentralized"), feature = "cookies"))]
    /// Attach the shared cookie jar to the client, seeding it with the
    /// configured cookie string for the crawl origin when present.
    pub fn configure_http_client_cookies(
        &self,
        client: crate::client::ClientBuilder,
    ) -> crate::client::ClientBuilder {
        let client = client.cookie_provider(self.cookie_jar.clone());

        if !self.configuration.cookie_str.is_empty() {
            if let Some(url) = self.domain_parsed.as_ref() {
                self.cookie_jar
                    .add_cookie_str(&self.configuration.cookie_str, url);
            }
        }

        client
    }
1913
    #[cfg(all(not(feature = "decentralized"), not(feature = "cookies")))]
    /// Cookie support stub: passes the builder through unchanged when the
    /// `cookies` feature is disabled.
    pub fn configure_http_client_cookies(
        &self,
        client: crate::client::ClientBuilder,
    ) -> crate::client::ClientBuilder {
        client
    }
1922
    /// Replace the HTTP client with a caller-provided one.
    pub fn set_http_client(&mut self, client: Client) -> &Option<Client> {
        self.client = Some(client);
        &self.client
    }
1928
    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
    /// Build a client dedicated to one proxy entry (used for rotation).
    /// Returns `None` when the entry is marked to skip for HTTP or its
    /// address fails to parse as a proxy.
    fn build_single_proxy_client(
        &self,
        proxy: &crate::configuration::RequestProxy,
    ) -> Option<Client> {
        if proxy.ignore == crate::configuration::ProxyIgnore::Http {
            return None;
        }

        let client = self.configure_base_client();

        let client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let addr = &proxy.addr;
        let linux = cfg!(target_os = "linux");
        let socks = addr.starts_with("socks://");

        // On linux, rewrite socks addresses to http (mirrors the handling
        // in configure_http_client_builder).
        let client = if socks && linux {
            match crate::client::Proxy::all(addr.replacen("socks://", "http://", 1)) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        } else {
            match crate::client::Proxy::all(addr) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        };

        #[cfg(feature = "spider_cloud")]
        // Spider-cloud proxy with bearer auth when configured and valid.
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        let client = self.configure_http_client_cookies(client);
        // NOTE(review): unwrap_unchecked assumes the builder cannot fail
        // with the options applied above — confirm build invariants.
        unsafe { Some(client.build().unwrap_unchecked()) }
    }
1995
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    /// Build a cache-enabled client dedicated to one proxy entry (used for
    /// rotation). Returns `None` when the entry is marked to skip for HTTP
    /// or its address fails to parse as a proxy.
    fn build_single_proxy_client(
        &self,
        proxy: &crate::configuration::RequestProxy,
    ) -> Option<Client> {
        use crate::utils::create_cache_key;

        if proxy.ignore == crate::configuration::ProxyIgnore::Http {
            return None;
        }

        let client = self.configure_base_client();

        let client = match &self.configuration.request_timeout {
            Some(t) => client.timeout(**t),
            _ => client,
        };

        let addr = &proxy.addr;
        let linux = cfg!(target_os = "linux");
        let socks = addr.starts_with("socks://");

        // On linux, rewrite socks addresses to http (mirrors the handling
        // in configure_http_client_builder).
        let client = if socks && linux {
            match crate::client::Proxy::all(addr.replacen("socks://", "http://", 1)) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        } else {
            match crate::client::Proxy::all(addr) {
                Ok(p) => client.proxy(p),
                Err(_) => return None,
            }
        };

        #[cfg(feature = "spider_cloud")]
        // Spider-cloud proxy with bearer auth when configured and valid.
        let client = if let Some(ref sc) = self.configuration.spider_cloud {
            if sc.uses_proxy() {
                match (
                    crate::client::Proxy::all(&sc.proxy_url),
                    reqwest::header::HeaderValue::from_str(&format!("Bearer {}", sc.api_key)),
                ) {
                    (Ok(proxy), Ok(auth_value)) => client.proxy(proxy.custom_http_auth(auth_value)),
                    _ => client,
                }
            } else {
                client
            }
        } else {
            client
        };

        let client = self.configure_http_client_cookies(client);

        let client = if crate::utils::connect::background_connect_threading() {
            client.connector_layer(crate::utils::connect::BackgroundProcessorLayer::new())
        } else {
            client
        };

        let client = match self.configuration.concurrency_limit {
            Some(limit) => {
                client.connector_layer(tower::limit::concurrency::ConcurrencyLimitLayer::new(limit))
            }
            _ => client,
        };

        // NOTE(review): unwrap_unchecked assumes the builder cannot fail
        // with the options applied above — confirm build invariants.
        let client =
            reqwest_middleware::ClientBuilder::new(unsafe { client.build().unwrap_unchecked() });

        if self.configuration.cache {
            let mut cache_options = HttpCacheOptions::default();

            // Cache key includes the method and, when present, the
            // authorization token so responses are not shared across users.
            cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
                let mut auth_token = None;
                if let Some(auth) = req.headers.get("authorization") {
                    if let Ok(token) = auth.to_str() {
                        if !token.is_empty() {
                            auth_token = Some(token);
                        }
                    }
                }
                create_cache_key(req, Some(req.method.as_str()), auth_token)
            }));

            Some(
                client
                    .with(Cache(HttpCache {
                        mode: CacheMode::Default,
                        manager: CACACHE_MANAGER.clone(),
                        options: cache_options,
                    }))
                    .build(),
            )
        } else {
            Some(client.build())
        }
    }
2094
    #[cfg(not(feature = "decentralized"))]
    /// Build one client per configured proxy for round-robin rotation.
    /// Returns `None` unless at least two usable proxy clients were built
    /// (rotation is pointless with fewer).
    fn build_rotated_clients(&self) -> Option<Arc<ClientRotator>> {
        let proxies = self.configuration.proxies.as_ref()?;
        if proxies.len() < 2 {
            return None;
        }
        let clients: Vec<Client> = proxies
            .iter()
            .filter_map(|proxy| self.build_single_proxy_client(proxy))
            .collect();
        if clients.len() < 2 {
            return None;
        }
        Some(Arc::new(ClientRotator::new(clients)))
    }
2111
    #[cfg(all(not(feature = "decentralized"), not(feature = "cache_request")))]
    /// Build the HTTP client from the configured builder.
    pub fn configure_http_client(&self) -> Client {
        let client = self.configure_http_client_builder();
        // NOTE(review): unwrap_unchecked assumes the reqwest builder cannot
        // fail with the options used here — confirm build invariants.
        unsafe { client.build().unwrap_unchecked() }
    }
2119
    #[cfg(all(not(feature = "decentralized"), feature = "cache_request"))]
    /// Build the cache-enabled HTTP client from the middleware builder.
    pub fn configure_http_client(&self) -> Client {
        let client = self.configure_http_client_builder();
        client.build()
    }
2126
    #[cfg(all(feature = "decentralized", not(feature = "cache_request")))]
    /// Build the HTTP client for decentralized crawling: all requests are
    /// routed through the configured `WORKERS` proxies, with the target
    /// host placed in the `Host` header and crawl scope hints encoded in a
    /// numeric `Referer` header.
    pub fn configure_http_client(&self) -> Client {
        use reqwest::header::{HeaderMap, HeaderValue};

        let mut headers = HeaderMap::new();

        let policy = self.setup_redirect_policy();

        let mut client = Client::builder()
            .user_agent(match &self.configuration.user_agent {
                Some(ua) => ua.as_str(),
                _ => &get_ua(self.configuration.only_chrome_agent()),
            })
            .redirect(policy)
            .tcp_keepalive(Duration::from_millis(500));

        // Scope hint for workers: 2 = tld, 1 = subdomains, 0 = neither.
        // NOTE(review): the first two arms both yield 2, so the
        // `tld && subdomains` arm is redundant with the `tld` arm.
        let referer = if self.configuration.tld && self.configuration.subdomains {
            2
        } else if self.configuration.tld {
            2
        } else if self.configuration.subdomains {
            1
        } else {
            0
        };

        if referer > 0 {
            // Numeric referer carries the scope hint to the worker.
            headers.insert(reqwest::header::REFERER, HeaderValue::from(referer));
        }

        if let Some(h) = &self.configuration.headers {
            headers.extend(h.inner().clone());
        }

        // Workers receive the crawl origin (sans trailing slash) as Host.
        if let Some(domain_url) = self.get_absolute_path(None) {
            let domain_url = domain_url.as_str();
            let domain_host = if domain_url.ends_with("/") {
                &domain_url[0..domain_url.len() - 1]
            } else {
                domain_url
            };
            if let Ok(value) = HeaderValue::from_str(domain_host) {
                headers.insert(reqwest::header::HOST, value);
            }
        }

        for worker in WORKERS.iter() {
            if let Ok(worker) = crate::client::Proxy::all(worker) {
                client = client.proxy(worker);
            }
        }

        if !self.configuration.modify_headers && self.configuration.modify_http_client_headers {
            if let Some(ua) = &self.configuration.user_agent {
                crate::utils::header_utils::extend_headers(
                    &mut headers,
                    ua,
                    &self.configuration.headers,
                    &None,
                    &self.configuration.viewport,
                    &self.domain_parsed,
                );
            }
        }

        // NOTE(review): unwrap_unchecked assumes the builder cannot fail
        // with the options applied above — confirm build invariants.
        unsafe {
            match &self.configuration.request_timeout {
                Some(t) => client.timeout(**t),
                _ => client,
            }
            .default_headers(headers)
            .build()
            .unwrap_unchecked()
        }
    }
2205
    #[cfg(all(feature = "decentralized", feature = "cache_request"))]
    /// Build the cache-enabled HTTP client for decentralized crawling:
    /// requests route through the `WORKERS` proxies, scope hints are
    /// encoded in a numeric `Referer` header, and responses are served
    /// from the global cache.
    pub fn configure_http_client(&mut self) -> Client {
        use crate::utils::create_cache_key;
        use reqwest::header::{HeaderMap, HeaderValue};
        use reqwest_middleware::ClientBuilder;

        let mut headers = HeaderMap::new();

        let policy = self.setup_redirect_policy();

        let mut client = reqwest::Client::builder()
            .user_agent(match &self.configuration.user_agent {
                Some(ua) => ua.as_str(),
                _ => get_ua(self.configuration.only_chrome_agent()),
            })
            .redirect(policy)
            .tcp_keepalive(Duration::from_millis(500));

        // Scope hint for workers: 2 = tld, 1 = subdomains, 0 = neither.
        // NOTE(review): the first two arms both yield 2, so the
        // `tld && subdomains` arm is redundant with the `tld` arm.
        let referer = if self.configuration.tld && self.configuration.subdomains {
            2
        } else if self.configuration.tld {
            2
        } else if self.configuration.subdomains {
            1
        } else {
            0
        };

        if referer > 0 {
            // Numeric referer carries the scope hint to the worker.
            headers.insert(reqwest::header::REFERER, HeaderValue::from(referer));
        }

        if let Some(h) = &self.configuration.headers {
            headers.extend(h.inner().clone());
        }

        // Workers receive the crawl origin (sans trailing slash) as Host.
        if let Some(domain_url) = self.get_absolute_path(None) {
            let domain_url = domain_url.as_str();
            let domain_host = if domain_url.ends_with("/") {
                &domain_url[0..domain_url.len() - 1]
            } else {
                domain_url
            };
            if let Ok(value) = HeaderValue::from_str(domain_host) {
                headers.insert(reqwest::header::HOST, value);
            }
        }

        for worker in WORKERS.iter() {
            if let Ok(worker) = crate::client::Proxy::all(worker) {
                client = client.proxy(worker);
            }
        }

        let mut cache_options = HttpCacheOptions::default();

        // Cache key includes the method and, when present, the
        // authorization token so responses are not shared across users.
        cache_options.cache_key = Some(Arc::new(|req: &http::request::Parts| {
            let mut auth_token = None;
            if let Some(auth) = req.headers.get("authorization") {
                if let Ok(token) = auth.to_str() {
                    if !token.is_empty() {
                        auth_token = Some(token);
                    }
                }
            }
            create_cache_key(req, Some(req.method.as_str()), auth_token)
        }));

        if !self.configuration.modify_headers && self.configuration.modify_http_client_headers {
            if let Some(ua) = &self.configuration.user_agent {
                crate::utils::header_utils::extend_headers(
                    &mut headers,
                    ua,
                    &self.configuration.headers,
                    &None,
                    &self.configuration.viewport,
                    &self.domain_parsed,
                );
            }
        }

        // NOTE(review): unwrap_unchecked assumes the builder cannot fail
        // with the options applied above — confirm build invariants.
        let client = ClientBuilder::new(unsafe {
            match &self.configuration.request_timeout {
                Some(t) => client.timeout(**t),
                _ => client,
            }
            .default_headers(headers)
            .build()
            .unwrap_unchecked()
        })
        .with(Cache(HttpCache {
            mode: CacheMode::Default,
            manager: CACACHE_MANAGER.clone(),
            options: cache_options,
        }));

        client.build()
    }
2306
    #[cfg(feature = "control")]
    /// Spawn the control-channel listener that maps global pause / resume /
    /// shutdown commands targeted at this crawl into an `AtomicI8` flag:
    /// 0 = run, 1 = paused, 2 = shutdown. Returns `None` when the control
    /// thread is disabled by configuration.
    pub fn configure_handler(&self) -> Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)> {
        use crate::utils::{Handler, CONTROLLER};

        if self.configuration.no_control_thread {
            None
        } else {
            let c: Arc<AtomicI8> = Arc::new(AtomicI8::new(0));
            let handle = c.clone();
            let target_id = self.target_id();

            let join_handle = crate::utils::spawn_task("control_handler", async move {
                // Watch the global controller channel for commands.
                let mut l = CONTROLLER.read().await.1.to_owned();

                while l.changed().await.is_ok() {
                    let n = &*l.borrow();
                    let (target, rest) = n;

                    // Only react to commands addressed to this crawl.
                    if target_id.eq_ignore_ascii_case(target) {
                        if rest == &Handler::Resume {
                            c.store(0, Ordering::Relaxed);
                        }
                        if rest == &Handler::Pause {
                            c.store(1, Ordering::Relaxed);
                        }
                        if rest == &Handler::Shutdown {
                            c.store(2, Ordering::Relaxed);
                        }
                    }
                }
            });

            Some((handle, join_handle))
        }
    }
2343
    #[cfg(not(feature = "control"))]
    /// Control-thread stub: no handler without the `control` feature.
    pub fn configure_handler(&self) -> Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)> {
        None
    }
2349
    #[cfg(all(feature = "chrome", feature = "chrome_intercept"))]
    /// Install Chrome request interception on the page per the configured
    /// intercept settings, returning the interception task handle when one
    /// was started.
    pub async fn setup_chrome_interception(
        &self,
        page: &chromiumoxide::Page,
    ) -> Option<tokio::task::JoinHandle<()>> {
        crate::features::chrome::setup_chrome_interception_base(
            page,
            self.configuration.chrome_intercept.enabled,
            &self.configuration.auth_challenge_response,
            self.configuration.chrome_intercept.block_visuals,
            self.url.inner(),
        )
        .await
    }
2365
    #[cfg(all(feature = "chrome", not(feature = "chrome_intercept")))]
    /// Interception stub when `chrome_intercept` is disabled: no task.
    pub async fn setup_chrome_interception(
        &self,
        _chrome_page: &chromiumoxide::Page,
    ) -> Option<tokio::task::JoinHandle<()>> {
        None
    }
2374
    /// Build the link-matching selectors for the target URL, honoring the
    /// configured subdomain and TLD scoping.
    pub fn setup_selectors(&self) -> RelativeSelectors {
        setup_website_selectors(
            self.get_url().inner(),
            AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
        )
    }
2382
    /// Common crawl setup: apply limits, disk store, and headers; start the
    /// background connect runtime; build (or reuse) the HTTP client and the
    /// proxy rotation pool; and spawn the control handler when enabled.
    pub fn setup_base(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
        self.determine_limits();
        self.setup_disk();
        self.configure_headers();

        crate::utils::connect::init_background_runtime();

        // Reuse a caller-supplied client when present.
        let client = match self.client.take() {
            Some(client) => client,
            _ => self.configure_http_client(),
        };

        #[cfg(not(feature = "decentralized"))]
        {
            self.client_rotator = self.build_rotated_clients();
        }

        (client, self.configure_handler())
    }
2403
    /// Full crawl setup: base setup, then either clear prior state (fresh
    /// crawl) or keep it when the status was left `Active` (see
    /// `persist_links`), and finally apply robots.txt rules.
    pub async fn setup(
        &mut self,
    ) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
        let setup = self.setup_base();
        if self.status != CrawlStatus::Active {
            self.clear_all().await;
        } else {
            // Resuming: skip the initial fetch when extra links were queued.
            self.skip_initial = !self.extra_links.is_empty();
        }
        self.configure_robots_parser(&setup.0).await;
        setup
    }
2417
    /// Build the per-crawl pacing primitives: a 10ms tick interval and the
    /// configured throttle delay, both pinned for use in the crawl loop.
    pub fn setup_crawl(
        &self,
    ) -> (
        std::pin::Pin<Box<tokio::time::Interval>>,
        std::pin::Pin<Box<Duration>>,
    ) {
        let interval = Box::pin(tokio::time::interval(Duration::from_millis(10)));
        let throttle = Box::pin(self.get_delay());

        (interval, throttle)
    }
2430
2431 #[cfg(feature = "glob")]
2433 pub fn get_expanded_links(&self, domain_name: &str) -> Vec<CaseInsensitiveString> {
2434 let mut expanded = crate::features::glob::expand_url(domain_name);
2435
2436 if expanded.is_empty() {
2437 if let Some(u) = self.get_absolute_path(Some(domain_name)) {
2438 expanded.push(u.as_str().into());
2439 }
2440 };
2441
2442 expanded
2443 }
2444
    /// Classify the initial crawl response into a `CrawlStatus` (and, for
    /// 403s, a `WebsiteMetaInfo` describing the blocker) so callers can
    /// tell blocked / rate-limited / errored crawls apart from empty ones.
    pub fn set_crawl_initial_status(
        &mut self,
        page: &crate::page::Page,
        links: &HashSet<CaseInsensitiveString>,
    ) {
        use crate::utils::{detect_open_resty_forbidden, APACHE_FORBIDDEN};

        // A 403 that yielded no links is treated as a block; inspect the
        // body to identify the blocking layer.
        if page.status_code == reqwest::StatusCode::FORBIDDEN && links.is_empty() {
            if is_safe_javascript_challenge(page) {
                self.website_meta_info = WebsiteMetaInfo::RequiresJavascript;
            } else if page.get_html_bytes_u8() == *APACHE_FORBIDDEN {
                self.website_meta_info = WebsiteMetaInfo::Apache403;
            } else if detect_open_resty_forbidden(page.get_html_bytes_u8()) {
                self.website_meta_info = WebsiteMetaInfo::OpenResty403;
            }
            self.status = CrawlStatus::Blocked;
        } else if page.status_code == reqwest::StatusCode::TOO_MANY_REQUESTS {
            self.status = CrawlStatus::RateLimited;
        } else if page.status_code.is_server_error() {
            self.status = CrawlStatus::ServerError;
        } else if page.is_empty() {
            // Distinguish transport failures (sentinel status codes) from a
            // genuinely empty successful response.
            if page.status_code == *UNKNOWN_STATUS_ERROR
                || page.status_code == *CHROME_UNKNOWN_STATUS_ERROR
            {
                self.status = CrawlStatus::ConnectError;
            } else {
                self.status = CrawlStatus::Empty;
            }
        }
    }
2476
2477 #[cfg(feature = "cmd")]
2479 pub async fn _crawl_establish_cmd(
2480 &mut self,
2481 cmd: std::path::PathBuf,
2482 cmd_args: Vec<String>,
2483 base: &mut RelativeSelectors,
2484 _ssg_build: bool,
2485 ) -> HashSet<CaseInsensitiveString> {
2486 if self.skip_initial {
2487 return Default::default();
2488 }
2489
2490 if !self
2491 .is_allowed_default(self.get_base_link())
2492 .eq(&ProcessLinkStatus::Allowed)
2493 {
2494 return HashSet::new();
2495 }
2496
2497 let url = self.url.inner();
2498
2499 let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
2500 let mut links_ssg = HashSet::new();
2501 let mut links_pages = if self.configuration.return_page_links {
2502 Some(HashSet::new())
2503 } else {
2504 None
2505 };
2506
2507 let mut page_links_settings =
2508 PageLinkBuildSettings::new(true, self.configuration.full_resources);
2509 page_links_settings.subdomains = self.configuration.subdomains;
2510 page_links_settings.tld = self.configuration.tld;
2511 page_links_settings.normalize = self.configuration.normalize;
2512
2513 let mut domain_parsed = self.domain_parsed.take();
2514
2515 let mut retry_count = self.configuration.retry;
2516 let mut last_err: Option<std::io::Error> = None;
2517
2518 let build_error_page = |status: StatusCode, _err: std::io::Error| {
2519 let mut p = Page::default();
2520 p.url = url.to_string();
2521 p.status_code = status;
2522 #[cfg(not(feature = "page_error_status_details"))]
2523 {
2524 p.error_status = Some(_err.to_string());
2525 }
2526 p
2527 };
2528
2529 let mut page: Page = loop {
2530 let bytes = match Self::run_via_cmd(&cmd, &cmd_args, url).await {
2531 Ok(b) => {
2532 if b.is_empty() {
2533 last_err = Some(std::io::Error::new(
2534 std::io::ErrorKind::UnexpectedEof,
2535 "cmd returned empty stdout",
2536 ));
2537 None
2538 } else {
2539 Some(b)
2540 }
2541 }
2542 Err(e) => {
2543 last_err = Some(e);
2544 None
2545 }
2546 };
2547
2548 if let Some(bytes) = bytes.as_deref() {
2549 let mut domain_parsed_out = None;
2550
2551 let page = Page::new_page_streaming_from_bytes(
2552 url,
2553 bytes,
2554 base,
2555 &self.configuration.external_domains_caseless,
2556 &page_links_settings,
2557 &mut links,
2558 Some(&mut links_ssg),
2559 &domain_parsed,
2560 &mut domain_parsed_out,
2561 &mut links_pages,
2562 )
2563 .await;
2564
2565 if self.domain_parsed.is_none() {
2566 if let Some(mut dp) = domain_parsed.take() {
2567 convert_abs_url(&mut dp);
2568 self.domain_parsed.replace(dp);
2569 } else if let Some(mut dp) = domain_parsed_out.take() {
2570 convert_abs_url(&mut dp);
2571 self.domain_parsed.replace(dp);
2572 }
2573 } else if self.domain_parsed.is_none() {
2574 self.domain_parsed = domain_parsed_out;
2575 }
2576
2577 if page.should_retry && retry_count > 0 {
2578 retry_count -= 1;
2579 if let Some(timeout) = page.get_timeout() {
2580 tokio::time::sleep(timeout).await;
2581 } else {
2582 tokio::time::sleep(std::time::Duration::from_millis(250)).await;
2583 }
2584 continue;
2585 }
2586
2587 break page;
2588 }
2589
2590 if retry_count == 0 {
2591 let err = last_err
2592 .take()
2593 .unwrap_or_else(|| std::io::Error::other("cmd fetch failed (unknown error)"));
2594 break build_error_page(StatusCode::BAD_GATEWAY, err);
2595 }
2596
2597 retry_count -= 1;
2598 tokio::time::sleep(std::time::Duration::from_millis(250)).await;
2599 };
2600
2601 if page.get_html_bytes_u8().starts_with(b"<?xml") {
2602 page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
2603 .await;
2604 }
2605
2606 emit_log(url);
2607
2608 if let Some(signature) = page.signature {
2609 if !self.is_signature_allowed(signature).await {
2610 return Default::default();
2611 }
2612 self.insert_signature(signature).await;
2613 }
2614
2615 let url_ci = match &self.on_link_find_callback {
2616 Some(cb) => cb(*self.url.clone(), None).0,
2617 _ => *self.url.clone(),
2618 };
2619 self.insert_link(url_ci).await;
2620
2621 if self.configuration.return_page_links {
2622 page.page_links = links_pages
2623 .filter(|pages: &HashSet<CaseInsensitiveString>| !pages.is_empty())
2624 .map(Box::new);
2625 }
2626
2627 links.extend(links_ssg);
2628
2629 self.initial_status_code = page.status_code;
2630 self.initial_html_length = page.get_html_bytes_u8().len();
2631 self.initial_anti_bot_tech = page.anti_bot_tech;
2632 self.initial_page_should_retry = page.should_retry;
2633 self.initial_page_waf_check = page.waf_check;
2634
2635 self.set_crawl_initial_status(&page, &links);
2636
2637 if let Some(ref cb) = self.on_should_crawl_callback {
2638 if !cb.call(&page) {
2639 page.blocked_crawl = true;
2640 channel_send_page(&self.channel, page, &self.channel_guard);
2641 return Default::default();
2642 }
2643 }
2644
2645 channel_send_page(&self.channel, page, &self.channel_guard);
2646
2647 links
2648 }
2649
    /// Establish the crawl over plain HTTP: fetch the start page, extract its
    /// links, and publish the page to subscribers.
    ///
    /// Seeded HTML (when configured) takes precedence over a network fetch.
    /// Retryable responses are re-fetched up to `configuration.retry` times,
    /// with gateway timeouts additionally bounded by `BACKOFF_MAX_DURATION`.
    /// Returns the links to continue crawling with, or an empty set when the
    /// base link is not allowed or `skip_initial` is set.
    #[cfg(not(feature = "glob"))]
    pub async fn _crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }

        if self
            .is_allowed_default(self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            let url = self.url.inner();

            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
            // Extra links collected by the SSG extraction path.
            let mut links_ssg = HashSet::new();
            // Per-page link capture, only allocated when the caller wants it back.
            let mut links_pages = if self.configuration.return_page_links {
                Some(HashSet::new())
            } else {
                None
            };
            let mut page_links_settings =
                PageLinkBuildSettings::new(true, self.configuration.full_resources);

            page_links_settings.subdomains = self.configuration.subdomains;
            page_links_settings.tld = self.configuration.tld;
            page_links_settings.normalize = self.configuration.normalize;

            // Take the parsed domain so it can be borrowed immutably while
            // `self.domain_parsed` is also passed as an output slot below;
            // restored after the fetch when nothing replaced it.
            let mut domain_parsed = self.domain_parsed.take();

            let mut page = if let Some(mut seeded_page) = self.build_seed_page() {
                // Seeded-HTML path: extract links directly from the given body.
                #[cfg(not(feature = "decentralized"))]
                {
                    let html_bytes = seeded_page.get_html_bytes_u8();
                    if !html_bytes.is_empty() && !auto_encoder::is_binary_file(html_bytes) {
                        let html = seeded_page.get_html();
                        let extracted_links: HashSet<CaseInsensitiveString> = seeded_page
                            .links_stream_base_ssg(base, &html, client, &self.domain_parsed)
                            .await;
                        links.extend(extracted_links);
                    }
                }
                seeded_page
            } else {
                Page::new_page_streaming(
                    url,
                    client,
                    false,
                    base,
                    &self.configuration.external_domains_caseless,
                    &page_links_settings,
                    &mut links,
                    Some(&mut links_ssg),
                    &domain_parsed,
                    &mut self.domain_parsed,
                    &mut links_pages,
                )
                .await
            };

            // Sitemap/XML documents need the dedicated XML link extractor.
            if page.get_html_bytes_u8().starts_with(b"<?xml") {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            // Restore the base domain taken above if the fetch did not set one.
            if self.domain_parsed.is_none() {
                if let Some(mut domain_parsed) = domain_parsed.take() {
                    convert_abs_url(&mut domain_parsed);
                    self.domain_parsed.replace(domain_parsed);
                }
            }

            let mut retry_count = self.configuration.retry;
            let domains_caseless = &self.configuration.external_domains_caseless;

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                // Honor the page-provided retry delay when present.
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }

                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Use a clone of the parsed domain so the bounded retry
                    // future does not need a mutable borrow of `self`.
                    let mut domain_parsed_clone = self.domain_parsed.clone();

                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        page.clone_from(
                            &Page::new_page_streaming(
                                url,
                                client,
                                false,
                                base,
                                domains_caseless,
                                &page_links_settings,
                                &mut links,
                                Some(&mut links_ssg),
                                &domain_parsed,
                                &mut domain_parsed_clone,
                                &mut links_pages,
                            )
                            .await,
                        );
                    })
                    .await
                    {
                        log::info!("backoff gateway timeout exceeded {elasped}");
                    }

                    // Fold the possibly-updated domain back into `self`.
                    self.domain_parsed = domain_parsed_clone;
                } else {
                    page.clone_from(
                        &Page::new_page_streaming(
                            url,
                            client,
                            false,
                            base,
                            &self.configuration.external_domains_caseless,
                            &page_links_settings,
                            &mut links,
                            Some(&mut links_ssg),
                            &domain_parsed,
                            &mut self.domain_parsed,
                            &mut links_pages,
                        )
                        .await,
                    );
                }
            }

            emit_log(url);

            // Skip duplicate content by page signature when one was computed.
            if let Some(signature) = page.signature {
                if !self.is_signature_allowed(signature).await {
                    return Default::default();
                }
                self.insert_signature(signature).await;
            }

            let url = match &self.on_link_find_callback {
                Some(cb) => cb(*self.url.clone(), None).0,
                _ => *self.url.clone(),
            };

            self.insert_link(url).await;

            if self.configuration.return_page_links {
                page.page_links = links_pages
                    .filter(|pages: &HashSet<CaseInsensitiveString>| !pages.is_empty())
                    .map(Box::new);
            }

            links.extend(links_ssg);

            // Record initial-crawl metrics used by status reporting.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            // Give the user callback a chance to veto further crawling.
            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
2830
2831 #[cfg(feature = "cmd")]
2833 pub async fn run_via_cmd(
2834 cmd: &std::path::Path,
2835 fixed_args: &[String],
2836 url: &str,
2837 ) -> std::io::Result<Vec<u8>> {
2838 use tokio::process::Command;
2839 let mut args: Vec<String> = Vec::with_capacity(fixed_args.len() + 1);
2840 let mut used_placeholder = false;
2841
2842 for a in fixed_args {
2843 if a.contains("{url}") {
2844 used_placeholder = true;
2845 args.push(a.replace("{url}", url));
2846 } else {
2847 args.push(a.clone());
2848 }
2849 }
2850
2851 if !used_placeholder {
2852 args.push(url.to_string());
2853 }
2854
2855 let out = Command::new(cmd)
2856 .args(&args)
2857 .kill_on_drop(true)
2858 .output()
2859 .await?;
2860
2861 if !out.status.success() {
2862 let code = out.status.code().unwrap_or(-1);
2863 let stderr = String::from_utf8_lossy(&out.stderr);
2864
2865 return Err(std::io::Error::other(format!(
2866 "cmd exit={code} stderr={stderr}"
2867 )));
2868 }
2869
2870 Ok(out.stdout)
2871 }
2872
    /// Run a full concurrent crawl using an external command as the fetcher.
    ///
    /// In single-page mode only the target URL is fetched and emitted. Otherwise
    /// the start page is established via `_crawl_establish_cmd` and discovered
    /// links are fetched concurrently on a `JoinSet`, gated by the configured
    /// semaphore and throttle, honoring budgets, crawl timeouts, shutdown
    /// handles, and the subscription queue.
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    #[cfg(feature = "cmd")]
    pub async fn crawl_concurrent_cmd(
        &mut self,
        cmd: std::path::PathBuf,
        cmd_args: Vec<String>,
        handle: &Option<Arc<AtomicI8>>,
    ) {
        self.start();
        self.status = CrawlStatus::Active;

        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        // Single-page mode: fetch only the target and emit it, no crawl loop.
        if self.single_page() {
            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
            let mut links_pages: Option<HashSet<CaseInsensitiveString>> =
                if self.configuration.return_page_links {
                    Some(HashSet::new())
                } else {
                    None
                };

            let mut relative_selectors = selector;
            let mut domain_parsed = None;

            let target = self
                .domain_parsed
                .as_ref()
                .map(|u| u.as_str())
                .unwrap_or(self.get_url());

            // On command failure emit a 502 placeholder page and stop.
            let bytes = match Self::run_via_cmd(&cmd, &cmd_args, target).await {
                Ok(b) => b,
                Err(_err) => {
                    let mut page = Page::default();
                    page.url = target.to_string();
                    page.status_code = StatusCode::BAD_GATEWAY;
                    #[cfg(not(feature = "page_error_status_details"))]
                    {
                        page.error_status = Some(_err.to_string());
                    }
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return;
                }
            };

            let page = Page::new_page_streaming_from_bytes(
                target,
                &bytes,
                &mut relative_selectors,
                &self.configuration.external_domains_caseless,
                &PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                &mut links,
                None,
                &self.domain_parsed,
                &mut domain_parsed,
                &mut links_pages,
            )
            .await;

            channel_send_page(&self.channel, page, &self.channel_guard);
            return;
        }

        let on_should_crawl_callback = self.on_should_crawl_callback.clone();
        let return_page_links = self.configuration.return_page_links;
        let full_resources = self.configuration.full_resources;
        // Optional external queue feeding extra links into the crawl.
        let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

        let (mut interval, throttle) = self.setup_crawl();
        let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

        // Establish the start page and seed the link frontier.
        links.extend(
            self._crawl_establish_cmd(cmd.clone(), cmd_args.clone(), &mut selector, false)
                .await,
        );

        self.configuration.configure_allowlist();
        let semaphore = self.setup_semaphore();

        // State shared with spawned fetch tasks, accessed positionally:
        // .0 cmd, .1 cmd_args, .2 selectors, .3 channel, .4 external domains,
        // .5 channel guard, .6 retry count, .7 return_page_links,
        // .8 link-build settings, .9 parsed domain, .10 on_link_find_callback.
        let shared = Arc::new((
            cmd,
            cmd_args,
            selector,
            self.channel.clone(),
            self.configuration.external_domains_caseless.clone(),
            self.channel_guard.clone(),
            self.configuration.retry,
            return_page_links,
            PageLinkBuildSettings::new_full(
                false,
                full_resources,
                self.configuration.subdomains,
                self.configuration.tld,
                self.configuration.normalize,
            ),
            self.domain_parsed.clone(),
            self.on_link_find_callback.clone(),
        ));

        // Each task yields the links it found plus the page signature (if any).
        let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

        let mut exceeded_budget = false;
        // A zero throttle means run fully concurrent with no inter-task delay.
        let concurrency = throttle.is_zero();

        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

        if !concurrency && !links.is_empty() {
            tokio::time::sleep(*throttle).await;
        }

        // Start the wall-clock only when a crawl timeout is configured.
        let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
            Some(Instant::now())
        } else {
            None
        };

        'outer: loop {
            #[cfg(all(feature = "agent", feature = "serde"))]
            self.apply_url_prefilter(&mut links).await;

            // Drain the current frontier into a stream for this round.
            let mut stream =
                tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

            loop {
                if !concurrency {
                    tokio::time::sleep(*throttle).await;
                }

                let semaphore = get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                tokio::select! {
                    biased;

                    // Take the next link only while permits remain and the
                    // crawl timeout has not expired.
                    Some(link) = stream.next(),
                    if semaphore.available_permits() > 0
                        && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) =>
                    {
                        // Shutdown path: drain in-flight tasks and stash the
                        // rest of the stream for a later resume.
                        if !self.handle_process(handle, &mut interval, async {
                            emit_log_shutdown(link.inner());
                            let permits = set.len();
                            set.shutdown().await;
                            semaphore.add_permits(permits);
                        }).await {
                            while let Some(links) = stream.next().await {
                                self.extra_links.insert(links);
                            }
                            break 'outer;
                        }

                        let allowed = self.is_allowed(&link);
                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                            exceeded_budget = true;
                            break;
                        }
                        if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                            continue;
                        }

                        emit_log(link.inner());
                        self.insert_link(link.clone()).await;

                        if let Ok(permit) = semaphore.clone().acquire_owned().await {
                            let shared = shared.clone();
                            let on_should_crawl_callback = on_should_crawl_callback.clone();
                            spawn_set("page_fetch_cmd", &mut set, async move {
                                // Allow the link-find callback to rewrite the link.
                                let link_result = match &shared.10 {
                                    Some(cb) => cb(link, None),
                                    _ => (link, None),
                                };

                                let mut out_links: HashSet<CaseInsensitiveString> = HashSet::new();
                                let mut links_pages = if shared.7 { Some(HashSet::new()) } else { None };

                                let mut relative_selectors = shared.2.clone();
                                let mut r_settings = shared.8;
                                r_settings.ssg_build = true;

                                let target_url = link_result.0.as_ref();

                                let mut retry_count = shared.6;
                                let mut last_err: Option<std::io::Error> = None;

                                // Retry the command until output arrives or the
                                // budget is exhausted; empty stdout counts as a
                                // failure.
                                let bytes = loop {
                                    match Self::run_via_cmd(&shared.0, &shared.1, target_url).await {
                                        Ok(b) if !b.is_empty() => break Some(b),
                                        Ok(_) => {
                                            last_err = Some(std::io::Error::new(
                                                std::io::ErrorKind::UnexpectedEof,
                                                "cmd returned empty stdout",
                                            ));
                                        }
                                        Err(e) => {
                                            last_err = Some(e);
                                        }
                                    }

                                    if retry_count == 0 { break None; }
                                    retry_count -= 1;

                                    tokio::time::sleep(std::time::Duration::from_millis(250)).await;
                                };

                                let mut domain_parsed = None;

                                // Parse the output, or synthesize a 502 page
                                // carrying the last error.
                                let mut page = if let Some(bytes) = bytes {
                                    Page::new_page_streaming_from_bytes(
                                        target_url,
                                        &bytes,
                                        &mut relative_selectors,
                                        &shared.4,
                                        &r_settings,
                                        &mut out_links,
                                        None,
                                        &shared.9,
                                        &mut domain_parsed,
                                        &mut links_pages,
                                    ).await
                                } else {
                                    let mut p = Page::default();
                                    p.url = target_url.to_string();
                                    p.status_code = StatusCode::BAD_GATEWAY;
                                    if let Some(_e) = last_err {
                                        #[cfg(not(feature = "page_error_status_details"))]
                                        {
                                            p.error_status = Some(_e.to_string());
                                        }
                                    }
                                    p
                                };

                                if shared.7 {
                                    page.page_links = links_pages
                                        .filter(|pages| !pages.is_empty())
                                        .map(Box::new);
                                }

                                // Veto hook: emit the blocked page and return
                                // no links so the crawl does not expand here.
                                if let Some(ref cb) = on_should_crawl_callback {
                                    if !cb.call(&page) {
                                        page.blocked_crawl = true;
                                        channel_send_page(&shared.3, page, &shared.5);
                                        drop(permit);
                                        return Default::default();
                                    }
                                }

                                let signature = page.signature;
                                channel_send_page(&shared.3, page, &shared.5);
                                drop(permit);

                                (out_links, signature)
                            });
                        }

                        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                    },

                    // Fold finished tasks back into the frontier, skipping
                    // pages whose signature marks them as duplicates.
                    Some(result) = set.join_next(), if !set.is_empty() => {
                        if let Ok(res) = result {
                            match res.1 {
                                Some(signature) => {
                                    if self.is_signature_allowed(signature).await {
                                        self.insert_signature(signature).await;
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                                _ => {
                                    self.links_visited.extend_links(&mut links, res.0);
                                }
                            }
                        } else {
                            break;
                        }
                    }

                    else => break,
                }

                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                // Round is done when the frontier and task set drain, or the
                // budget was exceeded — stash leftovers for a later resume.
                if (links.is_empty() && set.is_empty()) || exceeded_budget {
                    if exceeded_budget {
                        while let Some(links) = stream.next().await {
                            self.extra_links.insert(links);
                        }
                        while let Some(links) = set.join_next().await {
                            if let Ok(links) = links {
                                self.extra_links.extend(links.0);
                            }
                        }
                    }
                    break 'outer;
                }
            }

            // Wait for subscribers and pull any queued links before deciding
            // whether the crawl is truly finished.
            self.subscription_guard().await;
            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if links.is_empty() && set.is_empty() {
                break;
            }
        }

        // Preserve any unprocessed links for subsequent crawl calls.
        if !links.is_empty() {
            self.extra_links.extend(links);
        }
    }
3195
3196 #[allow(dead_code)]
3198 fn build_seed_page(&self) -> Option<Page> {
3199 if let Some(seeded_html) = self.get_seeded_html() {
3200 if crate::utils::is_cacheable_body_empty(seeded_html.as_bytes()) {
3201 return None;
3202 }
3203 let mut page_response = PageResponse::default();
3204 page_response.content = Some(Box::new(seeded_html.as_bytes().to_vec()));
3205 Some(build(self.url.inner(), page_response))
3206 } else {
3207 None
3208 }
3209 }
3210
    /// Establish the crawl with a chrome page: fetch the start page, handle
    /// proxy errors, retries, and redirects, and publish it to subscribers.
    ///
    /// Seeded HTML (when configured) is rendered via `Page::new_seeded`,
    /// otherwise the page is fetched with `Page::new`. A final redirect
    /// destination re-targets the website and rebuilds the selectors. Returns
    /// the links found, or an empty set when the base link is not allowed or
    /// `skip_initial` is set.
    #[cfg(all(
        not(feature = "decentralized"),
        feature = "chrome",
        not(feature = "glob")
    ))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
        chrome_page: &chromiumoxide::Page,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }

        if self
            .is_allowed_default(self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            // Configure chrome events and request interception concurrently.
            let (_, intercept_handle) = tokio::join!(
                crate::features::chrome::setup_chrome_events(chrome_page, &self.configuration),
                self.setup_chrome_interception(chrome_page)
            );

            let mut page = if let Some(seeded_html) = self.get_seeded_html() {
                Page::new_seeded(
                    self.url.inner(),
                    client,
                    chrome_page,
                    &self.configuration.wait_for,
                    &self.configuration.screenshot,
                    false,
                    &self.configuration.openai_config,
                    &self.configuration.execution_scripts,
                    &self.configuration.automation_scripts,
                    &self.configuration.viewport,
                    &self.configuration.request_timeout,
                    &self.configuration.track_events,
                    self.configuration.referer.clone(),
                    self.configuration.max_page_bytes,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                    Some(seeded_html.clone()),
                    Some(&self.cookie_jar),
                    &self.configuration.remote_multimodal,
                )
                .await
            } else {
                Page::new(
                    self.url.inner(),
                    client,
                    chrome_page,
                    &self.configuration.wait_for,
                    &self.configuration.screenshot,
                    false,
                    &self.configuration.openai_config,
                    &self.configuration.execution_scripts,
                    &self.configuration.automation_scripts,
                    &self.configuration.viewport,
                    &self.configuration.request_timeout,
                    &self.configuration.track_events,
                    self.configuration.referer.clone(),
                    self.configuration.max_page_bytes,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                    &self.configuration.remote_multimodal,
                )
                .await
            };

            let mut retry_count = self.configuration.retry;

            // A chrome-error destination with an empty "successful" body while
            // proxies are configured indicates a bad proxy; flag for retry.
            if let Some(final_redirect_destination) = &page.final_redirect_destination {
                if final_redirect_destination == "chrome-error://chromewebdata/"
                    && page.status_code.is_success()
                    && page.is_empty()
                    && self.configuration.proxies.is_some()
                {
                    page.error_status = Some("Invalid proxy configuration.".into());
                    page.should_retry = true;
                    page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                }
            }

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                // Honor the page-provided retry delay when present.
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }
                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Bound the gateway-timeout retry so it cannot hang.
                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        let next_page = Page::new(
                            self.url.inner(),
                            client,
                            chrome_page,
                            &self.configuration.wait_for,
                            &self.configuration.screenshot,
                            false,
                            &self.configuration.openai_config,
                            &self.configuration.execution_scripts,
                            &self.configuration.automation_scripts,
                            &self.configuration.viewport,
                            &self.configuration.request_timeout,
                            &self.configuration.track_events,
                            self.configuration.referer.clone(),
                            self.configuration.max_page_bytes,
                            self.configuration.get_cache_options(),
                            &self.configuration.cache_policy,
                            &self.configuration.remote_multimodal,
                        )
                        .await;
                        page.clone_from(&next_page);
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elasped}");
                    }
                } else {
                    let next_page = Page::new(
                        self.url.inner(),
                        client,
                        chrome_page,
                        &self.configuration.wait_for,
                        &self.configuration.screenshot,
                        false,
                        &self.configuration.openai_config,
                        &self.configuration.execution_scripts,
                        &self.configuration.automation_scripts,
                        &self.configuration.viewport,
                        &self.configuration.request_timeout,
                        &self.configuration.track_events,
                        self.configuration.referer.clone(),
                        self.configuration.max_page_bytes,
                        self.configuration.get_cache_options(),
                        &self.configuration.cache_policy,
                        &self.configuration.remote_multimodal,
                    )
                    .await;
                    page.clone_from(&next_page);
                }

                // Re-check the bad-proxy condition after each retry.
                if let Some(final_redirect_destination) = &page.final_redirect_destination {
                    if final_redirect_destination == "chrome-error://chromewebdata/"
                        && page.status_code.is_success()
                        && page.is_empty()
                        && self.configuration.proxies.is_some()
                    {
                        page.error_status = Some("Invalid proxy configuration.".into());
                        page.should_retry = true;
                        page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
                    }
                }
            }

            // Join the interception handler, aborting if it exceeds 10s.
            if let Some(h) = intercept_handle {
                let abort_handle = h.abort_handle();
                if let Err(elasped) =
                    tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                {
                    log::warn!("Handler timeout exceeded {elasped}");
                    abort_handle.abort();
                }
            }

            // Re-target the website when the initial request was redirected.
            if let Some(domain) = &page.final_redirect_destination {
                let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
                let prior_domain = self.domain_parsed.take();
                self.domain_parsed = parse_absolute_url(&domain);
                self.url = domain;

                let s = self.setup_selectors();
                base.0 = s.0;
                base.1 = s.1;

                // Keep the prior host so links on the old domain still resolve.
                if let Some(pdname) = prior_domain {
                    if let Some(dname) = pdname.host_str() {
                        base.2 = dname.into();
                    }
                }
            }

            emit_log(self.url.inner());

            // NOTE(review): unlike `_crawl_establish`, this path records the
            // signature without an `is_signature_allowed` duplicate check —
            // confirm whether that is intentional.
            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            let url = match &self.on_link_find_callback {
                Some(cb) => cb(*self.url.clone(), None).0,
                _ => *self.url.clone(),
            };

            self.insert_link(url).await;

            if self.configuration.return_page_links && page.page_links.is_none() {
                page.page_links = Some(Box::default());
            }

            let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");

            // HTML pages go through the SSG-aware extractor; XML documents use
            // the dedicated XML link stream.
            let mut links = if !page.is_empty() && !xml_file {
                page.links_ssg(base, client, &self.domain_parsed).await
            } else {
                Default::default()
            };

            if xml_file {
                page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
                    .await;
            }

            // Record initial-crawl metrics used by status reporting.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            // Give the user callback a chance to veto further crawling.
            if let Some(ref cb) = self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        }
    }
3449
3450 #[cfg(all(not(feature = "decentralized"), feature = "chrome",))]
3452 pub async fn crawl_establish_chrome_one(
3453 &self,
3454 client: &Client,
3455 base: &mut RelativeSelectors,
3456 url: &Option<&str>,
3457 chrome_page: &chromiumoxide::Page,
3458 ) -> HashSet<CaseInsensitiveString> {
3459 if self
3460 .is_allowed_default(self.get_base_link())
3461 .eq(&ProcessLinkStatus::Allowed)
3462 {
3463 let (_, intercept_handle) = tokio::join!(
3464 crate::features::chrome::setup_chrome_events(chrome_page, &self.configuration),
3465 self.setup_chrome_interception(chrome_page)
3466 );
3467
3468 let mut page = Page::new(
3469 url.unwrap_or(self.url.inner()),
3470 client,
3471 chrome_page,
3472 &self.configuration.wait_for,
3473 &self.configuration.screenshot,
3474 false, &self.configuration.openai_config,
3476 &self.configuration.execution_scripts,
3477 &self.configuration.automation_scripts,
3478 &self.configuration.viewport,
3479 &self.configuration.request_timeout,
3480 &self.configuration.track_events,
3481 self.configuration.referer.clone(),
3482 self.configuration.max_page_bytes,
3483 self.configuration.get_cache_options(),
3484 &self.configuration.cache_policy,
3485 &self.configuration.remote_multimodal,
3486 )
3487 .await;
3488
3489 let mut retry_count = self.configuration.retry;
3490
3491 if let Some(final_redirect_destination) = &page.final_redirect_destination {
3492 if final_redirect_destination == "chrome-error://chromewebdata/"
3493 && page.status_code.is_success()
3494 && page.is_empty()
3495 && self.configuration.proxies.is_some()
3496 {
3497 page.error_status = Some("Invalid proxy configuration.".into());
3498 page.should_retry = true;
3499 page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
3500 }
3501 }
3502
3503 while page.should_retry && retry_count > 0 {
3504 retry_count -= 1;
3505 if let Some(timeout) = page.get_timeout() {
3506 tokio::time::sleep(timeout).await;
3507 }
3508 if page.status_code == StatusCode::GATEWAY_TIMEOUT {
3509 if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
3510 let next_page = Page::new(
3511 self.url.inner(),
3512 client,
3513 chrome_page,
3514 &self.configuration.wait_for,
3515 &self.configuration.screenshot,
3516 false, &self.configuration.openai_config,
3518 &self.configuration.execution_scripts,
3519 &self.configuration.automation_scripts,
3520 &self.configuration.viewport,
3521 &self.configuration.request_timeout,
3522 &self.configuration.track_events,
3523 self.configuration.referer.clone(),
3524 self.configuration.max_page_bytes,
3525 self.configuration.get_cache_options(),
3526 &self.configuration.cache_policy,
3527 &self.configuration.remote_multimodal,
3528 )
3529 .await;
3530 page.clone_from(&next_page);
3531 })
3532 .await
3533 {
3534 log::warn!("backoff timeout {elasped}");
3535 }
3536 } else {
3537 let next_page = Page::new(
3538 self.url.inner(),
3539 client,
3540 chrome_page,
3541 &self.configuration.wait_for,
3542 &self.configuration.screenshot,
3543 false, &self.configuration.openai_config,
3545 &self.configuration.execution_scripts,
3546 &self.configuration.automation_scripts,
3547 &self.configuration.viewport,
3548 &self.configuration.request_timeout,
3549 &self.configuration.track_events,
3550 self.configuration.referer.clone(),
3551 self.configuration.max_page_bytes,
3552 self.configuration.get_cache_options(),
3553 &self.configuration.cache_policy,
3554 &self.configuration.remote_multimodal,
3555 )
3556 .await;
3557 page.clone_from(&next_page);
3558 }
3559
3560 if let Some(final_redirect_destination) = &page.final_redirect_destination {
3562 if final_redirect_destination == "chrome-error://chromewebdata/"
3563 && page.status_code.is_success()
3564 && page.is_empty()
3565 && self.configuration.proxies.is_some()
3566 {
3567 page.error_status = Some("Invalid proxy configuration.".into());
3568 page.should_retry = true;
3569 page.status_code = *crate::page::CHROME_UNKNOWN_STATUS_ERROR;
3570 }
3571 }
3572 }
3573
3574 if let Some(h) = intercept_handle {
3575 let abort_handle = h.abort_handle();
3576 if let Err(elasped) =
3577 tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
3578 {
3579 log::warn!("Handler timeout exceeded {elasped}");
3580 abort_handle.abort();
3581 }
3582 }
3583
3584 if let Some(domain) = &page.final_redirect_destination {
3585 let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
3586 let s = self.setup_selectors();
3587
3588 base.0 = s.0;
3589 base.1 = s.1;
3590
3591 if let Some(pdname) = parse_absolute_url(&domain) {
3592 if let Some(dname) = pdname.host_str() {
3593 base.2 = dname.into();
3594 }
3595 }
3596 }
3597
3598 emit_log(self.url.inner());
3599
3600 if self.configuration.return_page_links && page.page_links.is_none() {
3601 page.page_links = Some(Box::default());
3602 }
3603
3604 let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");
3605
3606 let mut links = if !page.is_empty() && !xml_file {
3607 page.links_ssg(base, client, &self.domain_parsed).await
3608 } else {
3609 Default::default()
3610 };
3611
3612 if xml_file {
3613 page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
3614 .await;
3615 }
3616
3617 if let Some(ref cb) = self.on_should_crawl_callback {
3618 if !cb.call(&page) {
3619 page.blocked_crawl = true;
3620 channel_send_page(&self.channel, page, &self.channel_guard);
3621 return Default::default();
3622 }
3623 }
3624
3625 channel_send_page(&self.channel, page, &self.channel_guard);
3626
3627 links
3628 } else {
3629 HashSet::new()
3630 }
3631 }
3632
3633 #[cfg(all(
3635 feature = "webdriver",
3636 not(feature = "decentralized"),
3637 not(feature = "chrome")
3638 ))]
3639 pub async fn crawl_establish_webdriver_one(
3640 &self,
3641 client: &Client,
3642 base: &mut RelativeSelectors,
3643 url: &Option<&str>,
3644 driver: &std::sync::Arc<thirtyfour::WebDriver>,
3645 ) -> HashSet<CaseInsensitiveString> {
3646 if self
3647 .is_allowed_default(self.get_base_link())
3648 .eq(&ProcessLinkStatus::Allowed)
3649 {
3650 let timeout = self
3651 .configuration
3652 .webdriver_config
3653 .as_ref()
3654 .and_then(|c| c.timeout);
3655
3656 crate::features::webdriver::setup_driver_events(driver, &self.configuration).await;
3658
3659 let mut page =
3660 Page::new_page_webdriver(url.unwrap_or(self.url.inner()), driver, timeout).await;
3661
3662 let mut retry_count = self.configuration.retry;
3663
3664 while page.should_retry && retry_count > 0 {
3665 retry_count -= 1;
3666 if let Some(timeout_duration) = page.get_timeout() {
3667 tokio::time::sleep(timeout_duration).await;
3668 }
3669 if page.status_code == StatusCode::GATEWAY_TIMEOUT {
3670 if let Err(elapsed) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
3671 let next_page =
3672 Page::new_page_webdriver(self.url.inner(), driver, timeout).await;
3673 page.clone_from(&next_page);
3674 })
3675 .await
3676 {
3677 log::warn!("backoff timeout {elapsed}");
3678 }
3679 } else {
3680 let next_page =
3681 Page::new_page_webdriver(self.url.inner(), driver, timeout).await;
3682 page.clone_from(&next_page);
3683 }
3684 }
3685
3686 if let Some(domain) = &page.final_redirect_destination {
3687 let domain: Box<CaseInsensitiveString> = CaseInsensitiveString::new(&domain).into();
3688 let s = self.setup_selectors();
3689
3690 base.0 = s.0;
3691 base.1 = s.1;
3692
3693 if let Some(pdname) = parse_absolute_url(&domain) {
3694 if let Some(dname) = pdname.host_str() {
3695 base.2 = dname.into();
3696 }
3697 }
3698 }
3699
3700 emit_log(self.url.inner());
3701
3702 if self.configuration.return_page_links && page.page_links.is_none() {
3703 page.page_links = Some(Box::default());
3704 }
3705
3706 let xml_file = page.get_html_bytes_u8().starts_with(b"<?xml");
3707
3708 let mut links = if !page.is_empty() && !xml_file {
3709 page.links_ssg(base, client, &self.domain_parsed).await
3710 } else {
3711 Default::default()
3712 };
3713
3714 if xml_file {
3715 page.links_stream_xml_links_stream_base(base, &page.get_html(), &mut links, &None)
3716 .await;
3717 }
3718
3719 if let Some(ref cb) = self.on_should_crawl_callback {
3720 if !cb.call(&page) {
3721 page.blocked_crawl = true;
3722 channel_send_page(&self.channel, page, &self.channel_guard);
3723 return Default::default();
3724 }
3725 }
3726
3727 channel_send_page(&self.channel, page, &self.channel_guard);
3728
3729 links
3730 } else {
3731 HashSet::new()
3732 }
3733 }
3734
3735 #[cfg(all(not(feature = "glob"), feature = "decentralized"))]
3737 pub async fn crawl_establish(
3738 &mut self,
3739 client: &Client,
3740 _: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
3741 http_worker: bool,
3742 ) -> HashSet<CaseInsensitiveString> {
3743 let links: HashSet<CaseInsensitiveString> = if self
3745 .is_allowed_default(&self.get_base_link())
3746 .eq(&ProcessLinkStatus::Allowed)
3747 {
3748 let link = self.url.inner();
3749
3750 let mut page = Page::new_page_with_cache(
3751 &if http_worker && link.starts_with("https") {
3752 link.replacen("https", "http", 1)
3753 } else {
3754 link.to_string()
3755 },
3756 &client,
3757 self.configuration.get_cache_options(),
3758 &self.configuration.cache_policy,
3759 )
3760 .await;
3761
3762 if let Some(sid) = page.signature {
3763 self.insert_signature(sid).await;
3764 }
3765
3766 self.insert_link(match &self.on_link_find_callback {
3767 Some(cb) => cb(*self.url.to_owned(), None).0,
3768 _ => *self.url.to_owned(),
3769 })
3770 .await;
3771
3772 self.initial_status_code = page.status_code;
3773 self.initial_html_length = page.get_html_bytes_u8().len();
3774 self.initial_anti_bot_tech = page.anti_bot_tech;
3775 self.initial_page_should_retry = page.should_retry;
3776 self.initial_page_waf_check = page.waf_check;
3777
3778 if self.configuration.return_page_links {
3780 page.page_links = Some(page.links.clone().into());
3781 }
3782
3783 let links = HashSet::from(page.links.clone());
3784
3785 self.set_crawl_initial_status(&page, &links);
3786
3787 channel_send_page(&self.channel, page, &self.channel_guard);
3788
3789 links
3790 } else {
3791 HashSet::new()
3792 };
3793
3794 links
3795 }
3796
3797 #[cfg(all(feature = "glob", feature = "decentralized"))]
3799 pub async fn crawl_establish(
3800 &mut self,
3801 client: &Client,
3802 _: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
3803 http_worker: bool,
3804 ) -> HashSet<CaseInsensitiveString> {
3805 let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
3806 let expanded = self.get_expanded_links(self.url.inner().as_str());
3807 self.configuration.configure_allowlist();
3808
3809 for link in expanded {
3810 let allowed = self.is_allowed(&link);
3811
3812 if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
3813 break;
3814 }
3815 if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
3816 continue;
3817 }
3818
3819 let mut page = Page::new_page_with_cache(
3820 &if http_worker && link.as_ref().starts_with("https") {
3821 link.inner().replacen("https", "http", 1).to_string()
3822 } else {
3823 link.inner().to_string()
3824 },
3825 client,
3826 self.configuration.get_cache_options(),
3827 &self.configuration.cache_policy,
3828 )
3829 .await;
3830
3831 let u = page.get_url();
3832 let u = if u.is_empty() { link } else { u.into() };
3833
3834 let link_result = match &self.on_link_find_callback {
3835 Some(cb) => cb(u, None),
3836 _ => (u, None),
3837 };
3838
3839 if let Some(sid) = page.signature {
3840 self.insert_signature(sid).await;
3841 }
3842
3843 self.insert_link(link_result.0).await;
3844
3845 if self.configuration.return_page_links {
3846 page.page_links = Some(Default::default());
3847 }
3848
3849 channel_send_page(&self.channel, page.clone(), &self.channel_guard);
3850
3851 let page_links = page.links;
3852
3853 links.extend(page_links);
3854 }
3855
3856 links
3857 }
3858
    /// Glob + chrome establishment: expand the base URL into its glob variants
    /// and render each one through the shared Chrome page, collecting every
    /// discovered link.
    #[cfg(all(feature = "glob", feature = "chrome", not(feature = "decentralized")))]
    pub async fn crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
        page: &chromiumoxide::Page,
    ) -> HashSet<CaseInsensitiveString> {
        // Respect an explicit request to skip the initial establishment pass.
        if self.skip_initial {
            return Default::default();
        }
        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
        let expanded = self.get_expanded_links(&self.url.inner().as_str());
        self.configuration.configure_allowlist();

        for link in expanded {
            let allowed = self.is_allowed(&link);

            // Budget exhaustion stops the whole expansion, not just this link.
            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                break;
            }
            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                continue;
            }

            // Render the candidate URL in the shared Chrome tab.
            let mut page = Page::new(
                &link.inner().as_str(),
                &client,
                &page,
                &self.configuration.wait_for,
                &self.configuration.screenshot,
                false, &self.configuration.openai_config,
                &self.configuration.execution_scripts,
                &self.configuration.automation_scripts,
                &self.configuration.viewport,
                &self.configuration.request_timeout,
                &self.configuration.track_events,
                self.configuration.referer.clone(),
                self.configuration.max_page_bytes,
                self.configuration.get_cache_options(),
                &self.configuration.cache_policy,
                &self.configuration.remote_multimodal,
            )
            .await;

            // Prefer the final URL reported by the page; fall back to the
            // candidate link when none is available.
            let u = page.get_url();
            let u = if u.is_empty() { link } else { u.into() };

            let link_result = match &self.on_link_find_callback {
                Some(cb) => cb(u, None),
                _ => (u, None),
            };

            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            self.insert_link(link_result.0).await;

            if self.configuration.return_page_links {
                // Populate page_links before extraction so the clone sent to
                // subscribers carries the discovered links.
                page.page_links = Some(Default::default());
                let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);

                channel_send_page(&self.channel, page.clone(), &self.channel_guard);

                links.extend(next_links);
            } else {
                // Without page-link tracking the page can be forwarded before
                // link extraction runs; the branch ordering is intentional.
                channel_send_page(&self.channel, page.clone(), &self.channel_guard);
                let next_links = HashSet::from(page.links(&base, &self.domain_parsed).await);

                links.extend(next_links);
            }
        }

        links
    }
3937
    /// Glob-expansion establishment over plain HTTP: stream each expanded URL,
    /// retrying gateway timeouts with a bounded backoff, and accumulate every
    /// discovered link.
    #[cfg(feature = "glob")]
    async fn _crawl_establish(
        &mut self,
        client: &Client,
        base: &mut RelativeSelectors,
        _: bool,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }
        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
        let domain_name = self.url.inner();
        let expanded = self.get_expanded_links(domain_name.as_str());

        self.configuration.configure_allowlist();

        for url in expanded {
            // With the regex feature the full case-insensitive URL is matched;
            // otherwise only the inner compact string is checked.
            #[cfg(feature = "regex")]
            let url_ref: &CaseInsensitiveString = &url;
            #[cfg(not(feature = "regex"))]
            let url_ref: &CompactString = url.inner();
            if self
                .is_allowed_default(url_ref)
                .eq(&ProcessLinkStatus::Allowed)
            {
                let mut links_ssg = HashSet::new();
                // Only track per-page links when the caller asked for them.
                let mut links_pages = if self.configuration.return_page_links {
                    Some(HashSet::new())
                } else {
                    None
                };
                let mut page_links_settings =
                    PageLinkBuildSettings::new(true, self.configuration.full_resources);

                page_links_settings.subdomains = self.configuration.subdomains;
                page_links_settings.tld = self.configuration.tld;
                page_links_settings.normalize = self.configuration.normalize;

                // Temporarily take the parsed domain so the streaming fetch can
                // update it in place (e.g. after a redirect).
                let mut domain_parsed = self.domain_parsed.take();

                let mut page = Page::new_page_streaming(
                    &url,
                    client,
                    false,
                    base,
                    &self.configuration.external_domains_caseless,
                    &page_links_settings,
                    &mut links,
                    Some(&mut links_ssg),
                    &domain_parsed, &mut self.domain_parsed,
                    &mut links_pages,
                )
                .await;

                // Restore the parsed domain if the fetch did not set a new one.
                if self.domain_parsed.is_none() {
                    if let Some(mut domain_parsed) = domain_parsed.take() {
                        convert_abs_url(&mut domain_parsed);
                        self.domain_parsed.replace(domain_parsed);
                    }
                }

                let mut retry_count = self.configuration.retry;
                let domains_caseless = &self.configuration.external_domains_caseless;

                while page.should_retry && retry_count > 0 {
                    retry_count -= 1;
                    if let Some(timeout) = page.get_timeout() {
                        tokio::time::sleep(timeout).await;
                    }

                    if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                        // Work on a clone of the parsed domain so a timed-out
                        // refetch cannot leave it half-updated.
                        let mut domain_parsed_clone = self.domain_parsed.clone();

                        // Bound the backoff retry so a stalled upstream cannot
                        // hang the establishment pass.
                        if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                            page.clone_from(
                                &Page::new_page_streaming(
                                    &url,
                                    client,
                                    false,
                                    base,
                                    domains_caseless,
                                    &page_links_settings,
                                    &mut links,
                                    Some(&mut links_ssg),
                                    &domain_parsed,
                                    &mut domain_parsed_clone,
                                    &mut links_pages,
                                )
                                .await,
                            );
                        })
                        .await
                        {
                            log::info!("backoff gateway timeout exceeded {elasped}");
                        }

                        self.domain_parsed = domain_parsed_clone;
                    } else {
                        page.clone_from(
                            &Page::new_page_streaming(
                                &url,
                                client,
                                false,
                                base,
                                &self.configuration.external_domains_caseless,
                                &page_links_settings,
                                &mut links,
                                Some(&mut links_ssg),
                                &domain_parsed,
                                &mut self.domain_parsed,
                                &mut links_pages,
                            )
                            .await,
                        );
                    }
                }

                emit_log(&url);

                // Duplicate-content detection: bail out entirely when the page
                // signature was already seen and is not allowed again.
                if let Some(signature) = page.signature {
                    if !self.is_signature_allowed(signature).await {
                        return Default::default();
                    }
                    self.insert_signature(signature).await;
                }

                self.insert_link(
                    self.on_link_find_callback
                        .as_ref()
                        .map(|cb| cb(*self.url.clone(), None).0)
                        .unwrap_or_else(|| *self.url.clone()),
                )
                .await;

                if self.configuration.return_page_links {
                    page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
                }

                links.extend(links_ssg);

                // Record the initial response characteristics for status checks.
                self.initial_status_code = page.status_code;
                self.initial_html_length = page.get_html_bytes_u8().len();
                self.initial_anti_bot_tech = page.anti_bot_tech;
                self.initial_page_should_retry = page.should_retry;
                self.initial_page_waf_check = page.waf_check;

                self.set_crawl_initial_status(&page, &links);

                if let Some(ref cb) = self.on_should_crawl_callback {
                    if !cb.call(&page) {
                        page.blocked_crawl = true;
                        channel_send_page(&self.channel, page, &self.channel_guard);
                        return Default::default();
                    }
                }

                channel_send_page(&self.channel, page, &self.channel_guard);
            }
        }

        links
    }
4102
    /// Smart-mode establishment: fetch the base page over HTTP first and
    /// escalate to a Chrome render when retries or client errors suggest the
    /// raw response is insufficient.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    pub async fn crawl_establish_smart(
        &mut self,
        client: &Client,
        mut base: &mut RelativeSelectors,
        browser: &crate::features::chrome::OnceBrowser,
    ) -> HashSet<CaseInsensitiveString> {
        if self.skip_initial {
            return Default::default();
        }

        let links: HashSet<CaseInsensitiveString> = if self
            .is_allowed_default(&self.get_base_link())
            .eq(&ProcessLinkStatus::Allowed)
        {
            let url = self.url.inner();

            // A pre-seeded page (e.g. provided HTML) bypasses the network fetch.
            let mut page = if let Some(seeded_page) = self.build_seed_page() {
                seeded_page
            } else {
                Page::new_page_with_cache(
                    &url,
                    &client,
                    self.configuration.get_cache_options(),
                    &self.configuration.cache_policy,
                )
                .await
            };

            let mut retry_count = self.configuration.retry;

            while page.should_retry && retry_count > 0 {
                retry_count -= 1;
                if let Some(timeout) = page.get_timeout() {
                    tokio::time::sleep(timeout).await;
                }
                let client_error = page.status_code.is_client_error();

                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                    // Bounded backoff: alternate between a Chrome render and a
                    // plain refetch across retries (power-of-two heuristic).
                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                        if retry_count.is_power_of_two() {
                            Website::render_chrome_page(
                                &self.configuration,
                                client,
                                &mut page,
                                url,
                                &self.domain_parsed,
                                browser,
                            )
                            .await;
                        } else {
                            let next_page = Page::new_page_with_cache(
                                url,
                                &client,
                                self.configuration.get_cache_options(),
                                &self.configuration.cache_policy,
                            )
                            .await;
                            page.clone_from(&next_page);
                        };
                    })
                    .await
                    {
                        log::warn!("backoff timeout {elasped}");
                    }
                } else {
                    // Client errors escalate straight to Chrome rendering.
                    if retry_count.is_power_of_two() || client_error {
                        Website::render_chrome_page(
                            &self.configuration,
                            client,
                            &mut page,
                            url,
                            &self.domain_parsed,
                            browser,
                        )
                        .await
                    } else {
                        page.clone_from(
                            &Page::new_page_with_cache(
                                url,
                                &client,
                                self.configuration.get_cache_options(),
                                &self.configuration.cache_policy,
                            )
                            .await,
                        );
                    }
                }
            }

            // Smart link extraction may itself drive the browser; it reports
            // any extra bytes transferred during rendering.
            let (page_links, bytes_transferred): (HashSet<CaseInsensitiveString>, Option<f64>) =
                page.smart_links(
                    &base,
                    &self.configuration,
                    &self.domain_parsed,
                    &browser,
                    Some(&self.cookie_jar),
                )
                .await;

            // Rebind selectors and the stored URL when the page redirected.
            if let Some(domain) = &page.final_redirect_destination {
                let prior_domain = self.domain_parsed.take();
                crate::utils::modify_selectors(
                    &prior_domain,
                    domain,
                    &mut self.domain_parsed,
                    &mut self.url,
                    &mut base,
                    AllowedDomainTypes::new(self.configuration.subdomains, self.configuration.tld),
                );
            }

            emit_log(&self.url.inner());

            if let Some(sid) = page.signature {
                self.insert_signature(sid).await;
            }

            self.insert_link(
                self.on_link_find_callback
                    .as_ref()
                    .map(|cb| cb(*self.url.clone(), None).0)
                    .unwrap_or_else(|| *self.url.clone()),
            )
            .await;

            let links = if !page_links.is_empty() {
                page_links
            } else {
                Default::default()
            };

            page.bytes_transferred = bytes_transferred;

            // Record the initial response characteristics for status checks.
            self.initial_status_code = page.status_code;
            self.initial_html_length = page.get_html_bytes_u8().len();
            self.initial_anti_bot_tech = page.anti_bot_tech;
            self.initial_page_should_retry = page.should_retry;
            self.initial_page_waf_check = page.waf_check;

            self.set_crawl_initial_status(&page, &links);

            if self.configuration.return_page_links {
                page.page_links = if links.is_empty() {
                    None
                } else {
                    Some(Box::new(links.clone()))
                };
            }

            if let Some(cb) = &mut self.on_should_crawl_callback {
                if !cb.call(&page) {
                    page.blocked_crawl = true;
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    return Default::default();
                }
            }

            channel_send_page(&self.channel, page, &self.channel_guard);

            links
        } else {
            HashSet::new()
        };

        links
    }
4271
    /// Render `url` in a Chrome tab and replace `page` with the rendered
    /// result. Lazily initializes the shared browser on first use; failure to
    /// obtain a browser or tab leaves `page` untouched.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    pub async fn render_chrome_page(
        config: &Configuration,
        client: &Client,
        page: &mut Page,
        url: &str,
        base: &Option<Box<Url>>,
        browser: &crate::features::chrome::OnceBrowser,
    ) {
        if let Some(browser_controller) = browser
            .get_or_init(|| crate::website::Website::setup_browser_base(&config, &base, None))
            .await
        {
            if let Ok(chrome_page) = crate::features::chrome::attempt_navigation(
                "about:blank",
                &browser_controller.browser.0,
                &config.request_timeout,
                &browser_controller.browser.2,
                &config.viewport,
            )
            .await
            {
                // Wire up page events and request interception concurrently.
                let (_, intercept_handle) = tokio::join!(
                    crate::features::chrome::setup_chrome_events(&chrome_page, &config),
                    crate::features::chrome::setup_chrome_interception_base(
                        &chrome_page,
                        config.chrome_intercept.enabled,
                        &config.auth_challenge_response,
                        config.chrome_intercept.block_visuals,
                        &url,
                    )
                );

                let next_page = Page::new(
                    &url,
                    &client,
                    &chrome_page,
                    &config.wait_for,
                    &config.screenshot,
                    false, &config.openai_config,
                    &config.execution_scripts,
                    &config.automation_scripts,
                    &config.viewport,
                    &config.request_timeout,
                    &config.track_events,
                    config.referer.clone(),
                    config.max_page_bytes,
                    config.get_cache_options(),
                    &config.cache_policy,
                    &config.remote_multimodal,
                )
                .await;

                page.clone_from(&next_page);

                // Give the interception task a bounded window to finish, then
                // abort it so it cannot leak.
                if let Some(h) = intercept_handle {
                    let abort_handle = h.abort_handle();
                    if let Err(elasped) =
                        tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                    {
                        log::warn!("Handler timeout exceeded {elasped}");
                        abort_handle.abort();
                    }
                }
            }
        }
    }
4341
4342 pub fn set_crawl_status(&mut self) {
4344 if self.status == CrawlStatus::Start || self.status == CrawlStatus::Active {
4345 self.status = if self.domain_parsed.is_none() {
4346 CrawlStatus::Invalid
4347 } else {
4348 CrawlStatus::Idle
4349 };
4350 }
4351 }
4352
4353 pub fn setup_semaphore(&self) -> Arc<Semaphore> {
4355 if self.configuration.shared_queue {
4356 SEM_SHARED.clone()
4357 } else {
4358 Arc::new(Semaphore::const_new(
4359 self.configuration
4360 .concurrency_limit
4361 .unwrap_or(*DEFAULT_PERMITS),
4362 ))
4363 }
4364 }
4365
4366 #[cfg(any(
4369 feature = "cache",
4370 feature = "cache_mem",
4371 feature = "chrome_remote_cache"
4372 ))]
4373 async fn try_cache_shortcircuit(&mut self) -> bool {
4374 use crate::utils::{cache_skip_browser, get_cached_url};
4375
4376 self.configuration.configure_budget();
4378
4379 if !self.single_page() {
4380 return false;
4381 }
4382
4383 let cache_options = self.configuration.get_cache_options();
4384 if !cache_skip_browser(&cache_options) {
4385 return false;
4386 }
4387
4388 let target_url = self.url.inner().to_string();
4389
4390 if let Some(html) = get_cached_url(
4391 &target_url,
4392 cache_options.as_ref(),
4393 &self.configuration.cache_policy,
4394 )
4395 .await
4396 {
4397 self.status = CrawlStatus::Active;
4398 let page_response = crate::utils::build_cached_html_page_response(&target_url, &html);
4399 let page = build(&target_url, page_response);
4400 self.initial_status_code = page.status_code;
4401 self.initial_html_length = page.get_html_bytes_u8().len();
4402 self.links_visited
4403 .insert(CaseInsensitiveString::from(target_url.as_str()));
4404 channel_send_page(&self.channel, page, &self.channel_guard);
4405 self.subscription_guard().await;
4406 return true;
4407 }
4408
4409 false
4410 }
4411
    /// No-op cache shortcut when no cache feature is compiled in; always
    /// reports that the crawl must proceed over the network.
    #[cfg(not(any(
        feature = "cache",
        feature = "cache_mem",
        feature = "chrome_remote_cache"
    )))]
    async fn try_cache_shortcircuit(&mut self) -> bool {
        false
    }
4421
    /// Attempt to serve an entire crawl from the HTTP cache before touching
    /// the network. Returns `true` when every reachable page was cached (the
    /// crawl is complete); cache misses are queued on `extra_links` and
    /// `false` is returned so the live crawl can pick them up.
    #[cfg(any(
        feature = "cache",
        feature = "cache_mem",
        feature = "chrome_remote_cache"
    ))]
    async fn crawl_cache_phase(&mut self, _client: &Client) -> bool {
        use crate::utils::{build_cached_html_page_response, cache_skip_browser, get_cached_url};

        let cache_options = self.configuration.get_cache_options();
        if !cache_skip_browser(&cache_options) {
            return false;
        }

        self.configuration.configure_budget();
        self.configuration.configure_allowlist();

        let target_url = self.url.inner().to_string();

        // The root page must be cached, otherwise fall back to a live crawl.
        let html = match get_cached_url(
            &target_url,
            cache_options.as_ref(),
            &self.configuration.cache_policy,
        )
        .await
        {
            Some(h) => h,
            None => return false, };

        self.status = CrawlStatus::Active;
        let selectors = self.setup_selectors();
        let full_resources = self.configuration.full_resources;
        let return_page_links = self.configuration.return_page_links;
        let normalize = self.configuration.normalize;

        let page_response = build_cached_html_page_response(&target_url, &html);
        let mut page = build(&target_url, page_response);

        if !self.configuration.external_domains_caseless.is_empty() {
            page.set_external(self.configuration.external_domains_caseless.clone());
        }
        page.set_url_parsed_direct();
        if return_page_links {
            page.page_links = Some(Default::default());
        }

        // Extract links from the cached HTML; the page base is taken for the
        // call and cleared afterwards.
        let page_base = page.base.take().map(Box::new);
        let mut links: HashSet<CaseInsensitiveString> = if full_resources {
            page.links_full(&selectors, &page_base).await
        } else {
            page.links(&selectors, &page_base).await
        };
        page.base = None;

        // Normalization hashes the HTML for duplicate-content detection.
        if normalize {
            page.signature
                .replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
        }

        self.initial_status_code = page.status_code;
        self.initial_html_length = page.get_html_bytes_u8().len();

        let url = match &self.on_link_find_callback {
            Some(cb) => cb(*self.url.clone(), None).0,
            _ => *self.url.clone(),
        };
        self.insert_link(url).await;
        self.links_visited
            .insert(CaseInsensitiveString::from(target_url.as_str()));

        emit_log(&target_url);

        // A disallowed duplicate signature still forwards the page but ends
        // the cache phase as complete.
        if normalize {
            if let Some(sig) = page.signature {
                if !self.is_signature_allowed(sig).await {
                    channel_send_page(&self.channel, page, &self.channel_guard);
                    self.subscription_guard().await;
                    return true;
                }
                self.insert_signature(sig).await;
            }
        }

        self.set_crawl_initial_status(&page, &links);

        if let Some(ref cb) = self.on_should_crawl_callback {
            if !cb.call(&page) {
                page.blocked_crawl = true;
                channel_send_page(&self.channel, page, &self.channel_guard);
                self.subscription_guard().await;
                return true; }
        }

        channel_send_page(&self.channel, page, &self.channel_guard);

        if self.single_page() {
            self.subscription_guard().await;
            return true;
        }

        // Breadth-first walk of the cached link graph. Any link without a
        // cached copy is recorded as a miss.
        let mut cache_misses: HashSet<CaseInsensitiveString> = HashSet::new();

        'cache_loop: loop {
            let current_links: Vec<CaseInsensitiveString> = links.drain().collect();
            if current_links.is_empty() {
                break;
            }

            for link in current_links {
                let allowed = self.is_allowed(&link);
                if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                    break 'cache_loop;
                }
                if allowed.eq(&ProcessLinkStatus::Blocked) {
                    continue;
                }

                let link_url = link.inner().to_string();

                match get_cached_url(
                    &link_url,
                    cache_options.as_ref(),
                    &self.configuration.cache_policy,
                )
                .await
                {
                    Some(cached_html) => {
                        emit_log(&link_url);
                        self.insert_link(link.clone()).await;

                        let page_response =
                            build_cached_html_page_response(&link_url, &cached_html);
                        let mut page = build(&link_url, page_response);

                        if !self.configuration.external_domains_caseless.is_empty() {
                            page.set_external(self.configuration.external_domains_caseless.clone());
                        }
                        page.set_url_parsed_direct();
                        if return_page_links {
                            page.page_links = Some(Default::default());
                        }

                        let page_base = page.base.take().map(Box::new);
                        let new_links = if full_resources {
                            page.links_full(&selectors, &page_base).await
                        } else {
                            page.links(&selectors, &page_base).await
                        };
                        page.base = None;

                        // Skip pages whose content hash was already seen.
                        if normalize {
                            page.signature
                                .replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                            if let Some(sig) = page.signature {
                                if !self.is_signature_allowed(sig).await {
                                    continue;
                                }
                                self.insert_signature(sig).await;
                            }
                        }

                        if let Some(ref cb) = self.on_should_crawl_callback {
                            if !cb.call(&page) {
                                page.blocked_crawl = true;
                                channel_send_page(&self.channel, page, &self.channel_guard);
                                continue;
                            }
                        }

                        channel_send_page(&self.channel, page, &self.channel_guard);
                        links.extend(new_links);
                    }
                    None => {
                        cache_misses.insert(link);
                    }
                }
            }
        }

        // Misses are re-queued so the live crawl can fetch them.
        if !cache_misses.is_empty() {
            self.extra_links.extend(cache_misses);
            return false; }

        self.subscription_guard().await;
        true }
4621
    /// Cache phase is a no-op when no cache feature is compiled in; always
    /// proceed with a live crawl.
    #[cfg(not(any(
        feature = "cache",
        feature = "cache_mem",
        feature = "chrome_remote_cache"
    )))]
    async fn crawl_cache_phase(&mut self, _client: &Client) -> bool {
        false
    }
4631
4632 pub async fn crawl(&mut self) {
4634 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4635 self.start();
4636 if self.try_cache_shortcircuit().await {
4637 self.set_crawl_status();
4638 return;
4639 }
4640 let (client, handle) = self.setup().await;
4641 let (handle, join_handle) = match handle {
4642 Some(h) => (Some(h.0), Some(h.1)),
4643 _ => (None, None),
4644 };
4645 self.crawl_concurrent(&client, &handle).await;
4646 self.sitemap_crawl_chain(&client, &handle, false).await;
4647 self.set_crawl_status();
4648 if let Some(h) = join_handle {
4649 h.abort()
4650 }
4651 self.client.replace(client);
4652 }
4653 }
4654
4655 pub async fn crawl_sitemap(&mut self) {
4657 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4658 self.start();
4659 if self.try_cache_shortcircuit().await {
4660 self.set_crawl_status();
4661 return;
4662 }
4663 let (client, handle) = self.setup().await;
4664 let (handle, join_handle) = match handle {
4665 Some(h) => (Some(h.0), Some(h.1)),
4666 _ => (None, None),
4667 };
4668 self.sitemap_crawl(&client, &handle, false).await;
4669 self.set_crawl_status();
4670 if let Some(h) = join_handle {
4671 h.abort()
4672 }
4673 self.client.replace(client);
4674 }
4675 }
4676
4677 #[cfg(all(
4679 feature = "sitemap",
4680 feature = "chrome",
4681 not(feature = "decentralized")
4682 ))]
4683 pub async fn crawl_sitemap_chrome(&mut self) {
4684 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4685 self.start();
4686 let (client, handle) = self.setup().await;
4687 let (handle, join_handle) = match handle {
4688 Some(h) => (Some(h.0), Some(h.1)),
4689 _ => (None, None),
4690 };
4691 self.sitemap_crawl_chrome(&client, &handle, false).await;
4692 self.set_crawl_status();
4693 if let Some(h) = join_handle {
4694 h.abort()
4695 }
4696 self.client.replace(client);
4697 }
4698 }
4699
    /// Prime the website for external send-style crawling: mark it active,
    /// run the async setup (client plus robots handling), and lock in the
    /// allow list so subsequent sends need no reconfiguration.
    pub async fn configure_setup(&mut self) {
        self.status = CrawlStatus::Active;
        self.start();
        self.setup().await;
        self.configuration.configure_allowlist();
        self.send_configured = true;
    }
4708
    /// Prime the website for send-style crawling without consulting
    /// robots.txt; only the synchronous base setup runs.
    pub fn configure_setup_norobots(&mut self) {
        self.status = CrawlStatus::Active;
        self.start();
        self.setup_base();
        self.configuration.configure_allowlist();
        self.send_configured = true;
    }
4718
4719 #[cfg(not(feature = "decentralized"))]
4720 pub async fn crawl_raw_send(&self, url: Option<&str>) {
4725 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4726 let (client, handle) = (
4727 match &self.client {
4728 Some(c) => c.to_owned(),
4729 _ => self.configure_http_client(),
4730 },
4731 self.configure_handler(),
4732 );
4733 let (handle, join_handle) = match handle {
4734 Some(h) => (Some(h.0), Some(h.1)),
4735 _ => (None, None),
4736 };
4737 self.crawl_concurrent_raw_send(&client, &handle, &url).await;
4738 if let Some(h) = join_handle {
4739 h.abort()
4740 }
4741 }
4742 }
4743
4744 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4745 pub async fn crawl_chrome_send(&self, url: Option<&str>) {
4749 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4750 let (client, handle) = (
4751 match &self.client {
4752 Some(c) => c.to_owned(),
4753 _ => self.configure_http_client(),
4754 },
4755 self.configure_handler(),
4756 );
4757 let (handle, join_handle) = match handle {
4758 Some(h) => (Some(h.0), Some(h.1)),
4759 _ => (None, None),
4760 };
4761 self.crawl_concurrent_send(&client, &handle, &url).await;
4762 if let Some(h) = join_handle {
4763 h.abort()
4764 }
4765 }
4766 }
4767
    /// Chrome send is unavailable in decentralized mode; this stub keeps the
    /// API surface consistent across feature combinations.
    #[cfg(all(feature = "chrome", feature = "decentralized"))]
    pub async fn crawl_chrome_send(&self, _url: Option<&str>) {}
4771
4772 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4773 pub async fn fetch_chrome(&self, url: Option<&str>) {
4775 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4776 let (client, handle) = (
4777 match &self.client {
4778 Some(c) => c.to_owned(),
4779 _ => self.configure_http_client(),
4780 },
4781 self.configure_handler(),
4782 );
4783 let (_handle, join_handle) = match handle {
4784 Some(h) => (Some(h.0), Some(h.1)),
4785 _ => (None, None),
4786 };
4787 self._fetch_chrome(&client, &url).await;
4788 if let Some(h) = join_handle {
4789 h.abort()
4790 }
4791 }
4792 }
4793
4794 #[cfg(all(feature = "chrome", not(feature = "decentralized")))]
4795 pub async fn fetch_chrome_persisted(
4797 &self,
4798 url: Option<&str>,
4799 browser: &crate::features::chrome::BrowserController,
4800 ) {
4801 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4802 let (client, handle) = (
4803 match &self.client {
4804 Some(c) => c.to_owned(),
4805 _ => self.configure_http_client(),
4806 },
4807 self.configure_handler(),
4808 );
4809 let (_handle, join_handle) = match handle {
4810 Some(h) => (Some(h.0), Some(h.1)),
4811 _ => (None, None),
4812 };
4813 self._fetch_chrome_persisted(&client, &url, browser).await;
4814 if let Some(h) = join_handle {
4815 h.abort()
4816 }
4817 }
4818 }
4819
    /// Smart crawl in decentralized mode defers to the standard crawl; any
    /// smart escalation happens on the worker side.
    #[cfg(all(feature = "decentralized", feature = "smart"))]
    pub async fn crawl_smart(&mut self) {
        self.crawl().await;
    }
4825
    /// Without the smart feature, decentralized smart crawling is simply the
    /// standard crawl.
    #[cfg(all(feature = "decentralized", not(feature = "smart")))]
    pub async fn crawl_smart(&mut self) {
        self.crawl().await;
    }
4831
4832 #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
4833 pub async fn crawl_smart(&mut self) {
4835 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4836 self.start();
4837 if self.try_cache_shortcircuit().await {
4838 self.set_crawl_status();
4839 return;
4840 }
4841 let (client, handle) = self.setup().await;
4842 let (handle, join_handle) = match handle {
4843 Some(h) => (Some(h.0), Some(h.1)),
4844 _ => (None, None),
4845 };
4846 self.crawl_concurrent_smart(&client, &handle).await;
4847 self.set_crawl_status();
4848 if let Some(h) = join_handle {
4849 h.abort()
4850 }
4851 self.client.replace(client);
4852 }
4853 }
4854
    /// Without the smart feature compiled in, fall back to the plain crawl.
    #[cfg(all(not(feature = "decentralized"), not(feature = "smart")))]
    pub async fn crawl_smart(&mut self) {
        self.crawl().await
    }
4860
4861 pub async fn crawl_raw(&mut self) {
4863 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4864 self.start();
4865 if self.try_cache_shortcircuit().await {
4866 self.set_crawl_status();
4867 return;
4868 }
4869 let (client, handle) = self.setup().await;
4870 let (handle, join_handle) = match handle {
4871 Some(h) => (Some(h.0), Some(h.1)),
4872 _ => (None, None),
4873 };
4874 self.crawl_concurrent_raw(&client, &handle).await;
4875 self.sitemap_crawl_chain(&client, &handle, false).await;
4876 self.set_crawl_status();
4877 if let Some(h) = join_handle {
4878 h.abort()
4879 }
4880 self.client.replace(client);
4881 }
4882 }
4883
4884 pub async fn scrape(&mut self) {
4886 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4887 let mut w = self.clone();
4888 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4889
4890 if self.pages.is_none() {
4891 self.pages = Some(Vec::new());
4892 }
4893
4894 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4896
4897 let crawl = async move {
4898 w.crawl().await;
4899 w.unsubscribe();
4900 let _ = done_tx.send(());
4902 };
4903
4904 let sub = async {
4905 loop {
4906 tokio::select! {
4907 biased;
4908 _ = &mut done_rx => {
4910 break;
4911 }
4912 result = rx2.recv() => {
4913 if let Ok(page) = result {
4914 if let Some(sid) = page.signature {
4915 self.insert_signature(sid).await;
4916 }
4917 self.insert_link(page.get_url().into()).await;
4918 if let Some(p) = self.pages.as_mut() {
4919 p.push(page);
4920 }
4921 } else {
4922 break;
4923 }
4924 }
4925 }
4926 }
4927 };
4928
4929 tokio::join!(sub, crawl);
4930 self.unsubscribe();
4932 }
4933 }
4934
4935 pub async fn scrape_raw(&mut self) {
4937 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4938 let mut w = self.clone();
4939 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4940
4941 if self.pages.is_none() {
4942 self.pages = Some(Vec::new());
4943 }
4944
4945 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4946
4947 let crawl = async move {
4948 w.crawl_raw().await;
4949 w.unsubscribe();
4950 let _ = done_tx.send(());
4951 };
4952
4953 let sub = async {
4954 loop {
4955 tokio::select! {
4956 biased;
4957 _ = &mut done_rx => break,
4958 result = rx2.recv() => {
4959 if let Ok(page) = result {
4960 if let Some(sid) = page.signature {
4961 self.insert_signature(sid).await;
4962 }
4963 self.insert_link(page.get_url().into()).await;
4964 if let Some(p) = self.pages.as_mut() {
4965 p.push(page);
4966 }
4967 } else {
4968 break;
4969 }
4970 }
4971 }
4972 }
4973 };
4974
4975 tokio::join!(sub, crawl);
4976 self.unsubscribe();
4977 }
4978 }
4979
4980 pub async fn scrape_smart(&mut self) {
4982 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
4983 let mut w = self.clone();
4984 let mut rx2 = w.subscribe(0).expect("receiver enabled");
4985
4986 if self.pages.is_none() {
4987 self.pages = Some(Vec::new());
4988 }
4989
4990 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
4991
4992 let crawl = async move {
4993 w.crawl_smart().await;
4994 w.unsubscribe();
4995 let _ = done_tx.send(());
4996 };
4997
4998 let sub = async {
4999 loop {
5000 tokio::select! {
5001 biased;
5002 _ = &mut done_rx => break,
5003 result = rx2.recv() => {
5004 if let Ok(page) = result {
5005 if let Some(sid) = page.signature {
5006 self.insert_signature(sid).await;
5007 }
5008 self.insert_link(page.get_url().into()).await;
5009 if let Some(p) = self.pages.as_mut() {
5010 p.push(page);
5011 }
5012 } else {
5013 break;
5014 }
5015 }
5016 }
5017 }
5018 };
5019
5020 tokio::join!(sub, crawl);
5021 self.unsubscribe();
5022 }
5023 }
5024
5025 pub async fn scrape_sitemap(&mut self) {
5027 if !self.status.eq(&CrawlStatus::FirewallBlocked) {
5028 let mut w = self.clone();
5029 let mut rx2 = w.subscribe(0).expect("receiver enabled");
5030
5031 if self.pages.is_none() {
5032 self.pages = Some(Vec::new());
5033 }
5034
5035 let (done_tx, mut done_rx) = tokio::sync::oneshot::channel::<()>();
5036
5037 let crawl = async move {
5038 w.crawl_sitemap().await;
5039 w.unsubscribe();
5040 let _ = done_tx.send(());
5041 };
5042
5043 let sub = async {
5044 loop {
5045 tokio::select! {
5046 biased;
5047 _ = &mut done_rx => break,
5048 result = rx2.recv() => {
5049 if let Ok(page) = result {
5050 if let Some(sid) = page.signature {
5051 self.insert_signature(sid).await;
5052 }
5053 self.insert_link(page.get_url().into()).await;
5054 if let Some(p) = self.pages.as_mut() {
5055 p.push(page);
5056 }
5057 } else {
5058 break;
5059 }
5060 }
5061 }
5062 }
5063 };
5064
5065 tokio::join!(sub, crawl);
5066 self.unsubscribe();
5067 }
5068 }
5069
5070 async fn dequeue(
5072 &mut self,
5073 q: &mut Option<tokio::sync::broadcast::Receiver<String>>,
5074 links: &mut HashSet<CaseInsensitiveString>,
5075 exceeded_budget: &mut bool,
5076 ) {
5077 #[cfg(all(feature = "agent", feature = "serde"))]
5079 if let Some(ref cfgs) = self.configuration.remote_multimodal {
5080 let credits = cfgs
5081 .relevance_credits
5082 .swap(0, std::sync::atomic::Ordering::Relaxed);
5083 for _ in 0..credits {
5084 self.restore_wildcard_budget();
5085 }
5086 }
5087
5088 if let Some(q) = q {
5089 while let Ok(link) = q.try_recv() {
5090 let s = link.into();
5091 let allowed = self.is_allowed_budgetless(&s);
5092
5093 if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
5094 *exceeded_budget = true;
5095 break;
5096 }
5097
5098 if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&s).await {
5099 continue;
5100 }
5101
5102 self.links_visited.extend_with_new_links(links, s);
5103 }
5104 }
5105 }
5106
5107 #[cfg(all(feature = "agent", feature = "serde"))]
5109 async fn apply_url_prefilter(&self, links: &mut HashSet<CaseInsensitiveString>) {
5110 if let Some(ref cfgs) = self.configuration.remote_multimodal {
5111 if cfgs.cfg.url_prefilter && cfgs.cfg.relevance_gate && !links.is_empty() {
5112 *links = crate::features::automation::prefilter_urls(cfgs, links).await;
5113 }
5114 }
5115 }
5116
    /// Concurrent raw (HTTP-only) crawl loop.
    ///
    /// Establishes the first page, then repeatedly drains discovered links into
    /// a [`JoinSet`] of fetch tasks bounded by the shared semaphore, honoring
    /// throttling, budgets, the optional crawl timeout, and the shared queue.
    /// Links left over when the loop exits early are stashed in `extra_links`.
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent_raw(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        self.start();

        // The cache phase may fully satisfy the crawl without network work.
        if self.crawl_cache_phase(client).await {
            return;
        }
        // Extra links carried over from a prior run mean the initial page was
        // already handled.
        if !self.extra_links.is_empty() {
            self.skip_initial = true;
        }

        self.status = CrawlStatus::Active;
        let client_rotator = self.client_rotator.clone();
        #[cfg(feature = "hedge")]
        let hedge_config = self.configuration.hedge.clone();
        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();
        if self.single_page() {
            self._crawl_establish(client, &mut selector, false).await;
        } else {
            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
            let full_resources = self.configuration.full_resources;
            let return_page_links = self.configuration.return_page_links;
            let only_html = self.configuration.only_html && !full_resources;
            #[cfg(any(
                feature = "cache",
                feature = "cache_mem",
                feature = "chrome_remote_cache"
            ))]
            let cache_options_raw = self.configuration.get_cache_options();
            #[cfg(any(
                feature = "cache",
                feature = "cache_mem",
                feature = "chrome_remote_cache"
            ))]
            let cache_policy_raw = self.configuration.cache_policy.clone();
            #[cfg(any(
                feature = "cache",
                feature = "cache_mem",
                feature = "chrome_remote_cache"
            ))]
            let normalize_raw = self.configuration.normalize;
            // Subscribe to the shared link queue, if one is configured.
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

            let (mut interval, throttle) = self.setup_crawl();

            let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

            links.extend(self._crawl_establish(client, &mut selector, false).await);

            self.configuration.configure_allowlist();

            let semaphore = self.setup_semaphore();

            // Shared state moved into every fetch task:
            // 0 client, 1 selectors, 2 channel, 3 external domains,
            // 4 channel guard, 5 retry count, 6 full_resources,
            // 7 link-build settings, 8 parsed domain, 9 on_link_find callback,
            // 10 remote multimodal config.
            let shared = Arc::new((
                client.to_owned(),
                selector,
                self.channel.clone(),
                self.configuration.external_domains_caseless.clone(),
                self.channel_guard.clone(),
                self.configuration.retry,
                self.configuration.full_resources,
                PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                self.domain_parsed.clone(),
                self.on_link_find_callback.clone(),
                self.configuration.remote_multimodal.clone(),
            ));

            // Each task returns the links it found plus an optional signature.
            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

            let mut exceeded_budget = false;
            let concurrency = throttle.is_zero();

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if !concurrency && !links.is_empty() {
                tokio::time::sleep(*throttle).await;
            }

            // Only track elapsed time when a crawl timeout is configured.
            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                Some(Instant::now())
            } else {
                None
            };

            'outer: loop {
                #[cfg(all(feature = "agent", feature = "serde"))]
                self.apply_url_prefilter(&mut links).await;

                let mut stream =
                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

                loop {
                    if !concurrency {
                        tokio::time::sleep(*throttle).await;
                    }

                    let semaphore =
                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                    tokio::select! {
                        biased;
                        // Pull the next link only while permits remain and the
                        // crawl timeout has not expired.
                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            // handle_process returns false on shutdown; the
                            // closure tears down in-flight tasks first.
                            if !self.handle_process(handle, &mut interval, async {
                                emit_log_shutdown(link.inner());
                                let permits = set.len();
                                set.shutdown().await;
                                semaphore.add_permits(permits);
                            }).await {
                                // Preserve unvisited links for a later resume.
                                while let Some(links) = stream.next().await {
                                    self.extra_links.insert(links);
                                }
                                break 'outer;
                            }
                            let allowed = self.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                exceeded_budget = true;
                                break;
                            }

                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                continue;
                            }

                            emit_log(link.inner());

                            self.insert_link(link.clone()).await;

                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                let shared = shared.clone();
                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                let rotator = client_rotator.clone();
                                #[cfg(feature = "hedge")]
                                let hedge_cfg = hedge_config.clone();
                                #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                let cache_opts = cache_options_raw.clone();
                                #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                let cache_pol = cache_policy_raw.clone();
                                #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                let normalize = normalize_raw;
                                spawn_set("page_fetch", &mut set, async move {
                                    // Allow the link-find callback to rewrite the link.
                                    let link_result = match &shared.9 {
                                        Some(cb) => cb(link, None),
                                        _ => (link, None),
                                    };

                                    let target_url = link_result.0.as_ref();

                                    // Cache fast-path: serve and parse a cached
                                    // HTML body without any network request.
                                    #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                    {
                                        use crate::utils::{cache_skip_browser, get_cached_url, build_cached_html_page_response};
                                        if cache_skip_browser(&cache_opts) {
                                            if let Some(html) = get_cached_url(target_url, cache_opts.as_ref(), &cache_pol).await {
                                                let page_response = build_cached_html_page_response(target_url, &html);
                                                let mut page = build(target_url, page_response);

                                                if !shared.3.is_empty() {
                                                    page.set_external(shared.3.clone());
                                                }
                                                page.set_url_parsed_direct();
                                                if return_page_links {
                                                    page.page_links = Some(Default::default());
                                                }
                                                let page_base = page.base.take().map(Box::new);
                                                let links = if full_resources {
                                                    page.links_full(&shared.1, &page_base).await
                                                } else {
                                                    page.links(&shared.1, &page_base).await
                                                };
                                                page.base = None;
                                                if normalize {
                                                    page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                                }
                                                if let Some(ref cb) = on_should_crawl_callback {
                                                    if !cb.call(&page) {
                                                        page.blocked_crawl = true;
                                                        channel_send_page(&shared.2, page, &shared.4);
                                                        drop(permit);
                                                        return Default::default();
                                                    }
                                                }
                                                let signature = page.signature;
                                                channel_send_page(&shared.2, page, &shared.4);
                                                drop(permit);
                                                return (links, signature);
                                            }
                                        }
                                    }

                                    let external_domains_caseless = &shared.3;

                                    // Hedged fetch: race a delayed duplicate
                                    // request on a second client when enabled
                                    // and more than one client is available.
                                    #[cfg(feature = "hedge")]
                                    let (mut page, mut links, mut links_pages) = {
                                        let should_hedge = if let Some(ref hcfg) = hedge_cfg {
                                            hcfg.enabled && rotator.as_ref().map_or(false, |r| r.len() > 1)
                                        } else {
                                            false
                                        };

                                        if should_hedge {
                                            let hcfg = hedge_cfg.as_ref().unwrap();
                                            let rot = rotator.as_ref().unwrap();
                                            let (primary_client, hedge_client_opt) = rot.next_pair();

                                            if let Some(hedge_client) = hedge_client_opt {
                                                let delay = hcfg.delay;

                                                let primary_fut = async {
                                                    let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                    let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                    let mut selectors = shared.1.clone();
                                                    let mut r_settings = shared.7;
                                                    r_settings.ssg_build = true;
                                                    let mut domain_parsed = None;
                                                    let page = Page::new_page_streaming(
                                                        target_url, primary_client, only_html,
                                                        &mut selectors, external_domains_caseless,
                                                        &r_settings, &mut links, None, &shared.8,
                                                        &mut domain_parsed, &mut links_pages).await;
                                                    (page, links, links_pages)
                                                };

                                                tokio::pin!(primary_fut);

                                                tokio::select! {
                                                    biased;
                                                    result = &mut primary_fut => result,
                                                    // Primary is slow: fire the hedge and
                                                    // take whichever request finishes first.
                                                    _ = tokio::time::sleep(delay) => {
                                                        log::info!("[hedge] fired after {}ms url={}", delay.as_millis(), target_url);

                                                        let hedge_fut = async {
                                                            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                            let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                            let mut selectors = shared.1.clone();
                                                            let mut r_settings = shared.7;
                                                            r_settings.ssg_build = true;
                                                            let mut domain_parsed = None;
                                                            let page = Page::new_page_streaming(
                                                                target_url, hedge_client, only_html,
                                                                &mut selectors, external_domains_caseless,
                                                                &r_settings, &mut links, None, &shared.8,
                                                                &mut domain_parsed, &mut links_pages).await;
                                                            (page, links, links_pages)
                                                        };

                                                        tokio::pin!(hedge_fut);

                                                        tokio::select! {
                                                            biased;
                                                            result = &mut primary_fut => {
                                                                log::info!("[hedge] winner: primary url={}", target_url);
                                                                result
                                                            }
                                                            result = &mut hedge_fut => {
                                                                log::info!("[hedge] winner: hedge url={}", target_url);
                                                                result
                                                            }
                                                        }
                                                    }
                                                }
                                            } else {
                                                let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                let mut selectors = shared.1.clone();
                                                let mut r_settings = shared.7;
                                                r_settings.ssg_build = true;
                                                let mut domain_parsed = None;
                                                let page = Page::new_page_streaming(
                                                    target_url, primary_client, only_html,
                                                    &mut selectors, external_domains_caseless,
                                                    &r_settings, &mut links, None, &shared.8,
                                                    &mut domain_parsed, &mut links_pages).await;
                                                (page, links, links_pages)
                                            }
                                        } else {
                                            let client = match &rotator {
                                                Some(r) => r.next(),
                                                None => &shared.0,
                                            };
                                            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                            let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                            let mut selectors = shared.1.clone();
                                            let mut r_settings = shared.7;
                                            r_settings.ssg_build = true;
                                            let mut domain_parsed = None;
                                            let page = Page::new_page_streaming(
                                                target_url, client, only_html,
                                                &mut selectors, external_domains_caseless,
                                                &r_settings, &mut links, None, &shared.8,
                                                &mut domain_parsed, &mut links_pages).await;
                                            (page, links, links_pages)
                                        }
                                    };

                                    // Plain single-client fetch when hedging is compiled out.
                                    #[cfg(not(feature = "hedge"))]
                                    let (mut page, mut links, mut links_pages) = {
                                        let client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };
                                        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                        let mut links_pages = if return_page_links {
                                            Some(links.clone())
                                        } else {
                                            None
                                        };
                                        let mut relative_selectors = shared.1.clone();
                                        let mut r_settings = shared.7;
                                        r_settings.ssg_build = true;
                                        let mut domain_parsed = None;
                                        let page = Page::new_page_streaming(
                                            target_url,
                                            client, only_html,
                                            &mut relative_selectors,
                                            external_domains_caseless,
                                            &r_settings,
                                            &mut links,
                                            None,
                                            &shared.8,
                                            &mut domain_parsed,
                                            &mut links_pages).await;
                                        (page, links, links_pages)
                                    };

                                    let mut retry_count = shared.5;

                                    // Retry loop; gateway timeouts get a bounded backoff window.
                                    while page.should_retry && retry_count > 0 {
                                        retry_count -= 1;

                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }

                                        let retry_client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };

                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                let mut domain_parsed = None;
                                                let mut retry_r_settings = shared.7;
                                                retry_r_settings.ssg_build = true;
                                                let next_page = Page::new_page_streaming(
                                                    target_url,
                                                    retry_client, only_html,
                                                    &mut shared.1.clone(),
                                                    external_domains_caseless,
                                                    &retry_r_settings,
                                                    &mut links,
                                                    None,
                                                    &shared.8,
                                                    &mut domain_parsed,
                                                    &mut links_pages).await;

                                                page.clone_from(&next_page);
                                            }).await
                                            {
                                                log::warn!("Handler timeout exceeded {elasped}");
                                            }
                                        } else {
                                            let mut domain_parsed = None;
                                            let mut retry_r_settings = shared.7;
                                            retry_r_settings.ssg_build = true;
                                            page.clone_from(&Page::new_page_streaming(
                                                target_url,
                                                retry_client,
                                                only_html,
                                                &mut shared.1.clone(),
                                                external_domains_caseless,
                                                &retry_r_settings,
                                                &mut links,
                                                None,
                                                &shared.8,
                                                &mut domain_parsed,
                                                &mut links_pages).await);
                                        }
                                    }

                                    if return_page_links {
                                        page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
                                    }

                                    // Optional remote multimodal extraction over the fetched HTML.
                                    #[cfg(all(feature = "agent", feature = "serde"))]
                                    if shared.10.is_some() {
                                        let html = page.get_html();
                                        if !html.is_empty() {
                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
                                                &shared.10,
                                                &html,
                                                target_url,
                                                title,
                                            ).await {
                                                match page.remote_multimodal_usage.as_mut() {
                                                    Some(v) => v.push(result.usage.clone()),
                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
                                                }
                                                if result.extracted.is_some() || result.screenshot.is_some() {
                                                    let automation_result = result.to_automation_results();
                                                    match page.extra_remote_multimodal_data.as_mut() {
                                                        Some(v) => v.push(automation_result),
                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    if let Some(ref cb) = on_should_crawl_callback {
                                        if !cb.call(&page) {
                                            page.blocked_crawl = true;
                                            channel_send_page(&shared.2, page, &shared.4);
                                            drop(permit);
                                            return Default::default()
                                        }
                                    }

                                    let signature = page.signature;

                                    channel_send_page(&shared.2, page, &shared.4);

                                    drop(permit);

                                    (links, signature)
                                });
                            }

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        },
                        // Fold finished task results back into the link frontier,
                        // deduplicating by page signature when available.
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res.1 {
                                    Some(signature) => {
                                        if self.is_signature_allowed(signature).await {
                                            self.insert_signature(signature).await;
                                            self.links_visited.extend_links(&mut links, res.0);
                                        }
                                    }
                                    _ => {
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                            } else {
                                break;
                            }
                        }
                        else => break,
                    }

                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                    if links.is_empty() && set.is_empty() || exceeded_budget {
                        if exceeded_budget {
                            // Preserve all remaining work for a later resume.
                            while let Some(links) = stream.next().await {
                                self.extra_links.insert(links);
                            }
                            while let Some(links) = set.join_next().await {
                                if let Ok(links) = links {
                                    self.extra_links.extend(links.0);
                                }
                            }
                        }
                        break 'outer;
                    }
                }

                self.subscription_guard().await;
                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                if links.is_empty() && set.is_empty() {
                    break;
                }
            }

            if !links.is_empty() {
                self.extra_links.extend(links);
            }
        }
    }
5620
    /// Concurrent crawl using a chrome browser for rendering.
    ///
    /// Boots a browser, establishes the first page through it, then drives the
    /// same frontier/JoinSet/semaphore loop as the raw crawler with each fetch
    /// running through a fresh browser page (plus optional request interception
    /// and cache fast-path). The browser is always disposed on exit.
    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        use crate::features::chrome::attempt_navigation;
        self.start();

        // The cache phase may fully satisfy the crawl without network work.
        if self.crawl_cache_phase(client).await {
            return;
        }
        // Extra links carried over from a prior run mean the initial page was
        // already handled.
        if !self.extra_links.is_empty() {
            self.skip_initial = true;
        }

        match self.setup_browser().await {
            Some(mut b) => {
                // Probe the browser with a blank navigation before crawling.
                match attempt_navigation(
                    "about:blank",
                    &b.browser.0,
                    &self.configuration.request_timeout,
                    &b.browser.2,
                    &self.configuration.viewport,
                )
                .await
                {
                    Ok(new_page) => {
                        let mut selectors = self.setup_selectors();
                        self.status = CrawlStatus::Active;

                        if self.single_page() {
                            self.crawl_establish(client, &mut selectors, false, &new_page)
                                .await;
                            drop(new_page);
                            self.subscription_guard().await;
                            b.dispose();
                        } else {
                            let semaphore: Arc<Semaphore> = self.setup_semaphore();
                            let (mut interval, throttle) = self.setup_crawl();

                            // Subscribe to the shared link queue, if configured.
                            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

                            let base_links = self
                                .crawl_establish(client, &mut selectors, false, &new_page)
                                .await;

                            drop(new_page);

                            let mut links: HashSet<CaseInsensitiveString> =
                                self.drain_extra_links().collect();

                            links.extend(base_links);

                            self.configuration.configure_allowlist();

                            // Each task returns discovered links plus an optional signature.
                            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
                                JoinSet::new();

                            // Shared state moved into every fetch task:
                            // 0 client, 1 selectors, 2 channel, 3 external domains,
                            // 4 channel guard, 5 browser handle, 6 configuration,
                            // 7 root url string, 8 browser context, 9 parsed domain,
                            // 10 on_link_find callback.
                            let shared = Arc::new((
                                client.to_owned(),
                                selectors,
                                self.channel.clone(),
                                self.configuration.external_domains_caseless.clone(),
                                self.channel_guard.clone(),
                                b.browser.0.clone(),
                                self.configuration.clone(),
                                self.url.inner().to_string(),
                                b.browser.2.clone(),
                                self.domain_parsed.clone(),
                                self.on_link_find_callback.clone(),
                            ));

                            let add_external = !shared.3.is_empty();
                            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
                            let full_resources = self.configuration.full_resources;
                            let return_page_links = self.configuration.return_page_links;
                            let mut exceeded_budget = false;
                            let concurrency = throttle.is_zero();

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                            if !concurrency && !links.is_empty() {
                                tokio::time::sleep(*throttle).await;
                            }

                            // Only track elapsed time when a crawl timeout is configured.
                            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                                Some(Instant::now())
                            } else {
                                None
                            };

                            'outer: loop {
                                #[cfg(all(feature = "agent", feature = "serde"))]
                                self.apply_url_prefilter(&mut links).await;

                                let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
                                    links.drain().collect(),
                                );

                                loop {
                                    if !concurrency {
                                        tokio::time::sleep(*throttle).await;
                                    }

                                    let semaphore =
                                        get_semaphore(&semaphore, !self.configuration.shared_queue)
                                            .await;

                                    tokio::select! {
                                        biased;
                                        // Pull the next link while permits remain and the
                                        // crawl timeout has not expired.
                                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                                            // handle_process returns false on shutdown; the
                                            // closure tears down in-flight tasks first.
                                            if !self
                                                .handle_process(
                                                    handle,
                                                    &mut interval,
                                                    async {
                                                        emit_log_shutdown(link.inner());
                                                        let permits = set.len();
                                                        set.shutdown().await;
                                                        semaphore.add_permits(permits);
                                                    },
                                                )
                                                .await
                                            {
                                                break 'outer;
                                            }

                                            let allowed = self.is_allowed(&link);

                                            if allowed
                                                .eq(&ProcessLinkStatus::BudgetExceeded)
                                            {
                                                exceeded_budget = true;
                                                break;
                                            }
                                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                                continue;
                                            }

                                            emit_log(link.inner());

                                            self.insert_link(link.clone()).await;

                                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                                let shared = shared.clone();
                                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                                spawn_set("page_fetch", &mut set, async move {
                                                    // Allow the link-find callback to rewrite the link.
                                                    let link_result =
                                                        match &shared.10 {
                                                            Some(cb) => cb(link, None),
                                                            _ => (link, None),
                                                        };

                                                    let target_url_string = link_result.0.as_ref().to_string();

                                                    // Cache fast-path: serve and parse a cached
                                                    // HTML body without launching a browser page.
                                                    #[cfg(any(feature = "cache", feature = "cache_mem", feature = "chrome_remote_cache"))]
                                                    {
                                                        use crate::utils::{cache_skip_browser, get_cached_url, build_cached_html_page_response};
                                                        let cache_options = shared.6.get_cache_options();
                                                        if cache_skip_browser(&cache_options) {
                                                            if let Some(html) = get_cached_url(&target_url_string, cache_options.as_ref(), &shared.6.cache_policy).await {
                                                                let page_response = build_cached_html_page_response(&target_url_string, &html);
                                                                let mut page = build(&target_url_string, page_response);

                                                                if add_external {
                                                                    page.set_external(shared.3.clone());
                                                                }
                                                                page.set_url_parsed_direct();
                                                                let page_base = page.base.take().map(Box::new);
                                                                if return_page_links {
                                                                    page.page_links = Some(Default::default());
                                                                }
                                                                let links = if full_resources {
                                                                    page.links_full(&shared.1, &page_base).await
                                                                } else {
                                                                    page.links(&shared.1, &page_base).await
                                                                };
                                                                page.base = None;
                                                                if shared.6.normalize {
                                                                    page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                                                }
                                                                if let Some(ref cb) = on_should_crawl_callback {
                                                                    if !cb.call(&page) {
                                                                        page.blocked_crawl = true;
                                                                        channel_send_page(&shared.2, page, &shared.4);
                                                                        drop(permit);
                                                                        return Default::default();
                                                                    }
                                                                }
                                                                let signature = page.signature;
                                                                channel_send_page(&shared.2, page, &shared.4);
                                                                drop(permit);
                                                                return (links, signature);
                                                            }
                                                        }
                                                    }

                                                    // Open a fresh browser page per fetch.
                                                    let results = match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8, &shared.6.viewport).await {
                                                        Ok(new_page) => {
                                                            let (_, intercept_handle) = tokio::join!(
                                                                crate::features::chrome::setup_chrome_events(&new_page, &shared.6),
                                                                crate::features::chrome::setup_chrome_interception_base(
                                                                    &new_page,
                                                                    shared.6.chrome_intercept.enabled,
                                                                    &shared.6.auth_challenge_response,
                                                                    shared.6.chrome_intercept.block_visuals,
                                                                    &shared.7,
                                                                )
                                                            );

                                                            let target_url = target_url_string.as_str();

                                                            let mut page = Page::new(
                                                                target_url,
                                                                &shared.0,
                                                                &new_page,
                                                                &shared.6.wait_for,
                                                                &shared.6.screenshot,
                                                                false,
                                                                &shared.6.openai_config,
                                                                &shared.6.execution_scripts,
                                                                &shared.6.automation_scripts,
                                                                &shared.6.viewport,
                                                                &shared.6.request_timeout,
                                                                &shared.6.track_events,
                                                                shared.6.referer.clone(),
                                                                shared.6.max_page_bytes,
                                                                shared.6.get_cache_options(),
                                                                &shared.6.cache_policy,
                                                                &shared.6.remote_multimodal,
                                                            )
                                                            .await;

                                                            let mut retry_count = shared.6.retry;

                                                            // Retry loop; gateway timeouts get a
                                                            // bounded backoff window.
                                                            while page.should_retry && retry_count > 0 {
                                                                retry_count -= 1;
                                                                if let Some(timeout) = page.get_timeout() {
                                                                    tokio::time::sleep(timeout).await;
                                                                }
                                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                                                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                                        let p = Page::new(
                                                                            target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        ).await;
                                                                        page.clone_from(&p);
                                                                    }).await {
                                                                        log::info!("{target_url} backoff gateway timeout exceeded {elasped}");
                                                                    }
                                                                } else {
                                                                    page.clone_from(
                                                                        &Page::new(
                                                                            target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        )
                                                                        .await,
                                                                    );
                                                                }
                                                            }

                                                            // Let the interception task drain, aborting
                                                            // it if it exceeds the grace period.
                                                            if let Some(h) = intercept_handle {
                                                                let abort_handle = h.abort_handle();
                                                                if let Err(elasped) = tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await {
                                                                    log::warn!("Handler timeout exceeded {elasped}");
                                                                    abort_handle.abort();
                                                                }
                                                            }

                                                            if add_external {
                                                                page.set_external(shared.3.clone());
                                                            }

                                                            // Preserve the prior base so it can be
                                                            // restored after link extraction.
                                                            let prev_domain = page.base.take();

                                                            page.set_url_parsed_direct();
                                                            let page_base = page.base.take().map(Box::new);

                                                            if return_page_links {
                                                                page.page_links = Some(Default::default());
                                                            }

                                                            let links = if full_resources {
                                                                page.links_full(&shared.1, &page_base).await
                                                            } else {
                                                                page.links(&shared.1, &page_base).await
                                                            };

                                                            page.base = prev_domain;

                                                            if shared.6.normalize {
                                                                page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                                            }

                                                            if let Some(ref cb) = on_should_crawl_callback {
                                                                if !cb.call(&page) {
                                                                    page.blocked_crawl = true;
                                                                    channel_send_page(&shared.2, page, &shared.4);
                                                                    drop(permit);
                                                                    return Default::default()
                                                                }
                                                            }

                                                            let signature = page.signature;

                                                            channel_send_page(
                                                                &shared.2, page, &shared.4,
                                                            );

                                                            (links, signature)
                                                        }
                                                        _ => Default::default(),
                                                    };

                                                    drop(permit);

                                                    results
                                                });
                                            }

                                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                        }
                                        // Fold finished task results back into the frontier,
                                        // deduplicating by page signature when available.
                                        Some(result) = set.join_next(), if !set.is_empty() => {
                                            if let Ok(res) = result {
                                                match res.1 {
                                                    Some(signature) => {
                                                        if self.is_signature_allowed(signature).await {
                                                            self.insert_signature(signature).await;
                                                            self.links_visited.extend_links(&mut links, res.0);
                                                        }
                                                    }
                                                    _ => {
                                                        self.links_visited.extend_links(&mut links, res.0);
                                                    }
                                                }
                                            } else{
                                                break
                                            }
                                        }
                                        else => break,
                                    };

                                    if links.is_empty() && set.is_empty() || exceeded_budget {
                                        if exceeded_budget {
                                            // Drain remaining tasks before exiting.
                                            while set.join_next().await.is_some() {}
                                        }
                                        break 'outer;
                                    }
                                }

                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                                if links.is_empty() && set.is_empty() {
                                    break;
                                }
                            }

                            self.subscription_guard().await;
                            b.dispose();
                            if !links.is_empty() {
                                self.extra_links.extend(links);
                            }
                        }
                    }
                    Err(err) => {
                        b.dispose();
                        log::error!("{}", err)
                    }
                }
            }
            _ => log::error!("Chrome initialization failed."),
        }
    }
6032
    /// Concurrent raw (HTTP-only) crawl that operates on a clone of `self` and
    /// returns the crawled `Website`, leaving `self` unmodified.
    ///
    /// * `client` - HTTP client for the establishing request; per-link fetches may
    ///   instead use a client from `client_rotator` when one is configured.
    /// * `handle` - optional shared crawl-state flag polled through `handle_process`
    ///   so the crawl can be paused or shut down externally.
    /// * `url` - optional start-URL override. When the parsed domain already starts
    ///   with `u` only the URL field is swapped (`set_url_only`); otherwise the full
    ///   target is replaced (`set_url`).
    #[cfg(not(feature = "decentralized"))]
    #[cfg_attr(
        all(feature = "tracing", not(feature = "decentralized")),
        tracing::instrument(skip_all)
    )]
    async fn crawl_concurrent_raw_send(
        &self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        url: &Option<&str>,
    ) -> Website {
        // (base selector, subdomain/tld selectors, host) used for link extraction.
        let mut selector: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        // All mutation happens on a clone; the clone is the return value.
        let mut website = self.clone();

        if let Some(u) = url {
            match &website.domain_parsed {
                Some(domain_url) => {
                    if domain_url.as_str().starts_with(u) {
                        website.set_url_only(u);
                    } else {
                        website.set_url(u);
                    }
                }
                _ => {
                    website.set_url(u);
                }
            }
        }

        if !website.send_configured {
            website.configure_setup().await;
        }

        if self.single_page() {
            // Single-page mode: establish once and return immediately.
            website._crawl_establish(client, &mut selector, false).await;
            website
        } else {
            let client_rotator = self.client_rotator.clone();
            #[cfg(feature = "hedge")]
            let hedge_config = self.configuration.hedge.clone();
            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
            let full_resources = self.configuration.full_resources;
            let return_page_links = self.configuration.return_page_links;
            // full_resources implies non-HTML assets are wanted, so only_html is off.
            let only_html = self.configuration.only_html && !full_resources;
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

            let (mut interval, throttle) = self.setup_crawl();

            // Seed the frontier with queued extra links plus the establish page's links.
            let mut links: HashSet<CaseInsensitiveString> = website.drain_extra_links().collect();

            links.extend(website._crawl_establish(client, &mut selector, false).await);

            let semaphore = self.setup_semaphore();

            // Read-only state shared with every spawned fetch task:
            // .0 client, .1 selectors, .2 channel, .3 external domains, .4 channel guard,
            // .5 retry count, .6 full_resources, .7 PageLinkBuildSettings, .8 parsed domain,
            // .9 on_link_find_callback, .10 remote multimodal config.
            let shared = Arc::new((
                client.to_owned(),
                selector,
                self.channel.clone(),
                self.configuration.external_domains_caseless.clone(),
                self.channel_guard.clone(),
                self.configuration.retry,
                self.configuration.full_resources,
                PageLinkBuildSettings::new_full(
                    false,
                    self.configuration.full_resources,
                    self.configuration.subdomains,
                    self.configuration.tld,
                    self.configuration.normalize,
                ),
                self.domain_parsed.clone(),
                self.on_link_find_callback.clone(),
                self.configuration.remote_multimodal.clone(),
            ));

            // Each task resolves to (discovered links, optional page signature).
            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();

            let mut exceeded_budget = false;
            // A zero throttle means full concurrency with no inter-request delay.
            let concurrency = throttle.is_zero();

            website
                .dequeue(&mut q, &mut links, &mut exceeded_budget)
                .await;

            if !concurrency && !links.is_empty() {
                tokio::time::sleep(*throttle).await;
            }

            // Baseline instant for the optional crawl_timeout budget.
            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                Some(Instant::now())
            } else {
                None
            };

            'outer: loop {
                #[cfg(all(feature = "agent", feature = "serde"))]
                self.apply_url_prefilter(&mut links).await;

                // Drain the frontier into a stream for this pass.
                let mut stream =
                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

                loop {
                    if !concurrency {
                        tokio::time::sleep(*throttle).await;
                    }

                    let semaphore =
                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                    tokio::select! {
                        biased;
                        // Only pull a new link while permits remain and the crawl
                        // duration budget has not expired.
                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            // On shutdown: abort in-flight tasks and return their permits.
                            if !self.handle_process(handle, &mut interval, async {
                                emit_log_shutdown(link.inner());
                                let permits = set.len();
                                set.shutdown().await;
                                semaphore.add_permits(permits);
                            }).await {
                                break 'outer;
                            }
                            let allowed = website.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                exceeded_budget = true;
                                break;
                            }

                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                continue;
                            }

                            emit_log(link.inner());

                            // Mark visited before spawning to avoid duplicate fetches.
                            website.insert_link(link.clone()).await;

                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                let shared = shared.clone();
                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                let rotator = client_rotator.clone();
                                #[cfg(feature = "hedge")]
                                let hedge_cfg = hedge_config.clone();
                                spawn_set("page_fetch", &mut set, async move {
                                    // Allow the link-find callback to rewrite the target.
                                    let link_result = match &shared.9 {
                                        Some(cb) => cb(link, None),
                                        _ => (link, None),
                                    };

                                    let target_url = link_result.0.as_ref();
                                    let external_domains_caseless = &shared.3;

                                    // Hedged fetch: race a delayed duplicate request on a
                                    // second client when hedging is enabled and more than
                                    // one client is available; first completion wins.
                                    #[cfg(feature = "hedge")]
                                    let (mut page, mut links, mut links_pages) = {
                                        let should_hedge = if let Some(ref hcfg) = hedge_cfg {
                                            hcfg.enabled && rotator.as_ref().map_or(false, |r| r.len() > 1)
                                        } else {
                                            false
                                        };

                                        if should_hedge {
                                            let hcfg = hedge_cfg.as_ref().unwrap();
                                            let rot = rotator.as_ref().unwrap();
                                            let (primary_client, hedge_client_opt) = rot.next_pair();

                                            if let Some(hedge_client) = hedge_client_opt {
                                                let delay = hcfg.delay;

                                                let primary_fut = async {
                                                    let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                    let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                    let mut selectors = shared.1.clone();
                                                    let mut r_settings = shared.7;
                                                    r_settings.ssg_build = true;
                                                    let mut domain_parsed = None;
                                                    let page = Page::new_page_streaming(
                                                        target_url, primary_client, only_html,
                                                        &mut selectors, external_domains_caseless,
                                                        &r_settings, &mut links, None, &shared.8,
                                                        &mut domain_parsed, &mut links_pages).await;
                                                    (page, links, links_pages)
                                                };

                                                tokio::pin!(primary_fut);

                                                tokio::select! {
                                                    biased;
                                                    // Primary finished before the hedge delay fired.
                                                    result = &mut primary_fut => result,
                                                    _ = tokio::time::sleep(delay) => {
                                                        log::info!("[hedge] fired after {}ms url={}", delay.as_millis(), target_url);

                                                        let hedge_fut = async {
                                                            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                            let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                            let mut selectors = shared.1.clone();
                                                            let mut r_settings = shared.7;
                                                            r_settings.ssg_build = true;
                                                            let mut domain_parsed = None;
                                                            let page = Page::new_page_streaming(
                                                                target_url, hedge_client, only_html,
                                                                &mut selectors, external_domains_caseless,
                                                                &r_settings, &mut links, None, &shared.8,
                                                                &mut domain_parsed, &mut links_pages).await;
                                                            (page, links, links_pages)
                                                        };

                                                        tokio::pin!(hedge_fut);

                                                        // Race primary vs hedge; keep whichever completes first.
                                                        tokio::select! {
                                                            biased;
                                                            result = &mut primary_fut => {
                                                                log::info!("[hedge] winner: primary url={}", target_url);
                                                                result
                                                            }
                                                            result = &mut hedge_fut => {
                                                                log::info!("[hedge] winner: hedge url={}", target_url);
                                                                result
                                                            }
                                                        }
                                                    }
                                                }
                                            } else {
                                                // Only one client pair available: plain primary fetch.
                                                let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                                let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                                let mut selectors = shared.1.clone();
                                                let mut r_settings = shared.7;
                                                r_settings.ssg_build = true;
                                                let mut domain_parsed = None;
                                                let page = Page::new_page_streaming(
                                                    target_url, primary_client, only_html,
                                                    &mut selectors, external_domains_caseless,
                                                    &r_settings, &mut links, None, &shared.8,
                                                    &mut domain_parsed, &mut links_pages).await;
                                                (page, links, links_pages)
                                            }
                                        } else {
                                            // Hedging disabled: use the rotator (or base client) directly.
                                            let client = match &rotator {
                                                Some(r) => r.next(),
                                                None => &shared.0,
                                            };
                                            let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                            let mut links_pages = if return_page_links { Some(HashSet::new()) } else { None };
                                            let mut selectors = shared.1.clone();
                                            let mut r_settings = shared.7;
                                            r_settings.ssg_build = true;
                                            let mut domain_parsed = None;
                                            let page = Page::new_page_streaming(
                                                target_url, client, only_html,
                                                &mut selectors, external_domains_caseless,
                                                &r_settings, &mut links, None, &shared.8,
                                                &mut domain_parsed, &mut links_pages).await;
                                            (page, links, links_pages)
                                        }
                                    };

                                    // Non-hedge build: single streaming fetch.
                                    #[cfg(not(feature = "hedge"))]
                                    let (mut page, mut links, mut links_pages) = {
                                        let client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };
                                        let mut links: HashSet<CaseInsensitiveString> = HashSet::new();
                                        let mut links_pages = if return_page_links {
                                            Some(links.clone())
                                        } else {
                                            None
                                        };
                                        let mut relative_selectors = shared.1.clone();
                                        let mut r_settings = shared.7;
                                        r_settings.ssg_build = true;
                                        let mut domain_parsed = None;
                                        let page = Page::new_page_streaming(
                                            target_url,
                                            client, only_html,
                                            &mut relative_selectors,
                                            external_domains_caseless,
                                            &r_settings,
                                            &mut links,
                                            None,
                                            &shared.8,
                                            &mut domain_parsed,
                                            &mut links_pages).await;
                                        (page, links, links_pages)
                                    };

                                    // Retry loop: re-fetch while the page signals retry, up to
                                    // the configured retry count (shared.5).
                                    let mut retry_count = shared.5;

                                    while page.should_retry && retry_count > 0 {
                                        retry_count -= 1;

                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }

                                        let retry_client = match &rotator {
                                            Some(r) => r.next(),
                                            None => &shared.0,
                                        };

                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                            // 504s get a bounded backoff window so one slow
                                            // upstream cannot wedge the task indefinitely.
                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                let mut domain_parsed = None;
                                                let mut retry_r_settings = shared.7;
                                                retry_r_settings.ssg_build = true;
                                                let next_page = Page::new_page_streaming(
                                                    target_url,
                                                    retry_client, only_html,
                                                    &mut shared.1.clone(),
                                                    external_domains_caseless,
                                                    &retry_r_settings,
                                                    &mut links,
                                                    None,
                                                    &shared.8,
                                                    &mut domain_parsed,
                                                    &mut links_pages).await;

                                                page.clone_from(&next_page);

                                            }).await
                                            {
                                                log::warn!("Handler timeout exceeded {elasped}");
                                            }

                                        } else {
                                            let mut domain_parsed = None;
                                            let mut retry_r_settings = shared.7;
                                            retry_r_settings.ssg_build = true;
                                            page.clone_from(&Page::new_page_streaming(
                                                target_url,
                                                retry_client,
                                                only_html,
                                                &mut shared.1.clone(),
                                                external_domains_caseless,
                                                &retry_r_settings,
                                                &mut links,
                                                None,
                                                &shared.8,
                                                &mut domain_parsed,
                                                &mut links_pages).await);
                                        }
                                    }

                                    if return_page_links {
                                        // Only attach page links when something was collected.
                                        page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
                                    }

                                    // Optional remote multimodal extraction on the fetched HTML.
                                    #[cfg(all(feature = "agent", feature = "serde"))]
                                    if shared.10.is_some() {
                                        let html = page.get_html();
                                        if !html.is_empty() {
                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
                                                &shared.10,
                                                &html,
                                                target_url,
                                                title,
                                            ).await {
                                                match page.remote_multimodal_usage.as_mut() {
                                                    Some(v) => v.push(result.usage.clone()),
                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
                                                }
                                                if result.extracted.is_some() || result.screenshot.is_some() {
                                                    let automation_result = result.to_automation_results();
                                                    match page.extra_remote_multimodal_data.as_mut() {
                                                        Some(v) => v.push(automation_result),
                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    // Veto hook: the page is still sent to subscribers (marked
                                    // blocked), but no links from it enter the frontier.
                                    if let Some(ref cb) = on_should_crawl_callback {
                                        if !cb.call(&page) {
                                            page.blocked_crawl = true;
                                            channel_send_page(&shared.2, page, &shared.4);
                                            drop(permit);
                                            return Default::default()
                                        }
                                    }

                                    let signature = page.signature;

                                    channel_send_page(&shared.2, page, &shared.4);

                                    drop(permit);

                                    (links, signature)
                                });
                            }

                            website.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        },
                        // Merge finished task output back into the frontier, honoring
                        // the dedup signature when one was produced.
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res.1 {
                                    Some(signature) => {
                                        if website.is_signature_allowed(signature).await {
                                            website.insert_signature(signature).await;
                                            website.links_visited.extend_links(&mut links, res.0);
                                        }
                                    }
                                    _ => {
                                        website.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                            } else {
                                break;
                            }
                        }
                        else => break,
                    }

                    website
                        .dequeue(&mut q, &mut links, &mut exceeded_budget)
                        .await;

                    if links.is_empty() && set.is_empty() || exceeded_budget {
                        if exceeded_budget {
                            // Drain remaining tasks so their pages still get delivered.
                            while set.join_next().await.is_some() {}
                        }
                        break 'outer;
                    }
                }

                website.subscription_guard().await;
                website
                    .dequeue(&mut q, &mut links, &mut exceeded_budget)
                    .await;

                if links.is_empty() && set.is_empty() {
                    break;
                }
            }
            website
        }
    }
6480
    /// Concurrent Chrome-driven crawl that operates on a clone of `self` and
    /// returns the crawled `Website`. Each link gets its own browser tab via
    /// `attempt_navigation`; on any browser/setup failure a clone of `self` is
    /// returned unchanged.
    ///
    /// * `client` - HTTP client passed through to `Page::new` alongside the tab.
    /// * `handle` - optional shared crawl-state flag polled via `handle_process`.
    /// * `url` - optional start-URL override (prefix match keeps the parsed domain).
    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent_send(
        &self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        url: &Option<&str>,
    ) -> Website {
        use crate::features::chrome::attempt_navigation;

        match self.setup_browser().await {
            Some(mut b) => {
                // Warm-up navigation to a blank tab validates the browser session.
                match attempt_navigation(
                    "about:blank",
                    &b.browser.0,
                    &self.configuration.request_timeout,
                    &b.browser.2,
                    &self.configuration.viewport,
                )
                .await
                {
                    Ok(new_page) => {
                        let mut selectors = self.setup_selectors();
                        // Work on a clone; the clone is the return value.
                        let mut website = self.to_owned();

                        if let Some(u) = url {
                            match &website.domain_parsed {
                                Some(domain_url) => {
                                    if domain_url.as_str().starts_with(u) {
                                        website.set_url_only(u);
                                    } else {
                                        website.set_url(u);
                                    }
                                }
                                _ => {
                                    website.set_url(u);
                                }
                            }
                        }

                        if !website.send_configured {
                            website.configure_setup().await;
                        }

                        let base_links = website
                            .crawl_establish(client, &mut selectors, false, &new_page)
                            .await;

                        // The establish tab is no longer needed; each task opens its own.
                        drop(new_page);

                        if self.single_page() {
                            website.subscription_guard().await;
                            b.dispose();
                            website
                        } else {
                            let semaphore: Arc<Semaphore> = self.setup_semaphore();
                            let (mut interval, throttle) = self.setup_crawl();

                            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

                            let mut links: HashSet<CaseInsensitiveString> =
                                *self.extra_links.clone();

                            links.extend(base_links);

                            // Each task resolves to (discovered links, optional signature).
                            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
                                JoinSet::new();

                            // Shared task state: .0 client, .1 selectors, .2 channel,
                            // .3 external domains, .4 channel guard, .5 browser handle,
                            // .6 configuration, .7 root url string, .8 browser context,
                            // .9 parsed domain, .10 on_link_find_callback.
                            let shared = Arc::new((
                                client.to_owned(),
                                selectors,
                                self.channel.clone(),
                                self.configuration.external_domains_caseless.clone(),
                                self.channel_guard.clone(),
                                b.browser.0.clone(),
                                self.configuration.clone(),
                                self.url.inner().to_string(),
                                b.browser.2.clone(),
                                self.domain_parsed.clone(),
                                self.on_link_find_callback.clone(),
                            ));

                            let add_external = !shared.3.is_empty();
                            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
                            let full_resources = self.configuration.full_resources;
                            let return_page_links = self.configuration.return_page_links;
                            let mut exceeded_budget = false;
                            // Zero throttle means no inter-request delay.
                            let concurrency = throttle.is_zero();

                            website
                                .dequeue(&mut q, &mut links, &mut exceeded_budget)
                                .await;

                            if !concurrency && !links.is_empty() {
                                tokio::time::sleep(*throttle).await;
                            }

                            // Baseline instant for the optional crawl_timeout budget.
                            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                                Some(Instant::now())
                            } else {
                                None
                            };

                            'outer: loop {
                                #[cfg(all(feature = "agent", feature = "serde"))]
                                self.apply_url_prefilter(&mut links).await;

                                let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
                                    links.drain().collect(),
                                );

                                loop {
                                    if !concurrency {
                                        tokio::time::sleep(*throttle).await;
                                    }

                                    let semaphore =
                                        get_semaphore(&semaphore, !self.configuration.shared_queue)
                                            .await;

                                    tokio::select! {
                                        biased;
                                        // Only pull a new link while permits remain and the
                                        // crawl duration budget has not expired.
                                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                                            // On shutdown: abort in-flight tasks and return permits.
                                            if !self
                                                .handle_process(
                                                    handle,
                                                    &mut interval,
                                                    async {
                                                        emit_log_shutdown(link.inner());
                                                        let permits = set.len();
                                                        set.shutdown().await;
                                                        semaphore.add_permits(permits);
                                                    },
                                                )
                                                .await
                                            {
                                                break 'outer;
                                            }

                                            let allowed = website.is_allowed(&link);

                                            if allowed
                                                .eq(&ProcessLinkStatus::BudgetExceeded)
                                            {
                                                exceeded_budget = true;
                                                break;
                                            }
                                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                                continue;
                                            }

                                            emit_log(link.inner());

                                            // Mark visited before spawning to avoid duplicates.
                                            website.insert_link(link.clone()).await;

                                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                                let shared = shared.clone();
                                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                                spawn_set("page_fetch", &mut set, async move {
                                                    // Fresh tab per link; failure yields empty results.
                                                    let results = match attempt_navigation("about:blank", &shared.5, &shared.6.request_timeout, &shared.8, &shared.6.viewport).await {
                                                        Ok(new_page) => {
                                                            // Configure page events and request interception together.
                                                            let (_, intercept_handle) = tokio::join!(
                                                                crate::features::chrome::setup_chrome_events(&new_page, &shared.6),
                                                                crate::features::chrome::setup_chrome_interception_base(
                                                                    &new_page,
                                                                    shared.6.chrome_intercept.enabled,
                                                                    &shared.6.auth_challenge_response,
                                                                    shared.6.chrome_intercept.block_visuals,
                                                                    &shared.7,
                                                                )
                                                            );

                                                            // Allow the link-find callback to rewrite the target.
                                                            let link_result =
                                                                match &shared.10 {
                                                                    Some(cb) => cb(link, None),
                                                                    _ => (link, None),
                                                                };

                                                            let target_url = link_result.0.as_ref();

                                                            let mut page = Page::new(
                                                                target_url,
                                                                &shared.0,
                                                                &new_page,
                                                                &shared.6.wait_for,
                                                                &shared.6.screenshot,
                                                                false,
                                                                &shared.6.openai_config,
                                                                &shared.6.execution_scripts,
                                                                &shared.6.automation_scripts,
                                                                &shared.6.viewport,
                                                                &shared.6.request_timeout,
                                                                &shared.6.track_events,
                                                                shared.6.referer.clone(),
                                                                shared.6.max_page_bytes,
                                                                shared.6.get_cache_options(),
                                                                &shared.6.cache_policy,
                                                                &shared.6.remote_multimodal,
                                                            )
                                                            .await;

                                                            // Retry while the page signals retry, up to the
                                                            // configured count; 504s get a bounded backoff window.
                                                            let mut retry_count = shared.6.retry;

                                                            while page.should_retry && retry_count > 0 {
                                                                retry_count -= 1;
                                                                if let Some(timeout) = page.get_timeout() {
                                                                    tokio::time::sleep(timeout).await;
                                                                }
                                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                                                    if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                                        let p = Page::new(
                                                                            target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        ).await;
                                                                        page.clone_from(&p);

                                                                    }).await {
                                                                        log::info!("{target_url} backoff gateway timeout exceeded {elasped}");
                                                                    }
                                                                } else {
                                                                    page.clone_from(
                                                                        &Page::new(
                                                                            target_url,
                                                                            &shared.0,
                                                                            &new_page,
                                                                            &shared.6.wait_for,
                                                                            &shared.6.screenshot,
                                                                            false,
                                                                            &shared.6.openai_config,
                                                                            &shared.6.execution_scripts,
                                                                            &shared.6.automation_scripts,
                                                                            &shared.6.viewport,
                                                                            &shared.6.request_timeout,
                                                                            &shared.6.track_events,
                                                                            shared.6.referer.clone(),
                                                                            shared.6.max_page_bytes,
                                                                            shared.6.get_cache_options(),
                                                                            &shared.6.cache_policy,
                                                                            &shared.6.remote_multimodal,
                                                                        )
                                                                        .await,
                                                                    );
                                                                }
                                                            }

                                                            // Give the interception task 10s to finish, then abort it.
                                                            if let Some(h) = intercept_handle {
                                                                let abort_handle = h.abort_handle();
                                                                if let Err(elasped) = tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await {
                                                                    log::warn!("Handler timeout exceeded {elasped}");
                                                                    abort_handle.abort();
                                                                }
                                                            }

                                                            if add_external {
                                                                page.set_external(shared.3.clone());
                                                            }

                                                            // Temporarily swap in the freshly parsed base for link
                                                            // extraction, then restore the original afterwards.
                                                            let prev_domain = page.base.take();

                                                            page.set_url_parsed_direct();
                                                            let page_base = page.base.take().map(Box::new);

                                                            if return_page_links {
                                                                page.page_links = Some(Default::default());
                                                            }

                                                            let links = if full_resources {
                                                                page.links_full(&shared.1, &page_base).await
                                                            } else {
                                                                page.links(&shared.1, &page_base).await
                                                            };

                                                            page.base = prev_domain;

                                                            if shared.6.normalize {
                                                                page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                                            }

                                                            // Veto hook: page is still delivered (marked blocked),
                                                            // but its links never enter the frontier.
                                                            if let Some(ref cb) = on_should_crawl_callback {
                                                                if !cb.call(&page) {
                                                                    page.blocked_crawl = true;
                                                                    channel_send_page(&shared.2, page, &shared.4);
                                                                    drop(permit);
                                                                    return Default::default()
                                                                }
                                                            }

                                                            let signature = page.signature;

                                                            channel_send_page(
                                                                &shared.2, page, &shared.4,
                                                            );

                                                            (links, signature)
                                                        }
                                                        _ => Default::default(),
                                                    };

                                                    drop(permit);

                                                    results
                                                });
                                            }

                                            website.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                        }
                                        // Merge finished task output, honoring dedup signatures.
                                        Some(result) = set.join_next(), if !set.is_empty() => {
                                            if let Ok(res) = result {
                                                match res.1 {
                                                    Some(signature) => {
                                                        if website.is_signature_allowed(signature).await {
                                                            website.insert_signature(signature).await;
                                                            website.links_visited.extend_links(&mut links, res.0);
                                                        }
                                                    }
                                                    _ => {
                                                        website.links_visited.extend_links(&mut links, res.0);
                                                    }
                                                }
                                            } else{
                                                break
                                            }
                                        }
                                        else => break,
                                    };

                                    if links.is_empty() && set.is_empty() || exceeded_budget {
                                        if exceeded_budget {
                                            // Drain remaining tasks so their pages still deliver.
                                            while set.join_next().await.is_some() {}
                                        }
                                        break 'outer;
                                    }
                                }

                                website
                                    .dequeue(&mut q, &mut links, &mut exceeded_budget)
                                    .await;

                                if links.is_empty() && set.is_empty() {
                                    break;
                                }
                            }

                            website.subscription_guard().await;
                            b.dispose();

                            website
                        }
                    }
                    Err(err) => {
                        b.dispose();
                        log::error!("{}", err);
                        self.clone()
                    }
                }
            }
            _ => {
                log::error!("Chrome initialization failed.");
                self.clone()
            }
        }
    }
6862
6863 #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
6865 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6866 async fn _fetch_chrome(&self, client: &Client, url: &Option<&str>) {
6867 use crate::features::chrome::attempt_navigation;
6868
6869 match self.setup_browser().await {
6870 Some(mut b) => {
6871 match attempt_navigation(
6872 "about:blank",
6873 &b.browser.0,
6874 &self.configuration.request_timeout,
6875 &b.browser.2,
6876 &self.configuration.viewport,
6877 )
6878 .await
6879 {
6880 Ok(new_page) => {
6881 let mut selectors = self.setup_selectors();
6882 self.crawl_establish_chrome_one(client, &mut selectors, url, &new_page)
6883 .await;
6884 self.subscription_guard().await;
6885 b.dispose();
6886 }
6887 Err(err) => {
6888 b.dispose();
6889 log::error!("{}", err);
6890 }
6891 }
6892 }
6893 _ => {
6894 log::error!("Chrome initialization failed.");
6895 }
6896 }
6897 }
6898
6899 #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
6901 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
6902 async fn _fetch_chrome_persisted(
6903 &self,
6904 client: &Client,
6905 url: &Option<&str>,
6906 b: &crate::features::chrome::BrowserController,
6907 ) {
6908 use crate::features::chrome::attempt_navigation;
6909 match attempt_navigation(
6910 "about:blank",
6911 &b.browser.0,
6912 &self.configuration.request_timeout,
6913 &b.browser.2,
6914 &self.configuration.viewport,
6915 )
6916 .await
6917 {
6918 Ok(new_page) => {
6919 let mut selectors = self.setup_selectors();
6920 self.crawl_establish_chrome_one(client, &mut selectors, url, &new_page)
6921 .await;
6922 self.subscription_guard().await;
6923 }
6924 Err(err) => {
6925 log::error!("{}", err);
6926 }
6927 }
6928 }
6929
    /// Concurrent crawl driven through a WebDriver session (build without
    /// `chrome`). Mutates `self` directly: visited links, signatures, and any
    /// leftover frontier links (pushed back into `extra_links`) all land on `self`.
    ///
    /// * `client` - HTTP client used for the establishing request.
    /// * `handle` - optional shared crawl-state flag polled via `handle_process`.
    #[cfg(all(
        not(feature = "decentralized"),
        not(feature = "chrome"),
        feature = "webdriver"
    ))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn crawl_concurrent_webdriver(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
    ) {
        self.start();

        match self.setup_webdriver().await {
            Some(mut controller) => {
                let driver = controller.driver();
                let mut selectors = self.setup_selectors();
                self.status = CrawlStatus::Active;

                if self.single_page() {
                    // Single-page mode: establish once and tear down.
                    self.crawl_establish_webdriver_one(client, &mut selectors, &None, driver)
                        .await;
                    self.subscription_guard().await;
                    controller.dispose();
                } else {
                    let semaphore: Arc<Semaphore> = self.setup_semaphore();
                    let (mut interval, throttle) = self.setup_crawl();

                    let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

                    let base_links = self
                        .crawl_establish_webdriver_one(client, &mut selectors, &None, driver)
                        .await;

                    // Seed the frontier with queued extras plus the establish links.
                    let mut links: HashSet<CaseInsensitiveString> =
                        self.drain_extra_links().collect();

                    links.extend(base_links);

                    self.configuration.configure_allowlist();

                    // Per-navigation timeout from the webdriver config, if any.
                    let timeout = self
                        .configuration
                        .webdriver_config
                        .as_ref()
                        .and_then(|c| c.timeout);

                    // Each task resolves to (discovered links, optional signature).
                    let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> =
                        JoinSet::new();

                    // Shared task state: .0 client, .1 selectors, .2 channel,
                    // .3 external domains, .4 channel guard, .5 driver,
                    // .6 configuration, .7 root url string, .8 parsed domain,
                    // .9 on_link_find_callback, .10 timeout.
                    let shared = Arc::new((
                        client.to_owned(),
                        selectors,
                        self.channel.clone(),
                        self.configuration.external_domains_caseless.clone(),
                        self.channel_guard.clone(),
                        driver.clone(),
                        self.configuration.clone(),
                        self.url.inner().to_string(),
                        self.domain_parsed.clone(),
                        self.on_link_find_callback.clone(),
                        timeout,
                    ));

                    let add_external = !shared.3.is_empty();
                    let on_should_crawl_callback = self.on_should_crawl_callback.clone();
                    let full_resources = self.configuration.full_resources;
                    let return_page_links = self.configuration.return_page_links;
                    let mut exceeded_budget = false;
                    // Zero throttle means no inter-request delay.
                    let concurrency = throttle.is_zero();

                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                    if !concurrency && !links.is_empty() {
                        tokio::time::sleep(*throttle).await;
                    }

                    // Baseline instant for the optional crawl_timeout budget.
                    let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                        Some(Instant::now())
                    } else {
                        None
                    };

                    'outer: loop {
                        #[cfg(all(feature = "agent", feature = "serde"))]
                        self.apply_url_prefilter(&mut links).await;

                        let mut stream = tokio_stream::iter::<HashSet<CaseInsensitiveString>>(
                            links.drain().collect(),
                        );

                        loop {
                            if !concurrency {
                                tokio::time::sleep(*throttle).await;
                            }

                            let semaphore =
                                get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                            tokio::select! {
                                biased;
                                // Only pull a new link while permits remain and the
                                // crawl duration budget has not expired.
                                Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                                    // On shutdown: abort in-flight tasks and return permits.
                                    if !self
                                        .handle_process(
                                            handle,
                                            &mut interval,
                                            async {
                                                emit_log_shutdown(link.inner());
                                                let permits = set.len();
                                                set.shutdown().await;
                                                semaphore.add_permits(permits);
                                            },
                                        )
                                        .await
                                    {
                                        break 'outer;
                                    }

                                    let allowed = self.is_allowed(&link);

                                    if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                        exceeded_budget = true;
                                        break;
                                    }
                                    if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                        continue;
                                    }

                                    emit_log(link.inner());

                                    // Mark visited before spawning to avoid duplicates.
                                    self.insert_link(link.clone()).await;

                                    if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                        let shared = shared.clone();
                                        let on_should_crawl_callback = on_should_crawl_callback.clone();

                                        spawn_set("page_fetch_webdriver", &mut set, async move {
                                            // Allow the link-find callback to rewrite the target.
                                            let link_result = match &shared.9 {
                                                Some(cb) => cb(link, None),
                                                _ => (link, None),
                                            };

                                            let target_url = link_result.0.as_ref();

                                            crate::features::webdriver::setup_driver_events(&shared.5, &shared.6).await;

                                            let mut page = Page::new_page_webdriver(
                                                target_url,
                                                &shared.5,
                                                shared.10,
                                            )
                                            .await;

                                            // Retry while the page signals retry, up to the
                                            // configured count; 504s get a bounded backoff window.
                                            let mut retry_count = shared.6.retry;

                                            while page.should_retry && retry_count > 0 {
                                                retry_count -= 1;
                                                if let Some(timeout_duration) = page.get_timeout() {
                                                    tokio::time::sleep(timeout_duration).await;
                                                }
                                                if page.status_code == StatusCode::GATEWAY_TIMEOUT {
                                                    if let Err(elapsed) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                        let p = Page::new_page_webdriver(
                                                            target_url,
                                                            &shared.5,
                                                            shared.10,
                                                        ).await;
                                                        page.clone_from(&p);
                                                    }).await {
                                                        log::info!("{target_url} backoff gateway timeout exceeded {elapsed}");
                                                    }
                                                } else {
                                                    page.clone_from(
                                                        &Page::new_page_webdriver(
                                                            target_url,
                                                            &shared.5,
                                                            shared.10,
                                                        )
                                                        .await,
                                                    );
                                                }
                                            }

                                            if add_external {
                                                page.set_external(shared.3.clone());
                                            }

                                            // Temporarily swap in the freshly parsed base for link
                                            // extraction, then restore the original afterwards.
                                            let prev_domain = page.base.take();

                                            page.set_url_parsed_direct();
                                            let page_base = page.base.take().map(Box::new);

                                            if return_page_links {
                                                page.page_links = Some(Default::default());
                                            }

                                            let links = if full_resources {
                                                page.links_full(&shared.1, &page_base).await
                                            } else {
                                                page.links(&shared.1, &page_base).await
                                            };

                                            page.base = prev_domain;

                                            if shared.6.normalize {
                                                page.signature.replace(crate::utils::hash_html(page.get_html_bytes_u8()).await);
                                            }

                                            // Veto hook: page is still delivered (marked blocked),
                                            // but its links never enter the frontier.
                                            if let Some(ref cb) = on_should_crawl_callback {
                                                if !cb.call(&page) {
                                                    page.blocked_crawl = true;
                                                    channel_send_page(&shared.2, page, &shared.4);
                                                    drop(permit);
                                                    return Default::default();
                                                }
                                            }

                                            let signature = page.signature;

                                            channel_send_page(&shared.2, page, &shared.4);

                                            drop(permit);

                                            (links, signature)
                                        });
                                    }

                                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                }
                                // Merge finished task output, honoring dedup signatures.
                                Some(result) = set.join_next(), if !set.is_empty() => {
                                    if let Ok(res) = result {
                                        match res.1 {
                                            Some(signature) => {
                                                if self.is_signature_allowed(signature).await {
                                                    self.insert_signature(signature).await;
                                                    self.links_visited.extend_links(&mut links, res.0);
                                                }
                                            }
                                            _ => {
                                                self.links_visited.extend_links(&mut links, res.0);
                                            }
                                        }
                                    } else {
                                        break
                                    }

                                    // NOTE(review): this check duplicates the one directly after
                                    // the select! below; it only makes the arm exit earlier and
                                    // looks redundant — confirm before removing.
                                    if links.is_empty() && set.is_empty() || exceeded_budget {
                                        if exceeded_budget {
                                            while set.join_next().await.is_some() {}
                                        }
                                        break 'outer;
                                    }
                                }
                                else => break,
                            };

                            if links.is_empty() && set.is_empty() || exceeded_budget {
                                if exceeded_budget {
                                    // Drain remaining tasks so their pages still deliver.
                                    while set.join_next().await.is_some() {}
                                }
                                break 'outer;
                            }
                        }

                        self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                        if links.is_empty() && set.is_empty() {
                            break;
                        }
                    }

                    self.subscription_guard().await;
                    controller.dispose();

                    // Preserve any never-crawled frontier links for a later run.
                    if !links.is_empty() {
                        self.extra_links.extend(links);
                    }
                }
            }
            None => {
                log::error!("WebDriver initialization failed.");
            }
        }
    }
7218
7219 #[cfg(all(
7221 not(feature = "decentralized"),
7222 not(feature = "chrome"),
7223 feature = "webdriver"
7224 ))]
7225 pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
7226 if self.configuration.webdriver_config.is_some() {
7228 self.crawl_concurrent_webdriver(client, handle).await
7229 } else {
7230 self.crawl_concurrent_raw(client, handle).await
7231 }
7232 }
7233
    /// Start a concurrent crawl using the raw HTTP pipeline.
    ///
    /// This variant is compiled when neither `decentralized`, `chrome`,
    /// nor `webdriver` features are enabled, so the plain client path is
    /// the only available strategy.
    #[cfg(all(
        not(feature = "decentralized"),
        not(feature = "chrome"),
        not(feature = "webdriver")
    ))]
    pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        self.crawl_concurrent_raw(client, handle).await
    }
7243
    /// Start a concurrent crawl by delegating page fetches to decentralized
    /// worker nodes (`SPIDER_WORKER`), merging discovered links back into the
    /// local frontier until it drains or the budget is exceeded.
    #[cfg(feature = "decentralized")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn crawl_concurrent(&mut self, client: &Client, handle: &Option<Arc<AtomicI8>>) {
        // Optional external queue of links pushed in by subscribers.
        let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

        self.configuration.configure_allowlist();
        let domain = self.url.inner().as_str();
        let mut interval = Box::pin(tokio::time::interval(Duration::from_millis(10)));
        let throttle = Box::pin(self.get_delay());
        let on_link_find_callback = self.on_link_find_callback.clone();
        // Worker endpoint scheme: defaults to plain "http:" when SPIDER_WORKER is unset.
        let http_worker = std::env::var("SPIDER_WORKER")
            .unwrap_or_else(|_| "http:".to_string())
            .starts_with("http:");

        // Seed the frontier from the entry URL.
        let mut links: HashSet<CaseInsensitiveString> = self
            .crawl_establish(client, &(domain.into(), Default::default()), http_worker)
            .await;

        let mut set: JoinSet<HashSet<CaseInsensitiveString>> = JoinSet::new();
        let mut exceeded_budget = false;

        'outer: loop {
            #[cfg(all(feature = "agent", feature = "serde"))]
            self.apply_url_prefilter(&mut links).await;

            // Drain the current frontier into a throttled stream.
            let stream =
                tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect())
                    .throttle(*throttle);
            tokio::pin!(stream);

            loop {
                match stream.next().await {
                    Some(link) => {
                        // Honor pause/shutdown signals; on shutdown abort in-flight tasks.
                        if !self
                            .handle_process(handle, &mut interval, async {
                                emit_log_shutdown(link.inner());
                                set.shutdown().await;
                            })
                            .await
                        {
                            break 'outer;
                        }

                        let allowed = self.is_allowed(&link);

                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                            exceeded_budget = true;
                            break;
                        }
                        if allowed.eq(&ProcessLinkStatus::Blocked)
                            || !self.is_allowed_disk(&link).await
                        {
                            continue;
                        }

                        emit_log(link.inner());

                        self.insert_link(link.clone()).await;

                        if let Ok(permit) = SEM.acquire().await {
                            let client = client.clone();
                            let on_link_find_callback = on_link_find_callback.clone();

                            spawn_set("page_fetch", &mut set, async move {
                                let link_results = match &on_link_find_callback.clone() {
                                    Some(cb) => cb(link, None),
                                    _ => (link, None),
                                };
                                let link_results = link_results.0.as_ref();
                                // Downgrade https -> http when the worker speaks plain http.
                                let page = Page::new_links_only(
                                    &if http_worker && link_results.starts_with("https") {
                                        link_results.replacen("https", "http", 1).to_string()
                                    } else {
                                        link_results.to_string()
                                    },
                                    &client,
                                )
                                .await;

                                drop(permit);

                                page.links
                            });

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        }
                    }
                    _ => break,
                }
                if exceeded_budget {
                    break;
                }
            }

            // Join all spawned fetches, folding newly discovered links into the frontier.
            while let Some(res) = set.join_next().await {
                if let Ok(msg) = res {
                    self.links_visited.extend_links(&mut links, msg);
                }
            }

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if links.is_empty() || exceeded_budget {
                break;
            }
        }

        // Preserve any unprocessed links for a later run.
        if !links.is_empty() {
            self.extra_links.extend(links);
        }
    }
7357
    /// Warm up the Gemini model by opening a temporary browser page on
    /// `about:blank` and issuing a best-effort warm call.
    #[cfg(all(feature = "chrome", feature = "real_browser"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn warm_up_gemini(&mut self) {
        use crate::features::chrome::attempt_navigation;

        if let Some(mut b) = self.setup_browser().await {
            if let Ok(page) = attempt_navigation(
                "about:blank",
                &b.browser.0,
                &self.configuration.request_timeout,
                &b.browser.2,
                &self.configuration.viewport,
            )
            .await
            {
                // Best-effort: a failed warm-up is intentionally ignored.
                let _ = crate::features::solvers::warm_gemini_model(&page).await;
                // NOTE(review): `b.dispose()` only runs on the successful
                // navigation path — confirm whether a failed navigation
                // should also dispose the browser to avoid leaking it.
                b.dispose();
            }
        }
    }
7379
    /// Crawl the website concurrently in "smart" mode: pages are fetched over
    /// HTTP first and escalated to a headless-chrome render (lazily booted via
    /// a `OnceCell`) on retries. Discovered links feed back into the frontier
    /// until it drains, the budget is exceeded, or the crawl timeout fires.
    #[cfg(all(not(feature = "decentralized"), feature = "smart"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn crawl_concurrent_smart(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
    ) {
        use tokio::sync::OnceCell;
        self.start();
        self.status = CrawlStatus::Active;
        // Browser is created at most once, on first demand.
        let browser: crate::features::chrome::OnceBrowser = OnceCell::new();

        let mut selectors: (
            CompactString,
            smallvec::SmallVec<[CompactString; 2]>,
            CompactString,
        ) = self.setup_selectors();

        if self.single_page() {
            // Single-page mode: fetch the entry URL only.
            self.subscription_guard().await;
            self.crawl_establish_smart(&client, &mut selectors, &browser)
                .await;
        } else {
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());

            let mut links: HashSet<CaseInsensitiveString> = self.drain_extra_links().collect();

            let (mut interval, throttle) = self.setup_crawl();
            let on_should_crawl_callback = self.on_should_crawl_callback.clone();
            let return_page_links = self.configuration.return_page_links;

            links.extend(
                self.crawl_establish_smart(&client, &mut selectors, &browser)
                    .await,
            );

            self.configuration.configure_allowlist();

            // Each task resolves to (discovered links, optional page signature).
            let mut set: JoinSet<(HashSet<CaseInsensitiveString>, Option<u64>)> = JoinSet::new();
            let semaphore = self.setup_semaphore();

            // State shared by all spawned fetch tasks; indices referenced as
            // shared.N below: 0 client, 1 selectors, 2 channel, 3 channel guard,
            // 4 configuration, 5 parsed domain, 6 browser cell, 7 link-find
            // callback, 8 cookie jar.
            let shared = Arc::new((
                client.to_owned(),
                selectors,
                self.channel.clone(),
                self.channel_guard.clone(),
                self.configuration.clone(),
                self.domain_parsed.clone(),
                browser,
                self.on_link_find_callback.clone(),
                self.cookie_jar.clone(),
            ));

            let add_external = self.configuration.external_domains_caseless.len() > 0;
            let mut exceeded_budget = false;
            let concurrency = throttle.is_zero();

            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

            if !concurrency && !links.is_empty() {
                tokio::time::sleep(*throttle).await;
            }

            // Anchor point for the optional crawl timeout.
            let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                Some(Instant::now())
            } else {
                None
            };

            'outer: loop {
                #[cfg(all(feature = "agent", feature = "serde"))]
                self.apply_url_prefilter(&mut links).await;

                let mut stream =
                    tokio_stream::iter::<HashSet<CaseInsensitiveString>>(links.drain().collect());

                loop {
                    if !concurrency {
                        tokio::time::sleep(*throttle).await;
                    }

                    let semaphore =
                        get_semaphore(&semaphore, !self.configuration.shared_queue).await;

                    tokio::select! {
                        biased;
                        // Pull the next frontier link only while permits are free
                        // and the crawl timeout has not elapsed.
                        Some(link) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            if !self
                                .handle_process(
                                    handle,
                                    &mut interval,
                                    async {
                                        emit_log_shutdown(&link.inner());
                                        // Return the permits held by aborted tasks.
                                        let permits = set.len();
                                        set.shutdown().await;
                                        semaphore.add_permits(permits);

                                    },
                                )
                                .await
                            {
                                break 'outer;
                            }

                            let allowed = self.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                exceeded_budget = true;
                                break;
                            }
                            if allowed.eq(&ProcessLinkStatus::Blocked) || !self.is_allowed_disk(&link).await {
                                continue;
                            }

                            emit_log(&link.inner());
                            self.insert_link(link.clone()).await;

                            if let Ok(permit) = semaphore.clone().acquire_owned().await {
                                let shared = shared.clone();
                                let on_should_crawl_callback = on_should_crawl_callback.clone();
                                spawn_set("page_fetch", &mut set, async move {
                                    let link_result = match &shared.7 {
                                        Some(cb) => cb(link, None),
                                        _ => (link, None),
                                    };

                                    let url = link_result.0.as_ref();
                                    let mut page = Page::new_page_with_cache(
                                        &url,
                                        &shared.0,
                                        shared.4.get_cache_options(),
                                        &shared.4.cache_policy,
                                    )
                                    .await;

                                    let mut retry_count = shared.4.retry;

                                    // Retry loop: alternate between a chrome render
                                    // (on power-of-two counts) and a plain refetch.
                                    while page.should_retry && retry_count > 0 {
                                        retry_count -= 1;

                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }

                                        if page.status_code == StatusCode::GATEWAY_TIMEOUT {

                                            // Cap gateway-timeout backoff retries at BACKOFF_MAX_DURATION.
                                            if let Err(elasped) = tokio::time::timeout(BACKOFF_MAX_DURATION, async {
                                                if retry_count.is_power_of_two() {
                                                    Website::render_chrome_page(
                                                        &shared.4, &shared.0,
                                                        &mut page, url,
                                                        &shared.5,
                                                        &shared.6,
                                                    )
                                                    .await;
                                                } else {
                                                    let next_page = Page::new_page_with_cache(
                                                        url,
                                                        &shared.0,
                                                        shared.4.get_cache_options(),
                                                        &shared.4.cache_policy,
                                                    )
                                                    .await;

                                                    page.clone_from(&next_page)
                                                };

                                            }).await
                                            {
                                                log::info!("backoff gateway timeout exceeded {elasped}");
                                            }

                                        } else {

                                            if retry_count.is_power_of_two() {
                                                Website::render_chrome_page(
                                                    &shared.4, &shared.0,
                                                    &mut page, url,
                                                    &shared.5,
                                                    &shared.6,
                                                )
                                                .await;
                                            } else {
                                                page.clone_from(
                                                    &Page::new_page_with_cache(
                                                        url,
                                                        &shared.0,
                                                        shared.4.get_cache_options(),
                                                        &shared.4.cache_policy,
                                                    )
                                                    .await,
                                                );
                                            }
                                        }
                                    }

                                    if add_external {
                                        page.set_external(
                                            shared
                                                .4
                                                .external_domains_caseless
                                                .clone(),
                                        );
                                    }

                                    // Temporarily swap the base so link extraction
                                    // resolves against the fetched URL, then restore.
                                    let prev_domain = page.base.take();

                                    page.set_url_parsed_direct();
                                    let page_base = page.base.take().map(Box::new);

                                    if return_page_links {
                                        page.page_links = Some(Default::default());
                                    }

                                    let (links, bytes_transferred ) = page
                                        .smart_links(
                                            &shared.1, &shared.4, &page_base, &shared.6, Some(&shared.8)
                                        )
                                        .await;

                                    page.base = prev_domain;
                                    page.bytes_transferred = bytes_transferred;

                                    if shared.4.normalize {
                                        page.signature.replace(crate::utils::hash_html(&page.get_html_bytes_u8()).await);
                                    }

                                    // Optional remote multimodal extraction pass.
                                    #[cfg(all(feature = "agent", feature = "serde"))]
                                    if shared.4.remote_multimodal.is_some() {
                                        let html = page.get_html();
                                        if !html.is_empty() {
                                            use crate::features::automation::{run_remote_multimodal_extraction, AutomationResultExt};
                                            let title = page.metadata.as_ref().and_then(|m| m.title.as_ref()).map(|t| t.as_str());
                                            if let Ok(Some(result)) = run_remote_multimodal_extraction(
                                                &shared.4.remote_multimodal,
                                                &html,
                                                url,
                                                title,
                                            ).await {
                                                match page.remote_multimodal_usage.as_mut() {
                                                    Some(v) => v.push(result.usage.clone()),
                                                    None => page.remote_multimodal_usage = Some(vec![result.usage.clone()]),
                                                }
                                                if result.extracted.is_some() || result.screenshot.is_some() {
                                                    let automation_result = result.to_automation_results();
                                                    match page.extra_remote_multimodal_data.as_mut() {
                                                        Some(v) => v.push(automation_result),
                                                        None => page.extra_remote_multimodal_data = Some(vec![automation_result]),
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    // User veto: still publish the page, but stop expanding from it.
                                    if let Some(ref cb) = on_should_crawl_callback {
                                        if !cb.call(&page) {
                                            page.blocked_crawl = true;
                                            channel_send_page(&shared.2, page, &shared.3);
                                            drop(permit);
                                            return Default::default()
                                        }
                                    }

                                    let signature = page.signature;

                                    channel_send_page(&shared.2, page, &shared.3);

                                    drop(permit);

                                    (links, signature)
                                });
                            }

                            self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                        }
                        // Harvest finished tasks, de-duplicating by signature when present.
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res.1 {
                                    Some(signature) => {
                                        if self.is_signature_allowed(signature).await {
                                            self.insert_signature(signature).await;
                                            self.links_visited.extend_links(&mut links, res.0);
                                        }
                                    }
                                    _ => {
                                        self.links_visited.extend_links(&mut links, res.0);
                                    }
                                }
                            } else{
                                break
                            }
                        }
                        else => break,
                    }

                    if links.is_empty() && set.is_empty() || exceeded_budget {
                        if exceeded_budget {
                            // Let in-flight tasks finish before breaking out.
                            while set.join_next().await.is_some() {}
                        }
                        break 'outer;
                    }
                }

                self.subscription_guard().await;
                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;

                if links.is_empty() && set.is_empty() {
                    break;
                }
            }

            // Preserve any unprocessed links for a later run.
            if !links.is_empty() {
                self.extra_links.extend(links);
            }
        }
    }
7702
    /// No-op stub: sitemap crawling requires the `sitemap` feature flag.
    #[cfg(not(feature = "sitemap"))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn sitemap_crawl(
        &mut self,
        _client: &Client,
        _handle: &Option<Arc<AtomicI8>>,
        _scrape: bool,
    ) {
    }
7713
    /// No-op stub: the sitemap crawl chain requires the `sitemap` feature flag.
    #[cfg(not(feature = "sitemap"))]
    pub async fn sitemap_crawl_chain(
        &mut self,
        _client: &Client,
        _handle: &Option<Arc<AtomicI8>>,
        _scrape: bool,
    ) {
    }
7723
7724 #[cfg(feature = "sitemap")]
7726 pub(crate) fn get_sitemap_setup(&self, domain: &str) -> (&str, bool) {
7727 let (sitemap_path, needs_trailing) = match &self.configuration.sitemap_url {
7728 Some(sitemap_path) => {
7729 let sitemap_path = sitemap_path.as_str();
7730 if domain.ends_with('/') && sitemap_path.starts_with('/') {
7731 (&sitemap_path[1..], false)
7732 } else if !domain.ends_with('/')
7733 && !sitemap_path.is_empty()
7734 && !sitemap_path.starts_with('/')
7735 {
7736 (sitemap_path, true)
7737 } else {
7738 (sitemap_path, false)
7739 }
7740 }
7741 _ => ("sitemap.xml", !domain.ends_with("/")),
7742 };
7743
7744 (sitemap_path, needs_trailing)
7745 }
7746
    /// Crawl the website's sitemap(s) over plain HTTP: fetch each sitemap,
    /// expand nested sitemap indexes into `sitemaps`, stream resulting pages
    /// through an mpsc channel for link extraction, and merge discovered
    /// links into `extra_links`.
    #[cfg(feature = "sitemap")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub(crate) async fn sitemap_crawl_raw(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        let mut exceeded_budget = self.is_over_wild_budget(&self.configuration.budget);

        if !exceeded_budget {
            let selectors = self.setup_selectors();
            let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
            let domain = self.url.inner().as_str();
            self.domain_parsed = parse_absolute_url(domain);

            // Only persist pages when this is the initial crawl pass.
            let persist_links = self.status == CrawlStatus::Start;

            let mut interval: Interval = tokio::time::interval(Duration::from_millis(15));

            let (sitemap_path, needs_trailing) = self.get_sitemap_setup(domain);

            self.configuration.sitemap_url = Some(Box::new(
                string_concat!(domain, if needs_trailing { "/" } else { "" }, sitemap_path).into(),
            ));

            self.configuration.configure_allowlist();

            let domain_parsed_ref = self.domain_parsed.as_deref().cloned().map(Box::new);

            // Shared by the page-consumer task: 0 channel, 1 channel guard,
            // 2 selectors, 3 parsed domain.
            let shared = Arc::new((
                self.channel.clone(),
                self.channel_guard.clone(),
                selectors,
                domain_parsed_ref,
            ));
            let mut sitemaps = match &self.configuration.sitemap_url {
                Some(sitemap) => Vec::from([sitemap.to_owned()]),
                _ => Default::default(),
            };

            let return_page_links = self.configuration.return_page_links;

            let mut extra_links = self.extra_links.clone();
            self.dequeue(&mut q, &mut extra_links, &mut exceeded_budget)
                .await;
            self.extra_links.clone_from(&extra_links);

            // Temporarily widen the whitelist so sitemap URLs pass the filter.
            let whitelist_changes = self.configuration.add_sitemap_to_whitelist();

            if whitelist_changes.modified() {
                self.configuration.set_whitelist();
            }

            'outer: loop {
                let stream =
                    tokio_stream::iter::<Vec<Box<CompactString>>>(sitemaps.drain(..).collect());
                tokio::pin!(stream);

                let mut first_request = false;
                let mut attempted_correct = false;

                while let Some(mut sitemap_url) = stream.next().await {
                    if !self.handle_process(handle, &mut interval, async {}).await {
                        break 'outer;
                    }

                    let link = <CompactString as Clone>::clone(&(*sitemap_url)).into();

                    let allowed = self.is_allowed_budgetless(&link);

                    if allowed.eq(&ProcessLinkStatus::Blocked) {
                        continue;
                    }

                    self.insert_link(link).await;

                    // Consumer task: extracts links from each received page and
                    // republishes it on the subscription channel.
                    let (tx, mut rx) = tokio::sync::mpsc::channel::<Page>(100);

                    let shared = shared.clone();

                    let handles = crate::utils::spawn_task("page_fetch", async move {
                        let mut pages = Vec::new();

                        while let Some(mut page) = rx.recv().await {
                            if page.page_links.is_none() {
                                let links = page.links(&shared.2, &shared.3).await;
                                page.page_links = Some(links.into());
                            }

                            if scrape || persist_links {
                                pages.push(page.clone());
                            };

                            if !return_page_links {
                                page.page_links = None;
                            }

                            if shared.0.is_some() {
                                channel_send_page(&shared.0, page, &shared.1);
                            }
                        }

                        pages
                    });

                    // Keep requesting until a sitemap responds or path
                    // correction (sitemap_parse) gives up.
                    while !first_request {
                        match client.get(sitemap_url.as_str()).send().await {
                            Ok(response) => {
                                let limit = *crate::utils::MAX_SIZE_BYTES as u64;

                                // Skip bodies too large to parse safely.
                                if let Some(response_content_length) = response.content_length() {
                                    if limit > 0 && response_content_length >= limit {
                                        first_request = true;
                                        log::info!(
                                            "{} exceeded parse limit: {:?}",
                                            sitemap_url,
                                            limit
                                        );
                                        break;
                                    }
                                }

                                if response.status() == 404 {
                                    // Try to discover the real sitemap location.
                                    if !self
                                        .sitemap_parse(
                                            client,
                                            &mut first_request,
                                            &mut sitemap_url,
                                            &mut attempted_correct,
                                        )
                                        .await
                                    {
                                        break;
                                    }
                                } else {
                                    match response.bytes().await {
                                        Ok(b) => {
                                            first_request = true;
                                            self.sitemap_parse_crawl(
                                                client,
                                                handle,
                                                b,
                                                &mut interval,
                                                &mut exceeded_budget,
                                                &tx,
                                                &mut sitemaps,
                                                true,
                                            )
                                            .await;
                                        }
                                        Err(err) => {
                                            first_request = true;
                                            log::info!("http parse error: {:?}", err.to_string())
                                        }
                                    };
                                }
                            }
                            Err(err) => {
                                if attempted_correct {
                                    first_request = true;
                                    break;
                                }

                                log::info!("attempting to find sitemap path: {}", err.to_string());

                                if !self
                                    .sitemap_parse(
                                        client,
                                        &mut first_request,
                                        &mut sitemap_url,
                                        &mut attempted_correct,
                                    )
                                    .await
                                {
                                    break;
                                }
                            }
                        };
                    }

                    // Close the channel so the consumer task can finish.
                    drop(tx);

                    if let Ok(mut handle) = handles.await {
                        for page in handle.iter_mut() {
                            if let Some(mut links) = page.page_links.clone() {
                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                self.extra_links.extend(*links)
                            }
                        }
                        if scrape {
                            if let Some(p) = self.pages.as_mut() {
                                p.extend(handle);
                            }
                        }
                    }

                    if exceeded_budget {
                        break;
                    }
                }

                if sitemaps.is_empty() || exceeded_budget {
                    break;
                }
            }

            // Restore the whitelist to its pre-sitemap state.
            self.configuration
                .remove_sitemap_from_whitelist(whitelist_changes);
        }
    }
7963
    /// Crawl the website's sitemap(s) using a chrome browser to render each
    /// entry. XML sitemaps are parsed with the `sitemap` reader (nested
    /// sitemap indexes are queued); HTML fallbacks are link-extracted
    /// directly. Rendered pages are published on the subscription channel.
    #[cfg(all(
        feature = "sitemap",
        feature = "chrome",
        not(feature = "decentralized")
    ))]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub(crate) async fn sitemap_crawl_chrome(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        use crate::features::chrome::attempt_navigation;
        use sitemap::{
            reader::{SiteMapEntity, SiteMapReader},
            structs::Location,
        };

        let mut exceeded_budget = self.is_over_wild_budget(&self.configuration.budget);

        if !exceeded_budget {
            if let Some(mut b) = self.setup_browser().await {
                let selectors = self.setup_selectors();
                let semaphore: Arc<Semaphore> = self.setup_semaphore();
                let mut q = self.channel_queue.as_ref().map(|q| q.0.subscribe());
                let domain = self.url.inner().as_str();
                self.domain_parsed = parse_absolute_url(&domain);
                // Only persist pages when this is the initial crawl pass.
                let persist_links = self.status == CrawlStatus::Start;

                let mut interval = tokio::time::interval(Duration::from_millis(15));

                let (sitemap_path, needs_trailing) = self.get_sitemap_setup(&domain);

                self.configuration.sitemap_url = Some(Box::new(
                    string_concat!(domain, if needs_trailing { "/" } else { "" }, sitemap_path)
                        .into(),
                ));

                self.configuration.configure_allowlist();

                let domain_parsed_ref = self.domain_parsed.as_deref().cloned().map(Box::new);

                // Shared by spawned render tasks: 0 channel, 1 channel guard,
                // 2 browser handle, 3 configuration, 4 root url string,
                // 5 browser context, 6 selectors, 7 parsed domain.
                let shared = Arc::new((
                    self.channel.clone(),
                    self.channel_guard.clone(),
                    b.browser.0.clone(),
                    self.configuration.clone(),
                    self.url.inner().to_string(),
                    b.browser.2.clone(),
                    selectors.clone(),
                    domain_parsed_ref,
                ));

                let mut sitemaps = match &self.configuration.sitemap_url {
                    Some(sitemap) => Vec::from([sitemap.to_owned()]),
                    _ => Default::default(),
                };

                // Anchor point for the optional crawl timeout.
                let crawl_breaker = if self.configuration.crawl_timeout.is_some() {
                    Some(Instant::now())
                } else {
                    None
                };

                let mut extra_links = self.extra_links.clone();
                self.dequeue(&mut q, &mut *extra_links, &mut exceeded_budget)
                    .await;
                self.extra_links.clone_from(&extra_links);
                let mut set: JoinSet<Option<Page>> = JoinSet::new();

                // Temporarily widen the whitelist so sitemap URLs pass the filter.
                let whitelist_changes = self.configuration.add_sitemap_to_whitelist();

                if whitelist_changes.modified() {
                    self.configuration.set_whitelist();
                }

                'outer: loop {
                    let stream: tokio_stream::Iter<std::vec::IntoIter<Box<CompactString>>> =
                        tokio_stream::iter::<Vec<Box<CompactString>>>(sitemaps.drain(..).collect());
                    tokio::pin!(stream);

                    tokio::select! {
                        biased;
                        // Pull the next sitemap only while permits are free and
                        // the crawl timeout has not elapsed.
                        Some(sitemap_url) = stream.next(), if semaphore.available_permits() > 0 && !crawl_duration_expired(&self.configuration.crawl_timeout, &crawl_breaker) => {
                            if !self.handle_process(handle, &mut interval, async {}).await {
                                break 'outer;
                            }

                            let link = <CompactString as Clone>::clone(&(*sitemap_url)).into();

                            let allowed = self.is_allowed_budgetless(&link);

                            if allowed.eq(&ProcessLinkStatus::Blocked) {
                                continue;
                            }

                            self.insert_link(link).await;

                            // Render the sitemap itself in a fresh tab.
                            match attempt_navigation(
                                "about:blank",
                                &shared.2,
                                &self.configuration.request_timeout,
                                &shared.5,
                                &self.configuration.viewport,
                            )
                            .await {
                                Ok(new_page) => {
                                    let (_, intercept_handle) = tokio::join!(
                                        crate::features::chrome::setup_chrome_events(
                                            &new_page,
                                            &self.configuration
                                        ),
                                        self.setup_chrome_interception(&new_page)
                                    );

                                    let mut page = Page::new(
                                        &sitemap_url,
                                        &client,
                                        &new_page,
                                        &self.configuration.wait_for,
                                        &self.configuration.screenshot,
                                        false, &self.configuration.openai_config,
                                        &self.configuration.execution_scripts,
                                        &self.configuration.automation_scripts,
                                        &self.configuration.viewport,
                                        &self.configuration.request_timeout,
                                        &self.configuration.track_events,
                                        self.configuration.referer.clone(),
                                        self.configuration.max_page_bytes,
                                        self.configuration.get_cache_options(),
                                        &self.configuration.cache_policy,
                                        &self.configuration.remote_multimodal,
                                    )
                                    .await;

                                    // Bound how long the interception handler may run.
                                    if let Some(h) = intercept_handle {
                                        let abort_handle = h.abort_handle();
                                        if let Err(elasped) =
                                            tokio::time::timeout(tokio::time::Duration::from_secs(10), h).await
                                        {
                                            log::warn!("Handler timeout exceeded {elasped}");
                                            abort_handle.abort();
                                        }
                                    }

                                    drop(new_page);

                                    // XML detection: starts like XML and was not
                                    // rewrapped into an HTML document by the browser.
                                    let is_xml_entry = page.get_html_bytes_u8().starts_with(b"<?xml");
                                    let is_xml = is_xml_entry
                                        && !page.get_html_bytes_u8().ends_with(b"</html>");

                                    if is_xml {
                                        let reader = SiteMapReader::new(&*page.get_html_bytes_u8());
                                        let mut stream = tokio_stream::iter(reader);

                                        while let Some(entity) = stream.next().await {
                                            if !self.handle_process(handle, &mut interval, async {}).await {
                                                break;
                                            }
                                            match entity {
                                                SiteMapEntity::Url(url_entry) => match url_entry.loc {
                                                    Location::Url(url) => {
                                                        let link: CaseInsensitiveString = url.as_str().into();

                                                        let allowed = self.is_allowed(&link);

                                                        if allowed.eq(&ProcessLinkStatus::Blocked) {
                                                            continue;
                                                        }
                                                        if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                                            exceeded_budget = true;
                                                            break;
                                                        }

                                                        self.insert_link(link.clone()).await;

                                                        let client = client.clone();
                                                        let shared = shared.clone();

                                                        // Render each sitemap URL concurrently.
                                                        spawn_set("page_fetch", &mut set, async move {
                                                            if let Ok(new_page) = attempt_navigation(
                                                                "about:blank",
                                                                &shared.2,
                                                                &shared.3.request_timeout,
                                                                &shared.5,
                                                                &shared.3.viewport,
                                                            )
                                                            .await
                                                            {
                                                                let (_, intercept_handle) = tokio::join!(
                                                                    crate::features::chrome::setup_chrome_events(
                                                                        &new_page, &shared.3,
                                                                    ),
                                                                    crate::features::chrome::setup_chrome_interception_base(
                                                                        &new_page,
                                                                        shared.3.chrome_intercept.enabled,
                                                                        &shared.3.auth_challenge_response,
                                                                        shared.3.chrome_intercept.block_visuals,
                                                                        &shared.4,
                                                                    )
                                                                );

                                                                let mut page = Page::new(
                                                                    &link.inner(),
                                                                    &client,
                                                                    &new_page,
                                                                    &shared.3.wait_for,
                                                                    &shared.3.screenshot,
                                                                    false,
                                                                    &shared.3.openai_config,
                                                                    &shared.3.execution_scripts,
                                                                    &shared.3.automation_scripts,
                                                                    &shared.3.viewport,
                                                                    &shared.3.request_timeout,
                                                                    &shared.3.track_events,
                                                                    shared.3.referer.clone(),
                                                                    shared.3.max_page_bytes,
                                                                    shared.3.get_cache_options(),
                                                                    &shared.3.cache_policy,
                                                                    &shared.3.remote_multimodal,
                                                                )
                                                                .await;

                                                                if let Some(intercept_handle) = intercept_handle
                                                                {
                                                                    let abort_handle =
                                                                        intercept_handle.abort_handle();

                                                                    if let Err(elasped) = tokio::time::timeout(
                                                                        tokio::time::Duration::from_secs(10),
                                                                        async { intercept_handle.await },
                                                                    )
                                                                    .await
                                                                    {
                                                                        log::warn!("Handler timeout exceeded {elasped}");
                                                                        abort_handle.abort();
                                                                    }
                                                                }

                                                                if page.page_links.is_none() {
                                                                    let links =
                                                                        page.links(&shared.6, &shared.7).await;
                                                                    page.page_links = Some(links.into());
                                                                }

                                                                Some(page)
                                                            } else {
                                                                None
                                                            }
                                                        });
                                                    }
                                                    Location::None | Location::ParseErr(_) => (),
                                                },
                                                SiteMapEntity::SiteMap(sitemap_entry) => {
                                                    // Nested sitemap index: queue for a later pass.
                                                    match sitemap_entry.loc {
                                                        Location::Url(url) => {
                                                            sitemaps.push(Box::new(CompactString::new(
                                                                &url.as_str(),
                                                            )));
                                                        }
                                                        Location::None | Location::ParseErr(_) => (),
                                                    }
                                                }
                                                SiteMapEntity::Err(err) => {
                                                    log::info!("incorrect sitemap error: {:?}", err.msg(),)
                                                }
                                            };

                                            if exceeded_budget {
                                                break;
                                            }
                                        }
                                    } else {

                                        // HTML fallback: extract links directly from the page.
                                        if is_xml_entry {
                                            page.modify_xml_html();
                                        }

                                        let links = page.links(&shared.6, &shared.7).await;

                                        let mut stream = tokio_stream::iter(links);

                                        while let Some(link) = stream.next().await {
                                            if !self.handle_process(handle, &mut interval, async {}).await {
                                                break;
                                            }

                                            // Treat .xml links as further sitemaps.
                                            if link.ends_with(".xml") {
                                                sitemaps.push(Box::new(link.inner().clone()));
                                                continue;
                                            }

                                            let allowed = self.is_allowed(&link);

                                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                                exceeded_budget = true;
                                                break;
                                            }
                                            if allowed.eq(&ProcessLinkStatus::Blocked) {
                                                continue;
                                            }

                                            self.insert_link(link.clone()).await;

                                            let client = client.clone();
                                            let shared = shared.clone();

                                            spawn_set("page_fetch", &mut set, async move {
                                                match attempt_navigation(
                                                    "about:blank",
                                                    &shared.2,
                                                    &shared.3.request_timeout,
                                                    &shared.5,
                                                    &shared.3.viewport,
                                                )
                                                .await {
                                                    Ok(new_page) => {
                                                        let (_, intercept_handle) = tokio::join!(
                                                            crate::features::chrome::setup_chrome_events(
                                                                &new_page, &shared.3,
                                                            ),
                                                            crate::features::chrome::setup_chrome_interception_base(
                                                                &new_page,
                                                                shared.3.chrome_intercept.enabled,
                                                                &shared.3.auth_challenge_response,
                                                                shared.3.chrome_intercept.block_visuals,
                                                                &shared.4,
                                                            )
                                                        );

                                                        let mut page = Page::new(
                                                            &link.inner(),
                                                            &client,
                                                            &new_page,
                                                            &shared.3.wait_for,
                                                            &shared.3.screenshot,
                                                            false,
                                                            &shared.3.openai_config,
                                                            &shared.3.execution_scripts,
                                                            &shared.3.automation_scripts,
                                                            &shared.3.viewport,
                                                            &shared.3.request_timeout,
                                                            &shared.3.track_events,
                                                            shared.3.referer.clone(),
                                                            shared.3.max_page_bytes,
                                                            shared.3.get_cache_options(),
                                                            &shared.3.cache_policy,
                                                            &shared.3.remote_multimodal,
                                                        )
                                                        .await;

                                                        if let Some(intercept_handle) = intercept_handle {
                                                            let abort_handle = intercept_handle.abort_handle();

                                                            if let Err(elasped) = tokio::time::timeout(
                                                                tokio::time::Duration::from_secs(10),
                                                                async { intercept_handle.await },
                                                            )
                                                            .await
                                                            {
                                                                log::warn!("Handler timeout exceeded {elasped}");
                                                                abort_handle.abort();
                                                            }
                                                        }

                                                        if page.page_links.is_none() {
                                                            let links = page.links(&shared.6, &shared.7).await;
                                                            page.page_links = Some(links.into());
                                                        }

                                                        Some(page)
                                                    }
                                                    Err(err) => {
                                                        log::error!("chrome failed to open: {:?}", err);
                                                        None
                                                    }
                                                }
                                            });

                                            if exceeded_budget {
                                                break;
                                            }
                                        }
                                    }
                                }
                                Err(err) => {
                                    log::error!("chrome failed to open: {:?}", err);
                                }
                            }


                        },
                        // Harvest finished renders, de-duplicating by signature.
                        Some(result) = set.join_next(), if !set.is_empty() => {
                            if let Ok(res) = result {
                                match res {
                                    Some(page) => {
                                        if let Some(signature) = page.signature {
                                            if self.is_signature_allowed(signature).await {
                                                if let Some(mut links) = page.page_links.clone() {
                                                    self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                                    self.extra_links.extend(*links)
                                                }
                                                self.insert_signature(signature).await;

                                                channel_send_page(
                                                    &shared.0, page.clone(), &shared.1,
                                                );

                                                if scrape || persist_links {
                                                    if let Some(p) = self.pages.as_mut() {
                                                        p.push(page);
                                                    }
                                                }
                                            }
                                        } else {
                                            if let Some(mut links) = page.page_links.clone() {
                                                self.dequeue(&mut q, &mut links, &mut exceeded_budget).await;
                                                self.extra_links.extend(*links)
                                            }
                                            channel_send_page(
                                                &shared.0, page.clone(), &shared.1,
                                            );
                                            if scrape || persist_links {
                                                if let Some(p) = self.pages.as_mut() {
                                                    p.push(page);
                                                }
                                            }
                                        }
                                    }
                                    _ => ()
                                }
                            } else {
                                break;
                            }
                        }
                        else => break,
                    }

                    if sitemaps.len() == 0 || exceeded_budget {
                        break;
                    }
                }

                // Drain any still-running render tasks before disposing the browser.
                while let Some(result) = set.join_next().await {
                    if let Ok(res) = result {
                        match res {
                            Some(page) => {
                                if let Some(signature) = page.signature {
                                    if self.is_signature_allowed(signature).await {
                                        if let Some(mut links) = page.page_links.clone() {
                                            self.dequeue(&mut q, &mut links, &mut exceeded_budget)
                                                .await;
                                            self.extra_links.extend(*links)
                                        }
                                        self.insert_signature(signature).await;
                                        channel_send_page(&shared.0, page.clone(), &shared.1);
                                        if scrape || persist_links {
                                            if let Some(p) = self.pages.as_mut() {
                                                p.push(page);
                                            }
                                        }
                                    }
                                } else {
                                    if let Some(mut links) = page.page_links.clone() {
                                        self.dequeue(&mut q, &mut links, &mut exceeded_budget)
                                            .await;
                                        self.extra_links.extend(*links)
                                    }
                                    channel_send_page(&shared.0, page.clone(), &shared.1);
                                    if scrape || persist_links {
                                        if let Some(p) = self.pages.as_mut() {
                                            p.push(page);
                                        }
                                    }
                                }
                            }
                            _ => (),
                        }
                    }
                }
                b.dispose();
                // Restore the whitelist to its pre-sitemap state.
                self.configuration
                    .remove_sitemap_from_whitelist(whitelist_changes);
            }
        }
    }
8452
    /// Crawl the website's sitemap(s) over plain HTTP.
    ///
    /// Delegates to [`Website::sitemap_crawl_raw`]; `scrape` controls whether
    /// the resulting pages are retained.
    #[cfg(feature = "sitemap")]
    pub async fn sitemap_crawl(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        self.sitemap_crawl_raw(client, handle, scrape).await
    }
8463
    /// Chain a sitemap crawl onto a regular crawl (HTTP/decentralized build).
    /// No-op when `ignore_sitemap` is set in the configuration.
    #[cfg(all(
        feature = "sitemap",
        any(not(feature = "chrome"), feature = "decentralized")
    ))]
    async fn sitemap_crawl_chain(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        if !self.configuration.ignore_sitemap {
            self.sitemap_crawl_raw(client, handle, scrape).await
        }
    }
8479
    /// Chain a sitemap crawl onto a regular crawl (chrome build) using the
    /// chrome-backed sitemap crawler. No-op when `ignore_sitemap` is set.
    /// NOTE(review): this cfg variant is `pub` while the non-chrome variant is
    /// private — confirm the visibility difference is intended.
    #[cfg(all(
        feature = "sitemap",
        feature = "chrome",
        not(feature = "decentralized")
    ))]
    pub async fn sitemap_crawl_chain(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        scrape: bool,
    ) {
        if !self.configuration.ignore_sitemap {
            self.sitemap_crawl_chrome(client, handle, scrape).await
        }
    }
8496
    /// Attempt to discover the site's sitemap by fetching the root page and
    /// scanning it for a `<link rel="sitemap" href="…">` element.
    ///
    /// On success, `sitemap_url` is replaced with the discovered href and
    /// `attempted_correct` is set so the discovery is not repeated. When the
    /// page is too large, the request fails, or no link is found,
    /// `first_request` is set so the caller falls back to the default path.
    /// Returns `valid`: whether the root fetch itself was usable.
    #[cfg(feature = "sitemap")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    pub async fn sitemap_parse(
        &mut self,
        client: &Client,
        first_request: &mut bool,
        sitemap_url: &mut Box<CompactString>,
        attempted_correct: &mut bool,
    ) -> bool {
        // Only run discovery once per crawl.
        let mut valid = !*attempted_correct;

        if valid {
            if let Some(domain) = &self.domain_parsed {
                match client.get(domain.as_str()).send().await {
                    Ok(response) => {
                        let limit = *crate::utils::MAX_SIZE_BYTES as u64;

                        // Skip bodies larger than the configured parse limit.
                        if let Some(response_content_length) = response.content_length() {
                            if limit > 0 && response_content_length >= limit {
                                log::info!("{} exceeded parse limit: {:?}", domain, limit);
                                *first_request = true;
                                *attempted_correct = true;
                                valid = false;
                            }
                        }

                        if valid {
                            // Set exactly once by the element handler below.
                            let cell = tokio::sync::OnceCell::new();

                            let rewriter_settings = lol_html::Settings {
                                element_content_handlers: vec![lol_html::element!(
                                    r#"link[rel="sitemap"]"#,
                                    |el| {
                                        if let Some(href) = el.get_attribute("href") {
                                            let _ = cell.set(href);
                                        }
                                        Ok(())
                                    }
                                )],
                                adjust_charset_on_meta_tag: false,
                                ..lol_html::send::Settings::new_for_handler_types()
                            };

                            // Output sink is a no-op: we only want the handler side effect.
                            let mut rewriter = lol_html::send::HtmlRewriter::new(
                                rewriter_settings,
                                |_c: &[u8]| {},
                            );

                            let mut wrote_error = false;
                            let mut stream = response.bytes_stream();

                            // Stream the body chunk-by-chunk; stop as soon as a
                            // sitemap link is found or the rewriter errors.
                            while let Some(chunk) = stream.next().await {
                                if let Ok(chunk) = chunk {
                                    if rewriter.write(&chunk).is_err() {
                                        wrote_error = true;
                                        break;
                                    }
                                }
                                if cell.initialized() {
                                    break;
                                }
                            }

                            if !wrote_error {
                                let _ = rewriter.end();
                            }

                            if let Some(sitemap) = cell.get() {
                                // Empty or un-joinable hrefs still record the value
                                // but flag the caller to retry the default path.
                                if sitemap.is_empty() {
                                    *first_request = true;
                                }

                                if domain.join(sitemap).is_err() {
                                    *first_request = true;
                                }
                                *sitemap_url = Box::new(sitemap.into());
                                *attempted_correct = true;
                            } else {
                                *first_request = true;
                            }
                        }
                    }
                    Err(err) => {
                        *first_request = true;
                        valid = false;
                        log::info!("http parse error: {:?}", err.to_string())
                    }
                };
            }
        }

        valid
    }
    /// Parse a fetched sitemap XML body and process its entries.
    ///
    /// URL entries are budget/blocklist-checked, recorded as visited, and —
    /// when `crawl` is true — fetched on spawned tasks (with retry) and sent
    /// through `tx`. Nested sitemap entries are pushed onto `sitemaps` for the
    /// caller to process. Bodies that do not start with `<?xml` are ignored.
    /// Sets `exceeded_budget` and stops early when the crawl budget is hit.
    #[cfg(feature = "sitemap")]
    #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
    async fn sitemap_parse_crawl(
        &mut self,
        client: &Client,
        handle: &Option<Arc<AtomicI8>>,
        b: bytes::Bytes,
        interval: &mut Interval,
        exceeded_budget: &mut bool,
        tx: &tokio::sync::mpsc::Sender<Page>,
        sitemaps: &mut Vec<Box<CompactString>>,
        crawl: bool,
    ) {
        use sitemap::reader::{SiteMapEntity, SiteMapReader};
        use sitemap::structs::Location;

        // Cheap sniff: only parse documents that look like XML.
        if !b.is_empty() && b.starts_with(b"<?xml") {
            let mut stream = tokio_stream::iter(SiteMapReader::new(&*b));

            let retry = self.configuration.retry;

            while let Some(entity) = stream.next().await {
                // Respect pause/shutdown signals between entries.
                if !self.handle_process(handle, interval, async {}).await {
                    break;
                }
                match entity {
                    SiteMapEntity::Url(url_entry) => match url_entry.loc {
                        Location::Url(url) => {
                            let link: CaseInsensitiveString = url.as_str().into();

                            let allowed = self.is_allowed(&link);

                            if allowed.eq(&ProcessLinkStatus::Blocked) {
                                continue;
                            }

                            if allowed.eq(&ProcessLinkStatus::BudgetExceeded) {
                                *exceeded_budget = true;
                                break;
                            }

                            // Mark visited before fetching to avoid duplicates.
                            self.insert_link(link.clone()).await;

                            if crawl {
                                let client = client.clone();
                                let tx = tx.clone();
                                let cache_options = self.configuration.get_cache_options();
                                let cache_policy = self.configuration.cache_policy.clone();

                                // Fetch each sitemap URL concurrently on its own task.
                                crate::utils::spawn_task("page_fetch", async move {
                                    let mut page = Page::new_page_with_cache(
                                        link.inner(),
                                        &client,
                                        cache_options.clone(),
                                        &cache_policy,
                                    )
                                    .await;

                                    let mut retry_count = retry;

                                    // Retry transient failures, honoring any
                                    // server-suggested backoff from the page.
                                    while page.should_retry && retry_count > 0 {
                                        if let Some(timeout) = page.get_timeout() {
                                            tokio::time::sleep(timeout).await;
                                        }
                                        page.clone_from(
                                            &Page::new_page_with_cache(
                                                link.inner(),
                                                &client,
                                                cache_options.clone(),
                                                &cache_policy,
                                            )
                                            .await,
                                        );
                                        retry_count -= 1;
                                    }

                                    // Backpressure: wait for channel capacity.
                                    if let Ok(permit) = tx.reserve().await {
                                        permit.send(page);
                                    }
                                });
                            }
                        }
                        Location::None | Location::ParseErr(_) => (),
                    },
                    SiteMapEntity::SiteMap(sitemap_entry) => match sitemap_entry.loc {
                        Location::Url(url) => {
                            sitemaps.push(Box::new(CompactString::new(url.as_str())));
                        }
                        Location::None | Location::ParseErr(_) => (),
                    },
                    SiteMapEntity::Err(err) => {
                        log::info!("incorrect sitemap error: {:?}", err.msg())
                    }
                };

                if *exceeded_budget {
                    break;
                }
            }
        }
    }
8696
    /// The starting URL of the crawl (regex build: full case-insensitive form).
    #[cfg(feature = "regex")]
    pub fn get_base_link(&self) -> &CaseInsensitiveString {
        &self.url
    }

    /// The starting URL of the crawl (inner compact-string form).
    #[cfg(not(feature = "regex"))]
    pub fn get_base_link(&self) -> &CompactString {
        self.url.inner()
    }
8708
    /// Wait on the channel guard while subscribers still have pending pages.
    /// Only blocks when a broadcast channel exists, its receiver side is
    /// non-empty, and a guard counter has been installed via `subscribe_guard`.
    // NOTE(review): `guard_counter.lock().await` is assumed to wait until the
    // guard is released — confirm against `ChannelGuard`'s definition.
    pub async fn subscription_guard(&self) {
        if let Some(channel) = &self.channel {
            if !channel.1.is_empty() {
                if let Some(guard_counter) = &self.channel_guard {
                    guard_counter.lock().await
                }
            }
        }
    }
8719
    /// Launch a Chrome browser for the given configuration and wrap it in a
    /// `BrowserController`. Returns `None` when the launch fails.
    ///
    /// * `url_parsed` - the target URL used to scope cookies/context.
    /// * `jar` - optional cookie jar shared with the HTTP client.
    #[cfg(feature = "chrome")]
    pub async fn setup_browser_base(
        config: &Configuration,
        url_parsed: &Option<Box<Url>>,
        jar: Option<&Arc<crate::client::cookie::Jar>>,
    ) -> Option<crate::features::chrome::BrowserController> {
        match crate::features::chrome::launch_browser_cookies(config, url_parsed, jar).await {
            Some((browser, browser_handle, context_id)) => {
                // Share the browser handle across tasks via Arc.
                let browser: Arc<chromiumoxide::Browser> = Arc::new(browser);
                let b = (browser, Some(browser_handle), context_id);

                Some(crate::features::chrome::BrowserController::new(b))
            }
            _ => None,
        }
    }
8737
    /// Launch a browser controller for this website using its own
    /// configuration, parsed URL, and cookie jar.
    #[cfg(feature = "chrome")]
    pub async fn setup_browser(&self) -> Option<crate::features::chrome::BrowserController> {
        Website::setup_browser_base(
            &self.configuration,
            self.get_url_parsed(),
            Some(&self.cookie_jar),
        )
        .await
    }
8748
    /// Launch a WebDriver session based on this website's configuration.
    /// Returns `None` when the driver cannot be started.
    #[cfg(feature = "webdriver")]
    pub async fn setup_webdriver(&self) -> Option<crate::features::webdriver::WebDriverController> {
        crate::features::webdriver::launch_driver(&self.configuration).await
    }
8754
    /// Navigate the WebDriver to `url` and return the rendered page HTML.
    /// Returns `None` on navigation or content-retrieval failure (logged).
    #[cfg(feature = "webdriver")]
    pub async fn render_webdriver_page(
        &self,
        url: &str,
        driver: &std::sync::Arc<thirtyfour::WebDriver>,
    ) -> Option<String> {
        use crate::features::webdriver::{
            attempt_navigation, get_page_content, setup_driver_events,
        };

        // Per-navigation timeout from the webdriver config, when present.
        let timeout = self
            .configuration
            .webdriver_config
            .as_ref()
            .and_then(|c| c.timeout);

        if let Err(e) = attempt_navigation(url, driver, &timeout).await {
            log::error!("WebDriver navigation failed: {:?}", e);
            return None;
        }

        // Install configured event hooks after a successful navigation.
        setup_driver_events(driver, &self.configuration).await;

        match get_page_content(driver).await {
            Ok(content) => Some(content),
            Err(e) => {
                log::error!("Failed to get WebDriver page content: {:?}", e);
                None
            }
        }
    }
8790
    /// Respect `robots.txt` rules during the crawl.
    pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
        self.configuration
            .with_respect_robots_txt(respect_robots_txt);
        self
    }

    /// Include subdomains of the start URL in the crawl.
    pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
        self.configuration.with_subdomains(subdomains);
        self
    }

    /// Bypass Content-Security-Policy restrictions.
    pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_csp_bypass(enabled);
        self
    }

    /// Configure WebDriver-based rendering for the crawl.
    #[cfg(feature = "webdriver")]
    pub fn with_webdriver(
        &mut self,
        webdriver_config: crate::features::webdriver_common::WebDriverConfig,
    ) -> &mut Self {
        self.configuration
            .with_webdriver_config(Some(webdriver_config));
        self
    }

    /// No-op stub when the `webdriver` feature is disabled.
    #[cfg(not(feature = "webdriver"))]
    pub fn with_webdriver(&mut self, _webdriver_config: ()) -> &mut Self {
        self
    }

    /// Enable or disable sqlite disk persistence; disabling drops any handle.
    #[cfg(feature = "disk")]
    pub fn with_sqlite(&mut self, sqlite: bool) -> &mut Self {
        if sqlite {
            self.enable_sqlite = true;
        } else {
            self.enable_sqlite = false;
            self.sqlite = None;
        };
        self
    }

    /// No-op stub when the `disk` feature is disabled.
    #[cfg(not(feature = "disk"))]
    pub fn with_sqlite(&mut self, _sqlite: bool) -> &mut Self {
        self
    }
8845
    /// Allow crawling across all top-level domains of the host.
    pub fn with_tld(&mut self, tld: bool) -> &mut Self {
        self.configuration.with_tld(tld);
        self
    }

    /// Set an overall wall-clock limit for the whole crawl.
    pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
        self.configuration.with_crawl_timeout(crawl_timeout);
        self
    }

    /// Assume HTTP/2 without ALPN negotiation (prior knowledge).
    pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
        self.configuration
            .with_http2_prior_knowledge(http2_prior_knowledge);
        self
    }

    /// Delay between requests, in milliseconds (presumably — confirm units in `Configuration::with_delay`).
    pub fn with_delay(&mut self, delay: u64) -> &mut Self {
        self.configuration.with_delay(delay);
        self
    }

    /// Per-request timeout; `None` clears it.
    pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
        self.configuration.with_request_timeout(request_timeout);
        self
    }

    /// Accept invalid TLS certificates (dangerous; testing only).
    pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
        self.configuration
            .with_danger_accept_invalid_certs(accept_invalid_certs);
        self
    }

    /// Set the User-Agent header; `None` uses the default.
    pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
        self.configuration.with_user_agent(user_agent);
        self
    }

    /// Preserve the Host header across redirects/requests.
    pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
        self.configuration.with_preserve_host_header(preserve);
        self
    }
8895
    /// Set an explicit sitemap URL to crawl.
    #[cfg(feature = "sitemap")]
    pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
        self.configuration.with_sitemap(sitemap_url);
        self
    }

    /// No-op stub when the `sitemap` feature is disabled.
    #[cfg(not(feature = "sitemap"))]
    pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
        self
    }

    /// Set proxy server URLs to route requests through.
    pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
        self.configuration.with_proxies(proxies);
        self
    }

    /// Set proxies using fully-specified `RequestProxy` values.
    pub fn with_proxies_direct(
        &mut self,
        proxies: Option<Vec<crate::configuration::RequestProxy>>,
    ) -> &mut Self {
        self.configuration.with_proxies_direct(proxies);
        self
    }

    /// Cap the number of concurrent in-flight requests.
    pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
        self.configuration.with_concurrency_limit(limit);
        self
    }

    /// No-op stub when the `control` feature is disabled.
    #[cfg(not(feature = "control"))]
    pub fn with_crawl_id(&mut self, _crawl_id: String) -> &mut Self {
        self
    }

    /// Set the crawl id used by the control channel.
    #[cfg(feature = "control")]
    pub fn with_crawl_id(&mut self, crawl_id: String) -> &mut Self {
        self.crawl_id = crawl_id.into();
        self
    }
8942
    /// Set URL patterns that must never be crawled.
    pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
    where
        Vec<CompactString>: From<Vec<T>>,
    {
        self.configuration.with_blacklist_url(blacklist_url);
        self
    }

    /// Number of retry attempts for failed requests.
    pub fn with_retry(&mut self, retry: u8) -> &mut Self {
        self.configuration.with_retry(retry);
        self
    }

    /// Disable the control thread for the crawl.
    pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
        self.configuration.with_no_control_thread(no_control_thread);
        self
    }

    /// Restrict crawling to URLs matching these patterns.
    pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
    where
        Vec<CompactString>: From<Vec<T>>,
    {
        self.configuration.with_whitelist_url(whitelist_url);
        self
    }
8972
    /// Track Chrome DevTools events during browser crawls.
    #[cfg(feature = "chrome")]
    pub fn with_event_tracker(
        &mut self,
        track_events: Option<crate::configuration::ChromeEventTracker>,
    ) -> &mut Self {
        self.configuration.with_event_tracker(track_events);
        self
    }

    /// Set custom HTTP headers sent with every request.
    pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
        self.configuration.with_headers(headers);
        self
    }

    /// Allow per-request header modification.
    pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
        self.configuration.with_modify_headers(modify_headers);
        self
    }

    /// Allow modification of the underlying HTTP client's headers.
    pub fn with_modify_http_client_headers(
        &mut self,
        modify_http_client_headers: bool,
    ) -> &mut Self {
        self.configuration
            .with_modify_http_client_headers(modify_http_client_headers);
        self
    }

    /// Set per-path crawl budgets (max pages per path prefix).
    pub fn with_budget(&mut self, budget: Option<HashMap<&str, u32>>) -> &mut Self {
        self.configuration.with_budget(budget);
        self
    }

    /// Directly replace the crawl budget map (non-chaining setter).
    pub fn set_crawl_budget(&mut self, budget: Option<HashMap<CaseInsensitiveString, u32>>) {
        self.configuration.budget = budget;
    }

    /// Maximum crawl depth from the start URL.
    pub fn with_depth(&mut self, depth: usize) -> &mut Self {
        self.configuration.with_depth(depth);
        self
    }

    /// Additional external domains allowed during the crawl.
    pub fn with_external_domains<'a, 'b>(
        &mut self,
        external_domains: Option<impl Iterator<Item = String> + 'a>,
    ) -> &mut Self {
        self.configuration.with_external_domains(external_domains);
        self
    }
9030
9031 pub fn with_on_link_find_callback(
9033 &mut self,
9034 on_link_find_callback: Option<OnLinkFindCallback>,
9035 ) -> &mut Self {
9036 match on_link_find_callback {
9037 Some(callback) => self.on_link_find_callback = Some(callback),
9038 _ => self.on_link_find_callback = None,
9039 };
9040 self
9041 }
9042
    /// Set a link-find callback from any closure, wrapping it in an `Arc`.
    /// The closure receives the found link and optional context and returns
    /// the (possibly rewritten) pair.
    pub fn set_on_link_find<F>(&mut self, f: F)
    where
        F: Fn(CaseInsensitiveString, Option<String>) -> (CaseInsensitiveString, Option<String>)
            + Send
            + Sync
            + 'static,
    {
        self.on_link_find_callback = Some(Arc::new(f));
    }
9053
9054 pub fn with_on_should_crawl_callback(
9056 &mut self,
9057 on_should_crawl_callback: Option<fn(&Page) -> bool>,
9058 ) -> &mut Self {
9059 match on_should_crawl_callback {
9060 Some(callback) => {
9061 self.on_should_crawl_callback = Some(OnShouldCrawlCallback::Fn(callback))
9062 }
9063 _ => self.on_should_crawl_callback = None,
9064 };
9065 self
9066 }
9067
9068 pub fn with_on_should_crawl_callback_closure<F: OnShouldCrawlClosure>(
9072 &mut self,
9073 on_should_crawl_closure: Option<F>,
9074 ) -> &mut Self {
9075 match on_should_crawl_closure {
9076 Some(callback) => {
9077 self.on_should_crawl_callback =
9078 Some(OnShouldCrawlCallback::Closure(Arc::new(callback)))
9079 }
9080 _ => self.on_should_crawl_callback = None,
9081 };
9082 self
9083 }
9084
    /// Set a cookie string sent with requests.
    pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
        self.configuration.with_cookies(cookie_str);
        self
    }

    /// Configure a cron schedule and its run type for recurring crawls.
    pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
        self.configuration.with_cron(cron_str, cron_type);
        self
    }

    /// Set the locale (e.g. Accept-Language / browser locale).
    pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
        self.configuration.with_locale(locale);
        self
    }

    /// Enable basic stealth/anti-bot evasion.
    pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
        self.configuration.with_stealth(stealth_mode);
        self
    }

    /// Select a specific stealth tier (chrome builds only).
    #[cfg(feature = "chrome")]
    pub fn with_stealth_advanced(
        &mut self,
        stealth_mode: spider_fingerprint::configs::Tier,
    ) -> &mut Self {
        self.configuration.with_stealth_advanced(stealth_mode);
        self
    }

    /// Set the basic HTTP cache policy for page fetches.
    pub fn with_cache_policy(
        &mut self,
        cache_policy: Option<crate::utils::BasicCachePolicy>,
    ) -> &mut Self {
        self.configuration.with_cache_policy(cache_policy);

        self
    }

    /// Configure OpenAI-driven automation for the crawl.
    pub fn with_openai(&mut self, openai_configs: Option<configuration::GPTConfigs>) -> &mut Self {
        self.configuration.with_openai(openai_configs);
        self
    }

    /// Configure a remote multimodal model for automation (chrome builds).
    #[cfg(feature = "chrome")]
    pub fn with_remote_multimodal(
        &mut self,
        cfg: Option<crate::features::automation::RemoteMultimodalConfigs>,
    ) -> &mut Self {
        self.configuration.with_remote_multimodal(cfg);
        self
    }

    /// Configure Gemini-driven automation for the crawl.
    pub fn with_gemini(
        &mut self,
        gemini_configs: Option<configuration::GeminiConfigs>,
    ) -> &mut Self {
        self.configuration.with_gemini(gemini_configs);
        self
    }
9224
    /// Enable HTTP response caching for requests.
    pub fn with_caching(&mut self, cache: bool) -> &mut Self {
        self.configuration.with_caching(cache);
        self
    }

    /// Skip the cache when fetching via the browser.
    pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self {
        self.configuration.with_cache_skip_browser(skip);
        self
    }

    /// Allow service workers to run during browser rendering.
    pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_service_worker_enabled(enabled);
        self
    }

    /// Automatically answer geolocation prompts/requests.
    pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
        self.configuration.with_auto_geolocation(enabled);
        self
    }

    /// Use a fully-specified browser fingerprint (chrome builds).
    #[cfg(feature = "chrome")]
    pub fn with_fingerprint_advanced(
        &mut self,
        fingerprint: crate::configuration::Fingerprint,
    ) -> &mut Self {
        self.configuration.with_fingerprint_advanced(fingerprint);
        self
    }

    /// Enable fingerprint randomization/spoofing.
    pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
        self.configuration.with_fingerprint(fingerprint);
        self
    }

    /// Set the browser viewport dimensions.
    pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
        self.configuration.with_viewport(viewport);
        self
    }

    /// Wait for the network to go idle before treating a page as loaded.
    pub fn with_wait_for_idle_network(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_idle_network(wait_for_idle_network);
        self
    }

    /// Wait for a fully idle network (zero in-flight requests).
    pub fn with_wait_for_idle_network0(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_idle_network0(wait_for_idle_network);
        self
    }

    /// Wait for a nearly idle network before proceeding.
    pub fn with_wait_for_almost_idle_network0(
        &mut self,
        wait_for_idle_network: Option<crate::configuration::WaitForIdleNetwork>,
    ) -> &mut Self {
        self.configuration
            .with_wait_for_almost_idle_network0(wait_for_idle_network);
        self
    }

    /// Wait for a CSS selector to appear before capturing the page.
    pub fn with_wait_for_selector(
        &mut self,
        wait_for_selector: Option<crate::configuration::WaitForSelector>,
    ) -> &mut Self {
        self.configuration.with_wait_for_selector(wait_for_selector);
        self
    }

    /// Wait for the DOM to stop mutating before capturing the page.
    pub fn with_wait_for_idle_dom(
        &mut self,
        wait_for_selector: Option<crate::configuration::WaitForSelector>,
    ) -> &mut Self {
        self.configuration.with_wait_for_idle_dom(wait_for_selector);
        self
    }

    /// Wait a fixed delay before capturing the page.
    pub fn with_wait_for_delay(
        &mut self,
        wait_for_delay: Option<crate::configuration::WaitForDelay>,
    ) -> &mut Self {
        self.configuration.with_wait_for_delay(wait_for_delay);
        self
    }
9327
    /// Default TCP connect timeout for the HTTP client.
    pub fn with_default_http_connect_timeout(
        &mut self,
        default_http_connect_timeout: Option<Duration>,
    ) -> &mut Self {
        self.configuration
            .with_default_http_connect_timeout(default_http_connect_timeout);

        self
    }

    /// Default read timeout for the HTTP client.
    pub fn with_default_http_read_timeout(
        &mut self,
        default_http_read_timeout: Option<Duration>,
    ) -> &mut Self {
        self.configuration
            .with_default_http_read_timeout(default_http_read_timeout);

        self
    }

    /// Maximum number of redirects to follow per request.
    pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
        self.configuration.with_redirect_limit(redirect_limit);
        self
    }

    /// Set the redirect-following policy (e.g. strict/loose).
    pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
        self.configuration.with_redirect_policy(policy);
        self
    }

    /// Configure Chrome request interception, scoped to the parsed domain.
    pub fn with_chrome_intercept(
        &mut self,
        chrome_intercept: RequestInterceptConfiguration,
    ) -> &mut Self {
        self.configuration
            .with_chrome_intercept(chrome_intercept, &self.domain_parsed);
        self
    }

    /// Set the Referer header value.
    // NOTE(review): `with_referer` and `with_referrer` both exist — presumably
    // one is a spelling alias; confirm they delegate to the same setting.
    pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
        self.configuration.with_referer(referer);
        self
    }

    /// Set the Referer header value (alternate spelling).
    pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
        self.configuration.with_referrer(referer);
        self
    }

    /// Crawl all resources (assets), not just HTML pages.
    pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
        self.configuration.with_full_resources(full_resources);
        self
    }
9389
9390 pub fn with_dismiss_dialogs(&mut self, full_resources: bool) -> &mut Self {
9392 self.configuration.with_dismiss_dialogs(full_resources);
9393 self
9394 }
9395
    /// Set the TLS/browser emulation profile (wreq builds).
    #[cfg(feature = "wreq")]
    pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
        self.configuration.with_emulation(emulation);
        self
    }

    /// Skip sitemap crawling entirely.
    pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
        self.configuration.with_ignore_sitemap(ignore_sitemap);
        self
    }

    /// Set the browser timezone identifier (e.g. "America/New_York").
    pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
        self.configuration.with_timezone_id(timezone_id);
        self
    }
9414
    /// JavaScript evaluated on every new document before page scripts run.
    pub fn with_evaluate_on_new_document(
        &mut self,
        evaluate_on_new_document: Option<Box<String>>,
    ) -> &mut Self {
        self.configuration
            .with_evaluate_on_new_document(evaluate_on_new_document);

        self
    }

    /// Maximum number of pages to crawl.
    pub fn with_limit(&mut self, limit: u32) -> &mut Self {
        self.configuration.with_limit(limit);
        self
    }

    /// Configure page screenshots during browser crawls.
    pub fn with_screenshot(
        &mut self,
        screenshot_config: Option<configuration::ScreenShotConfig>,
    ) -> &mut Self {
        self.configuration.with_screenshot(screenshot_config);
        self
    }

    /// Share one crawl queue across websites.
    pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
        self.configuration.with_shared_queue(shared_queue);
        self
    }

    /// Respond to HTTP auth challenges with the given credentials.
    pub fn with_auth_challenge_response(
        &mut self,
        auth_challenge_response: Option<configuration::AuthChallengeResponse>,
    ) -> &mut Self {
        self.configuration
            .with_auth_challenge_response(auth_challenge_response);
        self
    }

    /// Attach discovered links to each returned `Page`.
    pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
        self.configuration.with_return_page_links(return_page_links);
        self
    }

    /// Connect to an existing Chrome instance instead of launching one.
    pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
        self.configuration
            .with_chrome_connection(chrome_connection_url);
        self
    }

    /// Scripts executed on matching pages during the crawl.
    pub fn with_execution_scripts(
        &mut self,
        execution_scripts: Option<ExecutionScriptsMap>,
    ) -> &mut Self {
        self.configuration.with_execution_scripts(execution_scripts);
        self
    }

    /// Automation scripts run on matching pages during the crawl.
    pub fn with_automation_scripts(
        &mut self,
        automation_scripts: Option<AutomationScriptsMap>,
    ) -> &mut Self {
        self.configuration
            .with_automation_scripts(automation_scripts);
        self
    }

    /// Bind outbound requests to a specific network interface.
    pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
        self.configuration.with_network_interface(network_interface);
        self
    }

    /// Bind outbound requests to a specific local IP address.
    pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
        self.configuration.with_local_address(local_address);
        self
    }

    /// Block non-HTML asset requests (only crawl HTML).
    pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
        self.configuration.with_block_assets(only_html);
        self
    }

    /// Normalize URLs before visiting/deduplicating.
    pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
        self.configuration.with_normalize(normalize);
        self
    }

    /// Share crawl state between websites.
    pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
        self.configuration.with_shared_state(shared);
        self
    }

    /// Maximum bytes retained per page.
    pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
        self.configuration.with_max_page_bytes(max_page_bytes);
        self
    }

    /// Maximum total bytes allowed across the crawl.
    pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
        self.configuration.with_max_bytes_allowed(max_bytes_allowed);
        self
    }
9530
    /// Replace the entire configuration at once.
    pub fn with_config(&mut self, config: Configuration) -> &mut Self {
        self.configuration = config.into();
        self
    }

    /// Route the crawl through Spider Cloud using the given API key.
    #[cfg(feature = "spider_cloud")]
    pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
        self.configuration.with_spider_cloud(api_key);
        self
    }

    /// No-op stub when the `spider_cloud` feature is disabled.
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
        self
    }

    /// Configure Spider Cloud with a full configuration value.
    #[cfg(feature = "spider_cloud")]
    pub fn with_spider_cloud_config(
        &mut self,
        config: crate::configuration::SpiderCloudConfig,
    ) -> &mut Self {
        self.configuration.with_spider_cloud_config(config);
        self
    }

    /// No-op stub when the `spider_cloud` feature is disabled.
    #[cfg(not(feature = "spider_cloud"))]
    pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
        self
    }

    /// Configure hedged requests (duplicate slow requests) for latency.
    #[cfg(feature = "hedge")]
    pub fn with_hedge(&mut self, config: crate::utils::hedge::HedgeConfig) -> &mut Self {
        self.configuration.with_hedge(config);
        self
    }

    /// No-op stub when the `hedge` feature is disabled.
    #[cfg(not(feature = "hedge"))]
    pub fn with_hedge(&mut self, _config: ()) -> &mut Self {
        self
    }
9578
9579 pub fn build(&self) -> Result<Self, Self> {
9581 if self.domain_parsed.is_none() {
9582 Err(self.to_owned())
9583 } else {
9584 Ok(self.to_owned())
9585 }
9586 }
9587
    /// Remove all configured custom HTTP headers (keeps the header map itself).
    pub fn clear_headers(&mut self) {
        if let Some(headers) = self.configuration.headers.as_mut() {
            headers.0.clear();
        }
    }
9594
9595 pub fn determine_limits(&mut self) {
9597 self.configuration.configure_budget();
9598 if self.configuration.inner_budget.is_some() {
9599 let wild_card_budget = match &self.configuration.inner_budget {
9600 Some(budget) => budget.contains_key(&*WILD_CARD_PATH),
9601 _ => false,
9602 };
9603 self.configuration.wild_card_budgeting = wild_card_budget;
9604 }
9605 if self.configuration.depth > 0 && self.domain_parsed.is_some() {
9606 if let Some(domain) = &self.domain_parsed {
9607 if let Some(segments) = domain.path_segments() {
9608 let segments_cnt = segments.count();
9609
9610 if segments_cnt > self.configuration.depth {
9611 self.configuration.depth_distance = self.configuration.depth
9612 + self.configuration.depth.abs_diff(segments_cnt);
9613 } else {
9614 self.configuration.depth_distance = self.configuration.depth;
9615 }
9616 }
9617 }
9618 }
9619 }
9620
    /// Subscription stub when the `sync` feature is disabled: always `None`.
    #[cfg(not(feature = "sync"))]
    pub fn subscribe(&mut self, _capacity: usize) -> Option<broadcast::Receiver<Page>> {
        None
    }

    /// Subscribe to pages as they are crawled.
    ///
    /// Lazily creates the broadcast channel on first call; `capacity == 0`
    /// falls back to `DEFAULT_PERMITS`, and the result is clamped to at least
    /// 1 since `broadcast::channel` panics on a zero capacity.
    #[cfg(feature = "sync")]
    pub fn subscribe(&mut self, capacity: usize) -> Option<broadcast::Receiver<Page>> {
        let channel = self.channel.get_or_insert_with(|| {
            let (tx, rx) = broadcast::channel(
                (if capacity == 0 {
                    *DEFAULT_PERMITS
                } else {
                    capacity
                })
                .max(1),
            );
            (tx, Arc::new(rx))
        });

        // Each call hands out a fresh receiver on the shared sender.
        let rx2 = channel.0.subscribe();

        Some(rx2)
    }
9700
9701 #[cfg(feature = "sync")]
9703 pub fn queue(&mut self, capacity: usize) -> Option<broadcast::Sender<String>> {
9704 let channel = self.channel_queue.get_or_insert_with(|| {
9705 let (tx, rx) = broadcast::channel(capacity);
9706 (tx, Arc::new(rx))
9707 });
9708
9709 Some(channel.0.to_owned())
9710 }
9711
    /// Queue stub when the `sync` feature is disabled: always `None`.
    // NOTE(review): this stub's return type (Page channel pair) differs from
    // the sync variant's `Sender<String>` — confirm the asymmetry is intended.
    #[cfg(not(feature = "sync"))]
    pub fn queue(
        &mut self,
        _capacity: usize,
    ) -> Option<Arc<(broadcast::Sender<Page>, broadcast::Receiver<Page>)>> {
        None
    }

    /// Unsubscribe stub when the `sync` feature is disabled.
    #[cfg(not(feature = "sync"))]
    pub fn unsubscribe(&mut self) {}

    /// Drop the page broadcast channel, ending all active subscriptions.
    #[cfg(feature = "sync")]
    pub fn unsubscribe(&mut self) {
        self.channel.take();
    }
9730
    /// The page broadcast channel, if a subscription has been created.
    pub fn get_channel(
        &self,
    ) -> &Option<(broadcast::Sender<Page>, Arc<broadcast::Receiver<Page>>)> {
        &self.channel
    }

    /// The channel guard, if one has been installed via `subscribe_guard`.
    pub fn get_channel_guard(&self) -> &Option<ChannelGuard> {
        &self.channel_guard
    }
9742
    /// Guard stub when the `sync` feature is disabled: always `None`.
    #[cfg(not(feature = "sync"))]
    pub fn subscribe_guard(&mut self) -> Option<ChannelGuard> {
        None
    }

    /// Get (lazily creating) the channel guard used to pace page delivery.
    #[cfg(feature = "sync")]
    pub fn subscribe_guard(&mut self) -> Option<ChannelGuard> {
        let channel_guard = self.channel_guard.get_or_insert_with(ChannelGuard::new);
        Some(channel_guard.clone())
    }
9826
    /// Start the configured cron runner with this website as its job.
    /// The website is cloned into the runner, so later mutations to `self`
    /// do not affect scheduled runs.
    #[cfg(feature = "cron")]
    pub async fn run_cron(&self) -> Runner {
        async_job::Runner::new()
            .add(Box::new(self.clone()))
            .run()
            .await
    }

    /// Crawl-id stub when the `control` feature is disabled: always `None`.
    #[cfg(not(feature = "control"))]
    pub fn get_crawl_id(&self) -> Option<&Box<String>> {
        None
    }

    /// The crawl id, or `None` when unset (empty).
    #[cfg(feature = "control")]
    pub fn get_crawl_id(&self) -> Option<&Box<String>> {
        if self.crawl_id.is_empty() {
            None
        } else {
            Some(&self.crawl_id)
        }
    }
9851
    /// Attach arbitrary extra information to the website.
    #[cfg(feature = "extra_information")]
    pub fn set_extra_info(&mut self, info: Option<String>) {
        self.extra_info = info.map(|f| f.into());
    }

    /// The attached extra information, if any.
    #[cfg(feature = "extra_information")]
    pub fn get_extra_info(&self) -> Option<&Box<String>> {
        self.extra_info.as_ref()
    }

    /// Seed the crawl with pre-fetched HTML instead of fetching the start URL.
    pub fn set_seeded_html(&mut self, html: Option<String>) {
        self.seed_html = html;
    }

    /// The seeded HTML, if any was set.
    pub fn get_seeded_html(&self) -> &Option<String> {
        &self.seed_html
    }
9873
    /// Apply a prompt-derived configuration onto this website, overriding only
    /// the options that are `Some` in `config`; everything else is untouched.
    ///
    /// Returns `&mut Self` so calls can be chained.
    #[cfg(feature = "serde")]
    pub fn apply_prompt_configuration(
        &mut self,
        config: &crate::features::automation::PromptConfiguration,
    ) -> &mut Self {
        // Crawl-scope toggles.
        if let Some(v) = config.respect_robots_txt {
            self.configuration.respect_robots_txt = v;
        }
        if let Some(v) = config.subdomains {
            self.configuration.subdomains = v;
        }
        if let Some(v) = config.tld {
            self.configuration.tld = v;
        }
        if let Some(v) = config.depth {
            self.configuration.depth = v;
        }
        if let Some(v) = config.delay {
            self.configuration.delay = v;
        }
        // Timeouts arrive as milliseconds.
        if let Some(ms) = config.request_timeout_ms {
            self.configuration.request_timeout =
                Some(Box::new(std::time::Duration::from_millis(ms)));
        }
        if let Some(ms) = config.crawl_timeout_ms {
            self.configuration.crawl_timeout = Some(std::time::Duration::from_millis(ms));
        }

        // URL deny/allow lists.
        if let Some(ref urls) = config.blacklist_url {
            self.configuration.blacklist_url =
                Some(urls.iter().map(|s| s.as_str().into()).collect());
        }
        if let Some(ref urls) = config.whitelist_url {
            self.configuration.whitelist_url =
                Some(urls.iter().map(|s| s.as_str().into()).collect());
        }
        // External domains are tracked case-insensitively.
        if let Some(ref domains) = config.external_domains {
            for domain in domains {
                self.configuration
                    .external_domains_caseless
                    .insert(case_insensitive_string::CaseInsensitiveString::new(domain));
            }
        }

        // HTTP client options.
        if let Some(ref ua) = config.user_agent {
            self.configuration.user_agent = Some(Box::new(ua.as_str().into()));
        }
        if let Some(v) = config.http2_prior_knowledge {
            self.configuration.http2_prior_knowledge = v;
        }
        if let Some(v) = config.accept_invalid_certs {
            self.configuration.accept_invalid_certs = v;
        }

        if let Some(v) = config.redirect_limit {
            self.configuration.redirect_limit = Box::new(v);
        }
        // Per-path crawl budget, keyed case-insensitively.
        if let Some(ref budget_map) = config.budget {
            let mut budget = hashbrown::HashMap::new();
            for (k, v) in budget_map {
                budget.insert(case_insensitive_string::CaseInsensitiveString::new(k), *v);
            }
            self.configuration.budget = Some(budget);
        }
        if let Some(v) = config.max_page_bytes {
            self.configuration.max_page_bytes = Some(v);
        }

        // Page-handling behavior.
        if let Some(v) = config.full_resources {
            self.configuration.full_resources = v;
        }
        if let Some(v) = config.only_html {
            self.configuration.only_html = v;
        }
        if let Some(v) = config.return_page_links {
            self.configuration.return_page_links = v;
        }

        // NOTE(review): intentionally empty today — `use_chrome` maps to no
        // configuration field here; confirm whether it should toggle anything.
        #[cfg(feature = "chrome")]
        if let Some(true) = config.use_chrome {
        }
        if let Some(ref mode) = config.stealth_mode {
            // Unrecognized values fall back to no stealth.
            self.configuration.stealth_mode = match mode.to_lowercase().as_str() {
                "basic" => spider_fingerprint::configs::Tier::Basic,
                "low" => spider_fingerprint::configs::Tier::Low,
                "mid" => spider_fingerprint::configs::Tier::Mid,
                "full" => spider_fingerprint::configs::Tier::Full,
                _ => spider_fingerprint::configs::Tier::None,
            };
        }
        // A partially-specified viewport is completed with 800x600 defaults.
        if config.viewport_width.is_some() || config.viewport_height.is_some() {
            let width = config.viewport_width.unwrap_or(800);
            let height = config.viewport_height.unwrap_or(600);
            self.configuration.viewport = Some(crate::configuration::Viewport::new(width, height));
        }
        // Chrome wait conditions: only persisted when at least one was set.
        #[cfg(feature = "chrome")]
        {
            let mut wait_for = self.configuration.wait_for.take().unwrap_or_default();

            if let Some(true) = config.wait_for_idle_network {
                wait_for.idle_network =
                    Some(crate::features::chrome_common::WaitForIdleNetwork::new(
                        Some(std::time::Duration::from_secs(30)),
                    ));
            }
            if let Some(ms) = config.wait_for_delay_ms {
                wait_for.delay = Some(crate::features::chrome_common::WaitForDelay::new(Some(
                    std::time::Duration::from_millis(ms),
                )));
            }
            if let Some(ref selector) = config.wait_for_selector {
                wait_for.selector = Some(crate::features::chrome_common::WaitForSelector::new(
                    Some(std::time::Duration::from_secs(30)),
                    selector.clone(),
                ));
            }

            if wait_for.idle_network.is_some()
                || wait_for.delay.is_some()
                || wait_for.selector.is_some()
            {
                self.configuration.wait_for = Some(wait_for);
            }
        }
        #[cfg(feature = "chrome")]
        if let Some(ref js) = config.evaluate_on_new_document {
            self.configuration.evaluate_on_new_document = Some(Box::new(js.clone()));
        }

        // Runtime behavior.
        if let Some(v) = config.shared_queue {
            self.configuration.shared_queue = v;
        }
        if let Some(v) = config.retry {
            self.configuration.retry = v;
        }

        self
    }
10040
10041 #[cfg(all(feature = "agent", feature = "serde"))]
10066 pub async fn configure_from_prompt(
10067 &mut self,
10068 api_url: &str,
10069 model_name: &str,
10070 api_key: Option<&str>,
10071 prompt: &str,
10072 ) -> Result<&mut Self, crate::features::automation::EngineError> {
10073 let config = crate::features::automation::configure_crawler_from_prompt(
10074 api_url, model_name, api_key, prompt,
10075 )
10076 .await?;
10077 Ok(self.apply_prompt_configuration(&config))
10078 }
10079}
10080
10081pub fn channel_send_page(
10083 channel: &Option<(
10084 tokio::sync::broadcast::Sender<Page>,
10085 std::sync::Arc<tokio::sync::broadcast::Receiver<Page>>,
10086 )>,
10087 page: Page,
10088 channel_guard: &Option<ChannelGuard>,
10089) {
10090 if let Some(c) = channel {
10091 if c.0.send(page).is_ok() {
10092 if let Some(guard) = channel_guard {
10093 ChannelGuard::inc_guard(&guard.0 .1)
10094 }
10095 }
10096 }
10097}
10098
/// Shared guard state for the page subscription channel:
/// `(active flag, sent counter, processed counter)`.
// NOTE(review): counter roles inferred from `lock`/`inc`/`inc_guard` usage —
// confirm against subscriber call sites.
#[derive(Debug, Clone)]
pub struct ChannelGuard(Arc<(AtomicBool, AtomicUsize, AtomicUsize)>);
10102
impl ChannelGuard {
    /// Create a new guard: active, with both counters at zero.
    #[cfg(feature = "sync")]
    pub(crate) fn new() -> ChannelGuard {
        ChannelGuard(Arc::new((
            AtomicBool::new(true),
            AtomicUsize::new(0),
            AtomicUsize::new(0),
        )))
    }
    /// If the guard is active, snapshot the sent counter (slot 1) and spin —
    /// yielding to the runtime each iteration — until the processed counter
    /// (slot 2) reaches that snapshot, atomically resetting it to zero.
    /// The trailing fence orders subsequent reads after the successful CAS.
    // NOTE(review): `compare_exchange_weak` may fail spuriously; that only
    // causes an extra yield/retry here, which is benign.
    pub(crate) async fn lock(&self) {
        if self.0 .0.load(Ordering::Relaxed) {
            let old = self.0 .1.load(Ordering::Relaxed);

            while self
                .0
                .2
                .compare_exchange_weak(old, 0, Ordering::Acquire, Ordering::Relaxed)
                .is_err()
            {
                tokio::task::yield_now().await;
            }
            std::sync::atomic::fence(Ordering::Acquire);
        }
    }

    /// Enable or disable the guard's active flag (slot 0).
    pub fn guard(&mut self, guard: bool) {
        self.0 .0.store(guard, Ordering::Release);
    }

    /// Increment the processed counter (slot 2).
    pub fn inc(&mut self) {
        self.0 .2.fetch_add(1, std::sync::atomic::Ordering::Release);
    }

    /// Increment a raw counter — used by `channel_send_page` for the sent
    /// counter (slot 1) after a successful broadcast.
    pub(crate) fn inc_guard(guard: &AtomicUsize) {
        guard.fetch_add(1, std::sync::atomic::Ordering::Release);
    }
}
10146
impl Drop for ChannelGuard {
    /// Deactivate the guard so future `lock` calls skip waiting.
    fn drop(&mut self) {
        self.0 .0.store(false, Ordering::Release);
    }
}
10152
/// Launch a cron runner that takes ownership of `website` and executes it on
/// its configured schedule.
#[cfg(feature = "cron")]
pub async fn run_cron(website: Website) -> Runner {
    let runner = async_job::Runner::new();
    runner.add(Box::new(website)).run().await
}
10158
#[cfg(feature = "cron")]
#[async_trait]
impl Job for Website {
    /// Parse the configured cron expression, logging and yielding `None` on
    /// failure.
    fn schedule(&self) -> Option<async_job::Schedule> {
        self.configuration
            .cron_str
            .parse()
            .map_err(|e| log::error!("{:?}", e))
            .ok()
    }

    /// Run one scheduled pass: crawl or scrape depending on the cron type.
    async fn handle(&mut self) {
        log::info!(
            "CRON: {} - cron job running {}",
            self.get_url().as_ref(),
            self.now()
        );
        match self.configuration.cron_type {
            CronType::Crawl => self.crawl().await,
            _ => self.scrape().await,
        }
    }
}
10184
10185impl std::fmt::Display for Website {
10186 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
10187 write!(
10188 f,
10189 "Website:\n URL: {}\n ID: {:?}\n Configuration: {:?}",
10190 self.get_url(),
10191 self.get_crawl_id(),
10192 self.configuration
10193 )
10194 }
10195}
10196
/// Marker impl so `Website` can flow through `Box<dyn std::error::Error>`
/// APIs; `Display`/`Debug` above supply the message.
impl std::error::Error for Website {}
10198
10199#[tokio::test]
10200#[cfg(not(feature = "decentralized"))]
10201async fn crawl() {
10202 let url = "https://choosealicense.com";
10203 let mut website: Website = Website::new(url);
10204 website.crawl().await;
10205 assert!(
10206 website
10207 .links_visited
10208 .contains(&"https://choosealicense.com/licenses/".into()),
10209 "{:?}",
10210 website.links_visited
10211 );
10212}
10213
/// Cron-driven crawling via the website's own runner should broadcast pages.
#[tokio::test]
#[cfg(feature = "cron")]
async fn crawl_cron() {
    let target = "https://choosealicense.com";
    let mut website: Website = Website::new(target)
        .with_cron("1/5 * * * * *", Default::default())
        .build()
        .unwrap();
    let mut receiver = website.subscribe(16).unwrap();

    // Collect every page announced over the subscription channel.
    let listener = tokio::spawn(async move {
        let mut seen = HashSet::new();
        while let Ok(page) = receiver.recv().await {
            seen.insert(CaseInsensitiveString::new(page.get_url()));
        }
        assert!(
            seen.contains(&CaseInsensitiveString::from(
                "https://choosealicense.com/licenses/"
            )),
            "{:?}",
            seen
        );
    });

    let mut runner = website.run_cron().await;
    log::debug!("Starting the Runner for 10 seconds");
    tokio::time::sleep(Duration::from_secs(10)).await;
    runner.stop().await;
    listener.abort();
    let _ = listener.await;
}
10247
/// Cron-driven crawling through the free `run_cron` helper, which takes
/// ownership of the website.
#[tokio::test]
#[cfg(feature = "cron")]
async fn crawl_cron_own() {
    let target = "https://choosealicense.com";
    let mut website: Website = Website::new(target)
        .with_cron("1/5 * * * * *", Default::default())
        .build()
        .unwrap();
    let mut receiver = website.subscribe(16).unwrap();

    // Collect every page announced over the subscription channel.
    let listener = tokio::spawn(async move {
        let mut seen = HashSet::new();
        while let Ok(page) = receiver.recv().await {
            seen.insert(CaseInsensitiveString::new(page.get_url()));
        }
        assert!(
            seen.contains(&CaseInsensitiveString::from(
                "https://choosealicense.com/licenses/"
            )),
            "{:?}",
            seen
        );
    });

    let mut runner = run_cron(website).await;
    log::debug!("Starting the Runner for 10 seconds");
    tokio::time::sleep(Duration::from_secs(10)).await;
    let _ = tokio::join!(runner.stop(), listener);
}
10279
10280#[tokio::test]
10281#[cfg(not(feature = "decentralized"))]
10282async fn scrape() {
10283 let mut website: Website = Website::new("https://choosealicense.com");
10284 website.scrape().await;
10285 assert!(
10286 website
10287 .links_visited
10288 .contains(&"https://choosealicense.com/licenses/".into()),
10289 "{:?}",
10290 website.links_visited
10291 );
10292
10293 assert!(!website.get_pages().unwrap()[0].get_html().is_empty());
10294}
10295
10296#[tokio::test]
10297#[cfg(not(feature = "decentralized"))]
10298async fn crawl_invalid() {
10299 let mut website: Website = Website::new("https://w.com");
10300 website.crawl().await;
10301 assert!(website.links_visited.len() <= 1); }
10303
/// Decentralized mode still records the root (and possibly the sitemap) for an
/// unreachable host.
#[tokio::test]
#[cfg(feature = "decentralized")]
async fn crawl_invalid() {
    let domain = "https://w.com";
    let mut website: Website = Website::new(domain);
    website.crawl().await;

    let visited = website.links_visited.get_links();
    let root = CaseInsensitiveString::from(format!("{}/", domain));
    assert!(visited.contains(&root), "{:?}", visited);

    #[cfg(feature = "sitemap")]
    {
        // With sitemaps enabled the sitemap URL may also have been attempted.
        let sitemap = CaseInsensitiveString::from(format!("{}/sitemap.xml", domain));
        assert!(visited.len() <= 2, "{:?}", visited);
        if visited.len() == 2 {
            assert!(visited.contains(&sitemap), "{:?}", visited);
        }
    }

    #[cfg(not(feature = "sitemap"))]
    {
        assert_eq!(visited.len(), 1, "{:?}", visited);
    }
}
10329
10330#[tokio::test]
10331async fn not_crawl_blacklist() {
10332 let mut website: Website = Website::new("https://choosealicense.com");
10333 website.configuration.blacklist_url = Some(Vec::from([CompactString::from(
10334 "https://choosealicense.com/licenses/",
10335 )]));
10336
10337 website.crawl().await;
10338 assert!(
10339 !website
10340 .links_visited
10341 .contains(&"https://choosealicense.com/licenses/".into()),
10342 "{:?}",
10343 website.links_visited
10344 );
10345}
10346
/// A regex blacklist matching the whole domain should block every page.
#[tokio::test]
#[cfg(feature = "regex")]
async fn not_crawl_blacklist_regex() {
    let mut website = Website::new("https://choosealicense.com");
    website.with_blacklist_url(Some(Vec::from(["choosealicense.com".into()])));
    website.crawl().await;

    assert_eq!(website.links_visited.len(), 0);
}
10355
/// The generated user agent must never be empty.
#[test]
#[cfg(feature = "ua_generator")]
fn randomize_website_agent() {
    let agent = get_ua(false);
    assert!(!agent.is_empty());
}
10361
10362#[tokio::test]
10363#[cfg(not(feature = "decentralized"))]
10364async fn test_respect_robots_txt() {
10365 let mut website: Website = Website::new("https://stackoverflow.com");
10366 website.configuration.respect_robots_txt = true;
10367 website.configuration.user_agent = Some(Box::new("*".into()));
10368
10369 let (client, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
10370 website.setup().await;
10371
10372 website.configure_robots_parser(&client).await;
10373
10374 assert_eq!(website.configuration.delay, 0);
10375
10376 assert!(!&website
10377 .is_allowed(&"https://stackoverflow.com/posts/".into())
10378 .eq(&ProcessLinkStatus::Allowed));
10379
10380 let mut website_second: Website = Website::new("https://www.mongodb.com");
10382 website_second.configuration.respect_robots_txt = true;
10383 website_second.configuration.user_agent = Some(Box::new("bingbot".into()));
10384
10385 let (client_second, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
10386 website_second.setup().await;
10387 website_second.configure_robots_parser(&client_second).await;
10388
10389 assert!(!&website
10390 .is_allowed(&"https://www.mongodb.com/community/forums/auth/".into())
10391 .eq(&ProcessLinkStatus::Allowed));
10392
10393 }
10395
10396#[tokio::test]
10397#[cfg(not(feature = "decentralized"))]
10398#[ignore]
10399async fn test_crawl_subdomains() {
10400 let mut website: Website = Website::new("https://choosealicense.com");
10401 website.configuration.subdomains = true;
10402 website.crawl().await;
10403 assert!(
10404 website
10405 .links_visited
10406 .contains(&"https://choosealicense.com/licenses/".into()),
10407 "{:?}",
10408 website.links_visited
10409 );
10410}
10411
10412#[tokio::test]
10413#[cfg(all(
10414 not(feature = "regex"),
10415 not(feature = "openai"),
10416 not(feature = "gemini")
10417))]
10418async fn test_with_configuration() {
10419 let mut website = Website::new("https://choosealicense.com");
10420
10421 website
10422 .with_respect_robots_txt(true)
10423 .with_subdomains(true)
10424 .with_tld(false)
10425 .with_delay(0)
10426 .with_request_timeout(None)
10427 .with_http2_prior_knowledge(false)
10428 .with_user_agent(Some(crate::page::TEST_AGENT_NAME))
10429 .with_headers(None)
10430 .with_proxies(None);
10431
10432 let mut configuration = Box::new(configuration::Configuration::new());
10433
10434 configuration.respect_robots_txt = true;
10435 configuration.subdomains = true;
10436 configuration.tld = false;
10437 configuration.delay = 0;
10438 configuration.request_timeout = None;
10439 configuration.http2_prior_knowledge = false;
10440 configuration.user_agent = Some(Box::new(CompactString::new(crate::page::TEST_AGENT_NAME)));
10441 configuration.headers = None;
10442 configuration.proxies = None;
10443
10444 assert!(
10445 website.configuration == configuration,
10446 "Left\n{:?}\n\nRight\n{:?}",
10447 website.configuration,
10448 configuration
10449 );
10450}
10451
/// Glob URL expansion should reach the licenses index over http or https.
#[tokio::test]
#[cfg(all(feature = "glob", not(feature = "decentralized")))]
async fn test_crawl_glob() {
    let mut website =
        Website::new("https://choosealicense.com/licenses/{mit,apache-2.0,mpl-2.0}/");
    website.crawl().await;

    let visited = &website.links_visited;
    let https_hit = visited.contains(&"https://choosealicense.com/licenses/".into());
    let http_hit = visited.contains(&"http://choosealicense.com/licenses/".into());
    assert!(https_hit || http_hit, "{:?}", visited);
}
10471
10472#[tokio::test]
10473#[ignore]
10474#[cfg(not(feature = "decentralized"))]
10475async fn test_crawl_tld() {
10476 let mut website: Website = Website::new("https://choosealicense.com");
10477 website.configuration.tld = true;
10478 website.with_limit(10);
10479 website.crawl().await;
10480
10481 assert!(
10482 website.links_visited.len() > 1,
10483 "expected more than 1 link visited with tld enabled, got {:?}",
10484 website.links_visited
10485 );
10486}
10487
/// Every visited link should be broadcast exactly once to subscribers.
#[tokio::test]
#[cfg(all(feature = "sync", not(feature = "decentralized")))]
async fn test_crawl_subscription() {
    let mut website = Website::new("https://choosealicense.com");
    let mut receiver = website.subscribe(100).unwrap();

    // Count every page announced over the channel until it closes.
    let counter = tokio::spawn(async move {
        let mut received = 0;
        while receiver.recv().await.is_ok() {
            received += 1;
        }
        received
    });

    website.crawl().await;
    website.unsubscribe();
    let visited = website.get_links().len();
    let received = counter.await.unwrap();

    assert!(received == visited, "{:?}", true);
}
10511
/// Crawling through a local socks5 proxy should still reach a licenses page.
#[tokio::test]
#[cfg(all(feature = "socks", not(feature = "decentralized")))]
async fn test_crawl_proxy() {
    let mut website = Website::new("https://choosealicense.com");
    website
        .configuration
        .proxies
        .get_or_insert(Default::default())
        .push("socks5://127.0.0.1:1080".into());

    website.crawl().await;

    // Look for any licenses page among the visited links.
    let mut license_found = false;
    for link in website.get_links() {
        if link.as_ref().contains("/licenses/") {
            license_found = true;
            break;
        }
    }

    assert!(license_found, "{:?}", website.links_visited);
}
10536
10537#[tokio::test]
10538async fn test_link_duplicates() {
10539 fn has_unique_elements<T>(iter: T) -> bool
10540 where
10541 T: IntoIterator,
10542 T::Item: Eq + std::hash::Hash,
10543 {
10544 let mut uniq = HashSet::new();
10545 iter.into_iter().all(move |x| uniq.insert(x))
10546 }
10547
10548 let mut website: Website = Website::new("http://0.0.0.0:8000");
10549 website.crawl().await;
10550
10551 assert!(has_unique_elements(website.links_visited.get_links()));
10552}
10553
10554#[tokio::test]
10555async fn test_crawl_budget() {
10556 let mut website: Website = Website::new("https://choosealicense.com");
10557 website.with_budget(Some(HashMap::from([("*", 1), ("/licenses", 1)])));
10558 website.crawl().await;
10559
10560 assert!(website.links_visited.len() <= 1);
10561}
10562
/// Pausing for five seconds must delay the crawl without losing pages.
#[tokio::test]
#[cfg(feature = "control")]
#[ignore]
async fn test_crawl_pause_resume() {
    use crate::utils::{pause, resume};

    let domain = "https://choosealicense.com/";
    let mut website: Website = Website::new(domain);

    let start = tokio::time::Instant::now();

    // Pause immediately, then resume after five seconds.
    tokio::spawn(async move {
        pause(domain).await;
        tokio::time::sleep(Duration::from_millis(5000)).await;
        resume(domain).await;
    });

    website.crawl().await;

    let elapsed = start.elapsed();
    assert!(elapsed.as_secs() >= 5, "{:?}", elapsed);

    let visited = &website.links_visited;
    assert!(
        visited.contains(&"https://choosealicense.com/licenses/".into()),
        "{:?}",
        visited
    );
}
10595
/// An immediate shutdown request should stop the crawl almost at once.
#[cfg(feature = "control")]
#[ignore]
#[tokio::test]
async fn test_crawl_shutdown() {
    use crate::utils::shutdown;

    let domain = "https://spider.cloud/";
    let mut website: Website = Website::new(domain);

    // Request shutdown right away so the crawl ends early.
    tokio::spawn(async move {
        shutdown(domain).await;
    });

    website.crawl().await;

    let visited_count = website.links_visited.len();
    assert!(visited_count <= 1, "{:?}", visited_count);
}
10615
/// A second crawl of a cached site should be at least 5x faster than the first.
#[tokio::test]
#[cfg(all(feature = "cache_request", not(feature = "decentralized")))]
async fn test_cache() {
    let domain = "https://choosealicense.com/";
    let mut website: Website = Website::new(&domain);
    website.configuration.cache = true;

    // First pass hits the network and fills the cache.
    let fresh_start = tokio::time::Instant::now();
    website.crawl().await;
    let fresh_duration = fresh_start.elapsed();

    // Second pass should be served from the cache.
    let cached_start = tokio::time::Instant::now();
    website.crawl().await;
    let cached_duration = cached_start.elapsed();

    assert!(
        fresh_duration.as_millis() > cached_duration.as_millis() * 5,
        "{:?}",
        cached_duration
    );
}
10638
/// Smart crawl with `cache_skip_browser` should serve a pre-seeded cache entry
/// without touching the (unreachable) network target.
#[tokio::test]
#[cfg(all(
    not(feature = "decentralized"),
    feature = "smart",
    feature = "cache_chrome_hybrid"
))]
async fn test_crawl_smart_uses_seeded_cache_with_skip_browser() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/cache-smart-test";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed the hybrid cache with a cacheable HTML response for the target.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><head><title>Cached Smart Test</title></head><body>cached</body></html>"
            .to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid cache test url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    let start = tokio::time::Instant::now();
    website.crawl_smart().await;
    let elapsed = start.elapsed();

    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
    assert!(!website.initial_page_should_retry);
    assert!(
        website.links_visited.contains(&target_url.into()),
        "expected smart crawl to visit the cached target"
    );

    eprintln!("crawl_smart cached latency: {}ms", elapsed.as_millis());
}
10692
/// A single-page crawl with a seeded cache entry should short-circuit: the
/// cached HTML is delivered quickly via the subscription channel.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_shortcircuit_single_page() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/shortcircuit-test";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed a cacheable HTML page for the target.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><head><title>Shortcircuit</title></head><body><h1>Cached!</h1></body></html>"
            .to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    let mut receiver = website.subscribe(4).unwrap();
    let first_page = tokio::spawn(async move { receiver.recv().await.ok() });

    let start = tokio::time::Instant::now();
    website.crawl().await;
    let elapsed = start.elapsed();

    let page = first_page.await.unwrap().expect("page received via channel");
    assert!(
        page.get_html().contains("Cached!"),
        "expected cached HTML content"
    );
    assert_eq!(page.status_code, StatusCode::OK);
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
    assert!(
        elapsed.as_millis() < 2000,
        "shortcircuit too slow: {elapsed:?}"
    );
    eprintln!(
        "shortcircuit single_page latency: {}ms",
        elapsed.as_millis()
    );
}
10754
/// With nothing seeded for the URL, the crawl must fall through to the
/// (unreachable) network without panicking.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_shortcircuit_miss_falls_through() {
    let mut website = Website::new("http://localhost:9/uncached-shortcircuit");
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    let single_page = HashMap::from([("*", 1)]);
    website.with_budget(Some(single_page));

    website.crawl_raw().await;
}
10766
/// With `skip_browser` disabled the cache-skip option must report false even
/// when a matching entry is seeded.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_shortcircuit_not_without_skip_browser() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/no-skip-shortcircuit";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed a cacheable response even though skip_browser stays disabled.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><body>No Skip</body></html>".to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(false);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    website.configuration.configure_budget();
    assert!(
        !crate::utils::cache_skip_browser(&website.configuration.get_cache_options()),
        "cache_skip_browser should be false when skip_browser is disabled"
    );

    website.crawl_raw().await;
}
10811
/// A multi-page budget must not take the single-page short-circuit path even
/// when the root is cached.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_shortcircuit_not_for_multi_page() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/multi-page-shortcircuit";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed a cacheable response for the root page.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><body>Multi Page</body></html>".to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 5)])));

    website.crawl_raw().await;
}
10850
/// `crawl_smart` should also short-circuit to a seeded cache entry and deliver
/// the cached HTML quickly over the subscription channel.
#[tokio::test]
#[cfg(all(
    not(feature = "decentralized"),
    feature = "smart",
    feature = "cache_chrome_hybrid"
))]
async fn test_cache_shortcircuit_crawl_smart() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/smart-shortcircuit-test";
    let cache_key = create_cache_key_raw(target_url, None, None);

    // Seed the hybrid cache with a cacheable HTML response.
    let mut headers = StdHashMap::new();
    headers.insert("content-type".to_string(), "text/html".to_string());
    headers.insert(
        "cache-control".to_string(),
        "public, max-age=3600".to_string(),
    );

    let seeded = HttpResponse {
        body: b"<html><head><title>Smart Shortcircuit</title></head><body>Smart Cached</body></html>"
            .to_vec(),
        headers,
        status: 200,
        url: Url::parse(target_url).expect("valid url"),
        version: HttpVersion::Http11,
    };

    put_hybrid_cache(&cache_key, seeded, "GET", StdHashMap::new()).await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    let mut receiver = website.subscribe(4).unwrap();
    let first_page = tokio::spawn(async move { receiver.recv().await.ok() });

    let start = tokio::time::Instant::now();
    website.crawl_smart().await;
    let elapsed = start.elapsed();

    let page = first_page.await.unwrap().expect("page received");
    assert!(
        page.get_html().contains("Smart Cached"),
        "expected cached HTML in crawl_smart"
    );
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
    assert!(
        elapsed.as_millis() < 2000,
        "crawl_smart shortcircuit too slow: {elapsed:?}"
    );
    eprintln!(
        "crawl_smart shortcircuit latency: {}ms",
        elapsed.as_millis()
    );
}
10914
#[cfg(test)]
mod tests {

    /// The rotator hands out clients round-robin and keeps a running index.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_client_rotator_round_robin() {
        let clients: Vec<crate::Client> = (0..3)
            .map(|_| {
                #[cfg(not(feature = "cache_request"))]
                {
                    unsafe { crate::ClientBuilder::new().build().unwrap_unchecked() }
                }
                #[cfg(feature = "cache_request")]
                {
                    reqwest_middleware::ClientBuilder::new(unsafe {
                        reqwest::ClientBuilder::new().build().unwrap_unchecked()
                    })
                    .build()
                }
            })
            .collect();

        let rotator = crate::website::ClientRotator::new(clients);
        assert_eq!(rotator.len(), 3);
        assert!(!rotator.is_empty());

        // Four pulls from a three-client pool advance the index to four.
        for _ in 0..4 {
            let _ = rotator.next();
        }
        let current_idx = rotator.index.load(crate::website::Ordering::Relaxed);
        assert_eq!(current_idx, 4);
    }

    /// Three proxies are enough to build a rotating client set.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_with_multiple_proxies() {
        let mut website = crate::website::Website::new("http://example.com");
        let proxies = vec![
            "http://proxy1.example.com:8080".to_string(),
            "http://proxy2.example.com:8080".to_string(),
            "http://proxy3.example.com:8080".to_string(),
        ];
        website.configuration.with_proxies(Some(proxies));

        let rotator = website.build_rotated_clients();
        assert!(rotator.is_some(), "Should build rotator with 3 proxies");
        assert_eq!(rotator.unwrap().len(), 3);
    }

    /// A single proxy does not warrant rotation.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_single_proxy_returns_none() {
        let mut website = crate::website::Website::new("http://example.com");
        let proxies = vec!["http://proxy1.example.com:8080".to_string()];
        website.configuration.with_proxies(Some(proxies));

        assert!(
            website.build_rotated_clients().is_none(),
            "Should not build rotator with only 1 proxy"
        );
    }

    /// No proxies configured means no rotator at all.
    #[cfg(not(feature = "decentralized"))]
    #[test]
    fn test_build_rotated_clients_no_proxies_returns_none() {
        let website = crate::website::Website::new("http://example.com");
        assert!(
            website.build_rotated_clients().is_none(),
            "Should not build rotator with no proxies"
        );
    }
}
10996
/// All three pages (root plus two children) are pre-seeded into the hybrid
/// cache; a raw crawl with the browser skip enabled should serve every page
/// from cache and record the root's status code and HTML length.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_multi_page_all_cached() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-root";
    let sub1_url = "http://localhost:9/cache-phase-sub1";
    let sub2_url = "http://localhost:9/cache-phase-sub2";

    // Root links to both children so the crawl discovers them.
    let root_html = format!(
        "<html><head><title>Root</title></head><body>\
         <a href=\"{}\">Sub1</a>\
         <a href=\"{}\">Sub2</a>\
         </body></html>",
        sub1_url, sub2_url
    );
    let sub1_html =
        "<html><head><title>Sub1</title></head><body><h1>Sub1 Content</h1></body></html>";
    let sub2_html =
        "<html><head><title>Sub2</title></head><body><h1>Sub2 Content</h1></body></html>";

    let request_headers = StdHashMap::new();
    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed every page into the hybrid cache before crawling.
    for (url, html) in [
        (root_url, root_html.as_str()),
        (sub1_url, sub1_html),
        (sub2_url, sub2_html),
    ] {
        put_hybrid_cache(
            &create_cache_key_raw(url, None, None),
            HttpResponse {
                body: html.as_bytes().to_vec(),
                headers: response_headers.clone(),
                status: 200,
                url: Url::parse(url).expect("valid url"),
                version: HttpVersion::Http11,
            },
            "GET",
            request_headers.clone(),
        )
        .await;
    }

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 10)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    // Drain everything that was broadcast during the crawl.
    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    for (url, msg) in [
        (root_url, "root page should be served from cache"),
        (sub1_url, "sub1 page should be served from cache"),
        (sub2_url, "sub2 page should be served from cache"),
    ] {
        assert!(received.contains(&url.to_string()), "{}", msg);
    }
    assert_eq!(received.len(), 3, "exactly 3 pages expected");
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
}
11080
/// Only the root is cached; its child link points at an uncached URL. The
/// crawl should still serve the cached root and record its status code.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_partial_miss() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-partial-root";
    let sub_url = "http://localhost:9/cache-phase-partial-sub";

    let root_html = format!(
        "<html><head><title>Root</title></head><body>\
         <a href=\"{}\">Sub</a></body></html>",
        sub_url
    );

    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed only the root page; the child is a deliberate cache miss.
    put_hybrid_cache(
        &create_cache_key_raw(root_url, None, None),
        HttpResponse {
            body: root_html.as_bytes().to_vec(),
            headers: response_headers,
            status: 200,
            url: Url::parse(root_url).expect("valid url"),
            version: HttpVersion::Http11,
        },
        "GET",
        StdHashMap::new(),
    )
    .await;

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 10)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    assert!(
        received.contains(&root_url.to_string()),
        "root page should be served from cache"
    );
    assert_eq!(website.initial_status_code, StatusCode::OK);
}
11144
/// When `cache_skip_browser` is explicitly left off, the cache phase must not
/// engage even though a fresh entry exists for the URL; the crawl should still
/// complete without panicking (smoke check — no post-crawl assertions).
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_skipped_without_skip_browser() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-no-skip";

    let request_headers = StdHashMap::new();
    let response_headers = {
        let mut h = StdHashMap::new();
        h.insert("content-type".to_string(), "text/html".to_string());
        h.insert(
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        );
        h
    };

    // Seed the hybrid cache so a hit would be possible if the cache phase ran.
    let cache_key = create_cache_key_raw(root_url, None, None);
    let http_response = HttpResponse {
        body: b"<html><body>Cached</body></html>".to_vec(),
        headers: response_headers,
        status: 200,
        url: Url::parse(root_url).expect("valid url"),
        version: HttpVersion::Http11,
    };
    put_hybrid_cache(&cache_key, http_response, "GET", request_headers).await;

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    // NOTE: originally these two calls were jammed on one line; split per rustfmt.
    website.with_cache_skip_browser(false);
    website.with_budget(Some(HashMap::from([("*", 5)])));

    website.configuration.configure_budget();
    assert!(
        !crate::utils::cache_skip_browser(&website.configuration.get_cache_options()),
        "cache_skip_browser should be false"
    );

    website.crawl_raw().await;
}
11189
/// With a wildcard budget of 2 and three cached pages, the crawl must stop at
/// the budget while always including the root page.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_respects_budget() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-budget";
    let sub1_url = "http://localhost:9/cache-phase-budget-s1";
    let sub2_url = "http://localhost:9/cache-phase-budget-s2";

    let root_html = format!(
        "<html><body><a href=\"{}\">S1</a><a href=\"{}\">S2</a></body></html>",
        sub1_url, sub2_url
    );
    let sub_html = "<html><body>Sub</body></html>";

    let request_headers = StdHashMap::new();
    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed the root and both children into the hybrid cache.
    for (url, html) in [
        (root_url, root_html.as_str()),
        (sub1_url, sub_html),
        (sub2_url, sub_html),
    ] {
        put_hybrid_cache(
            &create_cache_key_raw(url, None, None),
            HttpResponse {
                body: html.as_bytes().to_vec(),
                headers: response_headers.clone(),
                status: 200,
                url: Url::parse(url).expect("valid url"),
                version: HttpVersion::Http11,
            },
            "GET",
            request_headers.clone(),
        )
        .await;
    }

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 2)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    assert!(
        received.len() <= 2,
        "budget should limit pages to at most 2, got {}",
        received.len()
    );
    assert!(
        received.contains(&root_url.to_string()),
        "root page should always be served"
    );
}
11258
/// Nothing is seeded in the cache for this URL: the initial lookup must miss
/// and the crawl must fall through without panicking (smoke check).
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_initial_miss_falls_through() {
    let mut site = Website::new("http://localhost:9/cache-phase-miss-initial");
    site.configuration.cache = true;
    site.with_cache_skip_browser(true);
    site.with_budget(Some(HashMap::from([("*", 5)])));
    site.crawl_raw().await;
}
11274
/// Two URLs share byte-identical HTML; with `normalize` enabled the content
/// signature dedup should keep duplicates from inflating the page count.
#[tokio::test]
#[cfg(all(not(feature = "decentralized"), feature = "cache_chrome_hybrid"))]
async fn test_cache_phase_dedup_signatures() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-dedup";
    let dup_url = "http://localhost:9/cache-phase-dedup-dup";

    let html = "<html><body><a href=\"http://localhost:9/cache-phase-dedup-dup\">Link</a><p>Same Content</p></body></html>";

    let request_headers = StdHashMap::new();
    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed both URLs with the exact same body.
    for url in [root_url, dup_url] {
        put_hybrid_cache(
            &create_cache_key_raw(url, None, None),
            HttpResponse {
                body: html.as_bytes().to_vec(),
                headers: response_headers.clone(),
                status: 200,
                url: Url::parse(url).expect("valid url"),
                version: HttpVersion::Http11,
            },
            "GET",
            request_headers.clone(),
        )
        .await;
    }

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.configuration.normalize = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 10)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    assert!(
        received.contains(&root_url.to_string()),
        "root page should be served"
    );
    assert!(
        received.len() <= 2,
        "signature dedup should limit duplicate content"
    );
}
11336
/// A single cached page should be delivered over the subscription channel and
/// the crawl should finish fast (the cache short-circuits the fetch path).
#[tokio::test]
#[cfg(all(
    not(feature = "decentralized"),
    any(feature = "cache_chrome_hybrid", feature = "cache_chrome_hybrid_mem")
))]
async fn test_cache_shortcircuit_single_page_mem() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let target_url = "http://localhost:9/shortcircuit-mem-test";

    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed the single target page into the hybrid cache.
    put_hybrid_cache(
        &create_cache_key_raw(target_url, None, None),
        HttpResponse {
            body: b"<html><head><title>MemCached</title></head><body><h1>In-Memory Cached!</h1></body></html>"
                .to_vec(),
            headers: response_headers,
            status: 200,
            url: Url::parse(target_url).expect("valid url"),
            version: HttpVersion::Http11,
        },
        "GET",
        StdHashMap::new(),
    )
    .await;

    let mut website = Website::new(target_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 1)])));

    // Collect the page on a separate task while the crawl runs.
    let mut rx = website.subscribe(4).unwrap();
    let handle = tokio::spawn(async move { rx.recv().await.ok() });

    let start = tokio::time::Instant::now();
    website.crawl().await;
    let elapsed = start.elapsed();

    let page = handle.await.unwrap().expect("page received via channel");
    assert!(
        page.get_html().contains("In-Memory Cached!"),
        "expected cached HTML content from mem cache"
    );
    assert_eq!(page.status_code, StatusCode::OK);
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
    assert!(
        elapsed.as_millis() < 2000,
        "shortcircuit too slow: {elapsed:?}"
    );
    eprintln!(
        "shortcircuit single_page (mem) latency: {}ms",
        elapsed.as_millis()
    );
}
11402
/// Mem-cache variant of the multi-page test: all three pages are pre-seeded
/// and a raw crawl should serve everything from cache.
#[tokio::test]
#[cfg(all(
    not(feature = "decentralized"),
    any(feature = "cache_chrome_hybrid", feature = "cache_chrome_hybrid_mem")
))]
async fn test_cache_phase_multi_page_all_cached_mem() {
    use crate::utils::{create_cache_key_raw, put_hybrid_cache, HttpResponse, HttpVersion};
    use std::collections::HashMap as StdHashMap;

    let root_url = "http://localhost:9/cache-phase-mem-root";
    let sub1_url = "http://localhost:9/cache-phase-mem-sub1";
    let sub2_url = "http://localhost:9/cache-phase-mem-sub2";

    let root_html = format!(
        "<html><head><title>Root</title></head><body>\
         <a href=\"{}\">Sub1</a><a href=\"{}\">Sub2</a></body></html>",
        sub1_url, sub2_url
    );
    let sub1_html =
        "<html><head><title>Sub1</title></head><body><h1>Sub1 Mem Content</h1></body></html>";
    let sub2_html =
        "<html><head><title>Sub2</title></head><body><h1>Sub2 Mem Content</h1></body></html>";

    let request_headers = StdHashMap::new();
    let response_headers = StdHashMap::from([
        ("content-type".to_string(), "text/html".to_string()),
        (
            "cache-control".to_string(),
            "public, max-age=3600".to_string(),
        ),
    ]);

    // Seed all pages before crawling.
    for (url, html) in [
        (root_url, root_html.as_str()),
        (sub1_url, sub1_html),
        (sub2_url, sub2_html),
    ] {
        put_hybrid_cache(
            &create_cache_key_raw(url, None, None),
            HttpResponse {
                body: html.as_bytes().to_vec(),
                headers: response_headers.clone(),
                status: 200,
                url: Url::parse(url).expect("valid url"),
                version: HttpVersion::Http11,
            },
            "GET",
            request_headers.clone(),
        )
        .await;
    }

    let mut website = Website::new(root_url);
    website.configuration.cache = true;
    website.with_cache_skip_browser(true);
    website.with_budget(Some(HashMap::from([("*", 10)])));

    let mut rx = website.subscribe(16).unwrap();
    website.crawl_raw().await;

    let received: Vec<String> = std::iter::from_fn(|| rx.try_recv().ok())
        .map(|page| page.get_url().to_string())
        .collect();

    assert!(
        received.contains(&root_url.to_string()),
        "root page should be served from mem cache"
    );
    assert_eq!(received.len(), 3, "exactly 3 pages expected from mem cache");
    assert_eq!(website.initial_status_code, StatusCode::OK);
    assert!(website.initial_html_length > 0);
}