1use crate::compact_str::CompactString;
2use crate::features::chrome_common::RequestInterceptConfiguration;
3pub use crate::features::chrome_common::{
4 AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts, AutomationScriptsMap,
5 CaptureScreenshotFormat, CaptureScreenshotParams, ClipViewport, ExecutionScripts,
6 ExecutionScriptsMap, ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay,
7 WaitForIdleNetwork, WaitForSelector, WebAutomation,
8};
9pub use crate::features::gemini_common::GeminiConfigs;
10pub use crate::features::openai_common::GPTConfigs;
11#[cfg(feature = "search")]
12pub use crate::features::search::{
13 SearchError, SearchOptions, SearchResult, SearchResults, TimeRange,
14};
15pub use crate::features::webdriver_common::{WebDriverBrowser, WebDriverConfig};
16use crate::utils::get_domain_from_url;
17use crate::utils::BasicCachePolicy;
18use crate::website::CronType;
19use reqwest::header::{AsHeaderName, HeaderMap, HeaderName, HeaderValue, IntoHeaderName};
20use std::net::IpAddr;
21use std::sync::Arc;
22use std::time::Duration;
23
24#[cfg(feature = "chrome")]
25pub use spider_fingerprint::Fingerprint;
26
/// Policy selecting how redirects are treated.
///
/// `Loose` is the default variant. With the `serde` feature enabled, every
/// variant can also be deserialized from its capitalized, lowercase or
/// uppercase spelling.
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum RedirectPolicy {
    /// The loose policy (default).
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Loose", alias = "loose", alias = "LOOSE")
    )]
    #[default]
    Loose,
    /// The strict policy.
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Strict", alias = "strict", alias = "STRICT")
    )]
    Strict,
    /// The "none" policy.
    #[cfg_attr(
        feature = "serde",
        serde(alias = "None", alias = "none", alias = "NONE")
    )]
    None,
}
51
52#[cfg(not(feature = "regex"))]
53pub type AllowList = Vec<CompactString>;
55
56#[cfg(feature = "regex")]
57pub type AllowList = Box<regex::RegexSet>;
59
60#[derive(Debug, Default, Clone)]
62#[cfg_attr(not(feature = "regex"), derive(PartialEq, Eq))]
63pub struct AllowListSet(pub AllowList);
64
/// Flags selecting which Chrome events are tracked.
#[cfg(feature = "chrome")]
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ChromeEventTracker {
    /// Track responses.
    pub responses: bool,
    /// Track requests.
    pub requests: bool,
    /// Track automation events.
    pub automation: bool,
}
77
#[cfg(feature = "chrome")]
impl ChromeEventTracker {
    /// Builds a tracker from the request/response flags.
    ///
    /// Note the argument order (`requests` first, then `responses`).
    /// `automation` is always set to `true` by this constructor.
    pub fn new(requests: bool, responses: bool) -> Self {
        Self {
            automation: true,
            responses,
            requests,
        }
    }
}
89
/// Record of which sitemap entries were pushed onto the whitelist, so they
/// can be removed again afterwards.
#[cfg(feature = "sitemap")]
#[derive(Debug, Default)]
pub struct SitemapWhitelistChanges {
    /// The default `sitemap.xml` entry was added.
    pub added_default: bool,
    /// The custom sitemap entry was added.
    pub added_custom: bool,
}
99
#[cfg(feature = "sitemap")]
impl SitemapWhitelistChanges {
    /// Returns `true` when at least one entry was added.
    pub(crate) fn modified(&self) -> bool {
        if self.added_default {
            return true;
        }
        self.added_custom
    }
}
107
/// Selects which client, if any, should ignore a proxy entry.
///
/// The default is [`ProxyIgnore::No`].
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProxyIgnore {
    /// Ignore for Chrome.
    Chrome,
    /// Ignore for HTTP.
    Http,
    /// Do not ignore (default).
    #[default]
    No,
}
120
121#[derive(Debug, Default, Clone, PartialEq)]
123#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
124pub struct RequestProxy {
125 pub addr: String,
127 pub ignore: ProxyIgnore,
129}
130
131#[derive(Debug, Default, Clone)]
141#[cfg_attr(
142 all(
143 not(feature = "regex"),
144 not(feature = "openai"),
145 not(feature = "cache_openai"),
146 not(feature = "gemini"),
147 not(feature = "cache_gemini")
148 ),
149 derive(PartialEq)
150)]
151#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
152#[cfg_attr(feature = "serde", serde(default))]
153pub struct Configuration {
154 pub respect_robots_txt: bool,
156 pub subdomains: bool,
158 pub tld: bool,
160 pub crawl_timeout: Option<Duration>,
162 pub preserve_host_header: bool,
164 pub blacklist_url: Option<Vec<CompactString>>,
166 pub whitelist_url: Option<Vec<CompactString>>,
168 pub user_agent: Option<Box<CompactString>>,
170 pub delay: u64,
172 pub request_timeout: Option<Duration>,
174 pub http2_prior_knowledge: bool,
176 pub proxies: Option<Vec<RequestProxy>>,
178 pub headers: Option<Box<SerializableHeaderMap>>,
180 #[cfg(feature = "sitemap")]
181 pub sitemap_url: Option<Box<CompactString>>,
183 #[cfg(feature = "sitemap")]
184 pub ignore_sitemap: bool,
186 pub redirect_limit: usize,
188 pub redirect_policy: RedirectPolicy,
190 #[cfg(feature = "cookies")]
191 pub cookie_str: String,
193 #[cfg(feature = "wreq")]
194 pub emulation: Option<wreq_util::Emulation>,
196 #[cfg(feature = "cron")]
197 pub cron_str: String,
199 #[cfg(feature = "cron")]
200 pub cron_type: CronType,
202 pub depth: usize,
204 pub depth_distance: usize,
206 pub stealth_mode: spider_fingerprint::configs::Tier,
208 pub viewport: Option<Viewport>,
210 pub budget: Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
212 pub wild_card_budgeting: bool,
214 pub external_domains_caseless:
216 Arc<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>,
217 pub full_resources: bool,
219 pub accept_invalid_certs: bool,
221 pub auth_challenge_response: Option<AuthChallengeResponse>,
223 pub openai_config: Option<Box<GPTConfigs>>,
225 pub gemini_config: Option<Box<GeminiConfigs>>,
227 pub remote_multimodal: Option<Box<crate::features::automation::RemoteMultimodalConfigs>>,
230 pub shared_queue: bool,
232 pub return_page_links: bool,
234 pub retry: u8,
236 pub no_control_thread: bool,
238 blacklist: AllowListSet,
240 whitelist: AllowListSet,
242 pub(crate) inner_budget:
244 Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
245 pub only_html: bool,
247 pub concurrency_limit: Option<usize>,
249 pub normalize: bool,
251 pub shared: bool,
253 pub modify_headers: bool,
255 pub modify_http_client_headers: bool,
257 #[cfg(any(
259 feature = "cache_request",
260 feature = "chrome",
261 feature = "chrome_remote_cache"
262 ))]
263 pub cache: bool,
264 #[cfg(any(
267 feature = "cache_request",
268 feature = "chrome",
269 feature = "chrome_remote_cache"
270 ))]
271 pub cache_skip_browser: bool,
272 #[cfg(feature = "chrome")]
273 pub service_worker_enabled: bool,
275 #[cfg(feature = "chrome")]
276 #[cfg(feature = "chrome")]
278 pub timezone_id: Option<Box<String>>,
279 #[cfg(feature = "chrome")]
281 pub locale: Option<Box<String>>,
282 #[cfg(feature = "chrome")]
284 pub evaluate_on_new_document: Option<Box<String>>,
285 #[cfg(feature = "chrome")]
286 pub dismiss_dialogs: Option<bool>,
288 #[cfg(feature = "chrome")]
289 pub wait_for: Option<WaitFor>,
291 #[cfg(feature = "chrome")]
292 pub screenshot: Option<ScreenShotConfig>,
294 #[cfg(feature = "chrome")]
295 pub track_events: Option<ChromeEventTracker>,
297 #[cfg(feature = "chrome")]
298 pub fingerprint: Fingerprint,
300 #[cfg(feature = "chrome")]
301 pub chrome_connection_url: Option<String>,
303 #[cfg(feature = "chrome")]
305 pub execution_scripts: Option<ExecutionScripts>,
306 #[cfg(feature = "chrome")]
308 pub automation_scripts: Option<AutomationScripts>,
309 #[cfg(feature = "chrome")]
311 pub chrome_intercept: RequestInterceptConfiguration,
312 pub referer: Option<String>,
314 pub max_page_bytes: Option<f64>,
316 pub max_bytes_allowed: Option<u64>,
318 #[cfg(feature = "chrome")]
319 pub disable_log: bool,
321 #[cfg(feature = "chrome")]
322 pub auto_geolocation: bool,
324 pub cache_policy: Option<BasicCachePolicy>,
326 #[cfg(feature = "chrome")]
327 pub bypass_csp: bool,
329 pub network_interface: Option<String>,
331 pub local_address: Option<IpAddr>,
333 pub default_http_connect_timeout: Option<Duration>,
335 pub default_http_read_timeout: Option<Duration>,
337 #[cfg(feature = "webdriver")]
338 pub webdriver_config: Option<Box<WebDriverConfig>>,
340 #[cfg(feature = "search")]
341 pub search_config: Option<Box<SearchConfig>>,
343 #[cfg(feature = "spider_cloud")]
344 pub spider_cloud: Option<Box<SpiderCloudConfig>>,
346 #[cfg(feature = "hedge")]
347 pub hedge: Option<crate::utils::hedge::HedgeConfig>,
350}
351
352#[derive(Default, Debug, Clone, PartialEq, Eq)]
353pub struct SerializableHeaderMap(pub HeaderMap);
355
356impl SerializableHeaderMap {
357 pub fn inner(&self) -> &HeaderMap {
359 &self.0
360 }
361 pub fn contains_key<K>(&self, key: K) -> bool
363 where
364 K: AsHeaderName,
365 {
366 self.0.contains_key(key)
367 }
368 pub fn insert<K>(
370 &mut self,
371 key: K,
372 val: reqwest::header::HeaderValue,
373 ) -> Option<reqwest::header::HeaderValue>
374 where
375 K: IntoHeaderName,
376 {
377 self.0.insert(key, val)
378 }
379 pub fn extend<I>(&mut self, iter: I)
381 where
382 I: IntoIterator<Item = (Option<HeaderName>, HeaderValue)>,
383 {
384 self.0.extend(iter);
385 }
386}
387
388pub fn get_referer(header_map: &Option<Box<SerializableHeaderMap>>) -> Option<String> {
390 match header_map {
391 Some(header_map) => {
392 header_map
393 .0
394 .get(crate::client::header::REFERER) .and_then(|value| value.to_str().ok()) .map(String::from) }
398 _ => None,
399 }
400}
401
402impl From<HeaderMap> for SerializableHeaderMap {
403 fn from(header_map: HeaderMap) -> Self {
404 SerializableHeaderMap(header_map)
405 }
406}
407
/// Serializes the headers as a sorted `name -> value` string map.
///
/// Values that `to_str` rejects are written as an empty string. When a header
/// name occurs more than once, the last value visited is the one kept.
#[cfg(feature = "serde")]
impl serde::Serialize for SerializableHeaderMap {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let mut flat = std::collections::BTreeMap::<String, String>::new();

        for (name, value) in self.0.iter() {
            let text = value.to_str().unwrap_or("");
            flat.insert(name.to_string(), text.to_string());
        }

        serde::Serialize::serialize(&flat, serializer)
    }
}
422
/// Deserializes a `name -> value` string map into a header map.
///
/// An invalid header name or value is reported as a custom serde error.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for SerializableHeaderMap {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        type Raw = std::collections::BTreeMap<String, String>;

        let raw = <Raw as serde::Deserialize>::deserialize(deserializer)?;
        let mut parsed = HeaderMap::with_capacity(raw.len());

        // `insert` (not `append`) so a later entry replaces an earlier one
        // that resolves to the same header name.
        for (raw_name, raw_value) in raw {
            let name =
                HeaderName::from_bytes(raw_name.as_bytes()).map_err(serde::de::Error::custom)?;
            let value = HeaderValue::from_str(&raw_value).map_err(serde::de::Error::custom)?;
            parsed.insert(name, value);
        }

        Ok(Self(parsed))
    }
}
441
/// Serializes the list as a sequence of strings.
///
/// Without `regex` the stored list is written directly; with `regex` the
/// patterns of the compiled set are written.
#[cfg(feature = "serde")]
impl serde::Serialize for AllowListSet {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        #[cfg(feature = "regex")]
        {
            // `patterns()` yields the source strings the set was built from.
            serde::Serialize::serialize(self.0.patterns(), serializer)
        }

        #[cfg(not(feature = "regex"))]
        {
            serde::Serialize::serialize(&self.0, serializer)
        }
    }
}
463
/// Deserializes a sequence of strings into the list.
///
/// With `regex`, the strings are compiled into a `RegexSet`; a compile
/// failure is reported as a custom serde error.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for AllowListSet {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        #[cfg(feature = "regex")]
        {
            let sources = <Vec<String> as serde::Deserialize>::deserialize(deserializer)?;
            let compiled = regex::RegexSet::new(&sources).map_err(serde::de::Error::custom)?;
            Ok(Self(Box::new(compiled)))
        }

        #[cfg(not(feature = "regex"))]
        {
            let entries = <Vec<CompactString> as serde::Deserialize>::deserialize(deserializer)?;
            Ok(Self(entries))
        }
    }
}
484
/// Returns a spoofed user agent from `ua_generator`.
///
/// `chrome == true` selects `spoof_chrome_ua`, otherwise `spoof_ua` is used.
#[cfg(feature = "ua_generator")]
pub fn get_ua(chrome: bool) -> &'static str {
    match chrome {
        true => ua_generator::ua::spoof_chrome_ua(),
        false => ua_generator::ua::spoof_ua(),
    }
}
494
495#[cfg(not(feature = "ua_generator"))]
497pub fn get_ua(_chrome: bool) -> &'static str {
498 use std::env;
499
500 lazy_static! {
501 static ref AGENT: &'static str =
502 concat!(env!("CARGO_PKG_NAME"), '/', env!("CARGO_PKG_VERSION"));
503 };
504
505 AGENT.as_ref()
506}
507
508impl Configuration {
509 #[cfg(not(feature = "chrome"))]
511 pub fn new() -> Self {
512 Self {
513 delay: 0,
514 depth: 25,
515 redirect_limit: 7,
516 request_timeout: Some(Duration::from_secs(120)),
517 only_html: true,
518 modify_headers: true,
519 ..Default::default()
520 }
521 }
522
/// Creates a configuration with the crate's preset values (Chrome build).
///
/// Request interception is enabled only when the `chrome_intercept` feature
/// is compiled in. Everything not listed below comes from `Default`.
#[cfg(feature = "chrome")]
pub fn new() -> Self {
    let intercept_enabled = cfg!(feature = "chrome_intercept");
    let default_agent: Box<CompactString> = Box::new(get_ua(true).into());

    Self {
        request_timeout: Some(Duration::from_secs(120)),
        redirect_limit: 7,
        depth: 25,
        delay: 0,
        user_agent: Some(default_agent),
        chrome_intercept: RequestInterceptConfiguration::new(intercept_enabled),
        fingerprint: Fingerprint::Basic,
        service_worker_enabled: true,
        auto_geolocation: false,
        modify_headers: true,
        only_html: true,
        cache: true,
        ..Self::default()
    }
}
544
/// Builds a `RemoteMultimodalEngine` from `self.remote_multimodal`.
///
/// Returns `None` when no remote multimodal configuration is set. A
/// semaphore is attached only for a positive `concurrency_limit`, and a model
/// router is installed only when the model pool holds at least three entries.
#[cfg(feature = "agent")]
pub fn build_remote_multimodal_engine(
    &self,
) -> Option<crate::features::automation::RemoteMultimodalEngine> {
    use crate::features::automation::{auto_policy, ModelRouter, RemoteMultimodalEngine};

    let remote = self.remote_multimodal.as_ref()?;

    // A missing or zero limit means "no semaphore".
    let semaphore = match remote.concurrency_limit {
        Some(permits) if permits > 0 => Some(Arc::new(tokio::sync::Semaphore::new(permits))),
        _ => None,
    };

    // `mut` is only exercised when `agent_skills` is enabled.
    #[allow(unused_mut)]
    let mut engine = RemoteMultimodalEngine::new(
        remote.api_url.clone(),
        remote.model_name.clone(),
        remote.system_prompt.clone(),
    )
    .with_api_key(remote.api_key.as_deref())
    .with_system_prompt_extra(remote.system_prompt_extra.as_deref())
    .with_user_message_extra(remote.user_message_extra.as_deref())
    .with_remote_multimodal_config(remote.cfg.clone())
    .with_prompt_url_gate(remote.prompt_url_gate.clone())
    .with_vision_model(remote.vision_model.clone())
    .with_text_model(remote.text_model.clone())
    .with_vision_route_mode(remote.vision_route_mode)
    .with_chrome_ai(remote.use_chrome_ai)
    .with_semaphore(semaphore)
    .to_owned();

    #[cfg(feature = "agent_skills")]
    if let Some(ref skills) = remote.skill_registry {
        engine.with_skill_registry(Some(skills.clone()));
    }

    let pool = remote.model_pool.clone();

    if pool.len() >= 3 {
        let names: Vec<&str> = pool
            .iter()
            .map(|endpoint| endpoint.model_name.as_str())
            .collect();
        engine.model_router = Some(ModelRouter::with_policy(auto_policy(&names)));
    }

    engine.model_pool = pool;

    Some(engine)
}
594
595 #[cfg(not(feature = "chrome"))]
597 pub(crate) fn only_chrome_agent(&self) -> bool {
598 false
599 }
600
/// Returns `true` when any of these holds: a Chrome connection URL is set,
/// `wait_for` is set, request interception is enabled, the stealth tier
/// reports `stealth()`, or the fingerprint reports `valid()`.
#[cfg(feature = "chrome")]
pub(crate) fn only_chrome_agent(&self) -> bool {
    if self.chrome_connection_url.is_some() || self.wait_for.is_some() {
        return true;
    }

    self.chrome_intercept.enabled || self.stealth_mode.stealth() || self.fingerprint.valid()
}
610
/// Compiles `blacklist_url` into a boxed `RegexSet`.
///
/// Falls back to the default (empty) set when no blacklist is configured or
/// when the patterns fail to compile; the compile error is not reported.
#[cfg(feature = "regex")]
pub fn get_blacklist(&self) -> Box<regex::RegexSet> {
    self.blacklist_url
        .as_ref()
        .and_then(|patterns| regex::RegexSet::new(&**patterns).ok())
        .map(Box::new)
        .unwrap_or_default()
}
622
623 #[cfg(not(feature = "regex"))]
624 pub fn get_blacklist(&self) -> AllowList {
626 match &self.blacklist_url {
627 Some(blacklist) => blacklist.to_owned(),
628 _ => Default::default(),
629 }
630 }
631
632 pub(crate) fn set_blacklist(&mut self) {
634 self.blacklist = AllowListSet(self.get_blacklist());
635 }
636
637 pub fn set_whitelist(&mut self) {
639 self.whitelist = AllowListSet(self.get_whitelist());
640 }
641
642 pub fn configure_allowlist(&mut self) {
644 self.set_whitelist();
645 self.set_blacklist();
646 }
647
648 pub fn get_blacklist_compiled(&self) -> &AllowList {
650 &self.blacklist.0
651 }
652
653 pub fn configure_budget(&mut self) {
655 self.inner_budget.clone_from(&self.budget);
656 }
657
658 pub fn get_whitelist_compiled(&self) -> &AllowList {
660 &self.whitelist.0
661 }
662
/// Compiles `whitelist_url` into a boxed `RegexSet`.
///
/// Falls back to the default (empty) set when no whitelist is configured or
/// when the patterns fail to compile; the compile error is not reported.
#[cfg(feature = "regex")]
pub fn get_whitelist(&self) -> Box<regex::RegexSet> {
    self.whitelist_url
        .as_ref()
        .and_then(|patterns| regex::RegexSet::new(&**patterns).ok())
        .map(Box::new)
        .unwrap_or_default()
}
674
675 #[cfg(not(feature = "regex"))]
676 pub fn get_whitelist(&self) -> AllowList {
678 match &self.whitelist_url {
679 Some(whitelist) => whitelist.to_owned(),
680 _ => Default::default(),
681 }
682 }
683
/// Appends the sitemap entries to a non-empty whitelist and reports what was
/// added.
///
/// Nothing happens when `whitelist_url` is unset or empty. Otherwise
/// `"sitemap.xml"` and, if configured, `sitemap_url` are pushed unless they
/// are already present. Pass the returned value to
/// `remove_sitemap_from_whitelist` to undo the additions.
#[cfg(feature = "sitemap")]
pub fn add_sitemap_to_whitelist(&mut self) -> SitemapWhitelistChanges {
    let mut changes = SitemapWhitelistChanges::default();

    // An unset whitelist leaves nothing to extend, regardless of
    // `ignore_sitemap`; an empty one is left untouched as well.
    let list = match self.whitelist_url.as_mut() {
        Some(list) if !list.is_empty() => list,
        _ => return changes,
    };

    let default_entry = CompactString::from("sitemap.xml");
    if !list.contains(&default_entry) {
        list.push(default_entry);
        changes.added_default = true;
    }

    if let Some(custom_entry) = self.sitemap_url.as_deref() {
        if !list.contains(custom_entry) {
            list.push(custom_entry.clone());
            changes.added_custom = true;
        }
    }

    changes
}
715
/// Undoes the additions recorded in `changes`.
///
/// For each entry flagged as added, its first occurrence is removed from
/// `whitelist_url`. If the list ends up empty, `whitelist_url` is reset to
/// `None`. Does nothing when `whitelist_url` is unset.
#[cfg(feature = "sitemap")]
pub fn remove_sitemap_from_whitelist(&mut self, changes: SitemapWhitelistChanges) {
    let list = match self.whitelist_url.as_mut() {
        Some(list) => list,
        None => return,
    };

    if changes.added_default {
        let default_entry = CompactString::from("sitemap.xml");
        if let Some(index) = list.iter().position(|entry| *entry == default_entry) {
            list.remove(index);
        }
    }

    if changes.added_custom {
        if let Some(custom_entry) = self.sitemap_url.as_deref() {
            if let Some(index) = list.iter().position(|entry| entry == custom_entry) {
                list.remove(index);
            }
        }
    }

    if list.is_empty() {
        self.whitelist_url = None;
    }
}
738
739 pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
741 self.respect_robots_txt = respect_robots_txt;
742 self
743 }
744
745 pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
747 self.subdomains = subdomains;
748 self
749 }
750
/// Sets `bypass_csp` and returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
    self.bypass_csp = enabled;

    self
}
757
758 #[cfg(not(feature = "chrome"))]
760 pub fn with_csp_bypass(&mut self, _enabled: bool) -> &mut Self {
761 self
762 }
763
764 pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
766 self.network_interface = network_interface;
767 self
768 }
769
770 pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
772 self.local_address = local_address;
773 self
774 }
775
776 pub fn with_tld(&mut self, tld: bool) -> &mut Self {
778 self.tld = tld;
779 self
780 }
781
782 pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
784 self.crawl_timeout = crawl_timeout;
785 self
786 }
787
788 pub fn with_delay(&mut self, delay: u64) -> &mut Self {
790 self.delay = delay;
791 self
792 }
793
794 pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
796 self.http2_prior_knowledge = http2_prior_knowledge;
797 self
798 }
799
800 pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
802 match request_timeout {
803 Some(timeout) => self.request_timeout = Some(timeout),
804 _ => self.request_timeout = None,
805 };
806
807 self
808 }
809
/// Sets `sitemap_url` from the given string (`None` clears it) and returns
/// `self` for chaining.
#[cfg(feature = "sitemap")]
pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
    self.sitemap_url = sitemap_url.map(|url| Box::new(CompactString::new(url)));

    self
}
821
822 #[cfg(not(feature = "sitemap"))]
823 pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
825 self
826 }
827
/// Sets `ignore_sitemap` and returns `self` for chaining.
#[cfg(feature = "sitemap")]
pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
    self.ignore_sitemap = ignore_sitemap;

    self
}
834
835 #[cfg(not(feature = "sitemap"))]
836 pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self {
838 self
839 }
840
841 pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
843 match user_agent {
844 Some(agent) => self.user_agent = Some(CompactString::new(agent).into()),
845 _ => self.user_agent = None,
846 };
847 self
848 }
849
850 pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
852 self.preserve_host_header = preserve;
853 self
854 }
855
/// Stores the remote multimodal configuration (boxed; `None` clears it) and
/// returns `self` for chaining.
#[cfg(feature = "agent")]
pub fn with_remote_multimodal(
    &mut self,
    remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
) -> &mut Self {
    let boxed = remote_multimodal.map(Box::new);
    self.remote_multimodal = boxed;

    self
}
866
867 #[cfg(not(feature = "agent"))]
870 pub fn with_remote_multimodal(
871 &mut self,
872 remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
873 ) -> &mut Self {
874 self.remote_multimodal = remote_multimodal.map(Box::new);
875 self
876 }
877
878 #[cfg(not(feature = "openai"))]
879 pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self {
881 self
882 }
883
/// Stores the OpenAI configuration (boxed; `None` clears it) and returns
/// `self` for chaining.
#[cfg(feature = "openai")]
pub fn with_openai(&mut self, openai_config: Option<GPTConfigs>) -> &mut Self {
    self.openai_config = openai_config.map(Box::new);

    self
}
893
894 #[cfg(not(feature = "gemini"))]
895 pub fn with_gemini(&mut self, _gemini_config: Option<GeminiConfigs>) -> &mut Self {
897 self
898 }
899
/// Stores the Gemini configuration (boxed; `None` clears it) and returns
/// `self` for chaining.
#[cfg(feature = "gemini")]
pub fn with_gemini(&mut self, gemini_config: Option<GeminiConfigs>) -> &mut Self {
    self.gemini_config = gemini_config.map(Box::new);

    self
}
909
/// Sets `cookie_str` to an owned copy of the given string and returns `self`
/// for chaining.
#[cfg(feature = "cookies")]
pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
    self.cookie_str = String::from(cookie_str);

    self
}
916
917 #[cfg(not(feature = "cookies"))]
918 pub fn with_cookies(&mut self, _cookie_str: &str) -> &mut Self {
920 self
921 }
922
/// Toggles fingerprinting: `true` selects `Fingerprint::Basic`, `false`
/// selects `Fingerprint::None`. Returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
    self.fingerprint = if fingerprint {
        Fingerprint::Basic
    } else {
        Fingerprint::None
    };

    self
}
933
/// Sets `fingerprint` to the given value and returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_fingerprint_advanced(&mut self, fingerprint: Fingerprint) -> &mut Self {
    self.fingerprint = fingerprint;

    self
}
940
941 #[cfg(not(feature = "chrome"))]
942 pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self {
944 self
945 }
946
947 pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
949 self.proxies = proxies.map(|p| {
950 p.iter()
951 .map(|addr| RequestProxy {
952 addr: addr.to_owned(),
953 ..Default::default()
954 })
955 .collect::<Vec<RequestProxy>>()
956 });
957 self
958 }
959
960 pub fn with_proxies_direct(&mut self, proxies: Option<Vec<RequestProxy>>) -> &mut Self {
962 self.proxies = proxies;
963 self
964 }
965
966 pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
968 self.shared_queue = shared_queue;
969 self
970 }
971
972 pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
974 where
975 Vec<CompactString>: From<Vec<T>>,
976 {
977 match blacklist_url {
978 Some(p) => self.blacklist_url = Some(p.into()),
979 _ => self.blacklist_url = None,
980 };
981 self
982 }
983
984 pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
986 where
987 Vec<CompactString>: From<Vec<T>>,
988 {
989 match whitelist_url {
990 Some(p) => self.whitelist_url = Some(p.into()),
991 _ => self.whitelist_url = None,
992 };
993 self
994 }
995
996 pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
998 self.return_page_links = return_page_links;
999 self
1000 }
1001
1002 pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
1004 match headers {
1005 Some(m) => self.headers = Some(SerializableHeaderMap::from(m).into()),
1006 _ => self.headers = None,
1007 };
1008 self
1009 }
1010
1011 pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
1013 self.redirect_limit = redirect_limit;
1014 self
1015 }
1016
1017 pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
1019 self.redirect_policy = policy;
1020 self
1021 }
1022
1023 pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
1025 self.referer = referer;
1026 self
1027 }
1028
1029 pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
1031 self.referer = referer;
1032 self
1033 }
1034
1035 pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
1037 self.full_resources = full_resources;
1038 self
1039 }
1040
/// Sets `dismiss_dialogs` to `Some(dismiss_dialogs)` and returns `self` for
/// chaining.
#[cfg(feature = "chrome")]
pub fn with_dismiss_dialogs(&mut self, dismiss_dialogs: bool) -> &mut Self {
    let choice = Some(dismiss_dialogs);
    self.dismiss_dialogs = choice;

    self
}
1047
1048 #[cfg(not(feature = "chrome"))]
1050 pub fn with_dismiss_dialogs(&mut self, _dismiss_dialogs: bool) -> &mut Self {
1051 self
1052 }
1053
/// Sets `emulation` and returns `self` for chaining.
#[cfg(feature = "wreq")]
pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
    self.emulation = emulation;

    self
}
1060
/// Sets `cron_str` (as an owned copy) and `cron_type`, then returns `self`
/// for chaining.
#[cfg(feature = "cron")]
pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
    self.cron_type = cron_type;
    self.cron_str = String::from(cron_str);

    self
}
1068
1069 #[cfg(not(feature = "cron"))]
1070 pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self {
1072 self
1073 }
1074
1075 pub fn with_limit(&mut self, limit: u32) -> &mut Self {
1077 self.with_budget(Some(hashbrown::HashMap::from([("*", limit)])));
1078 self
1079 }
1080
1081 pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
1083 self.concurrency_limit = limit;
1084 self
1085 }
1086
/// Sets `auth_challenge_response` and returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_auth_challenge_response(
    &mut self,
    auth_challenge_response: Option<AuthChallengeResponse>,
) -> &mut Self {
    self.auth_challenge_response = auth_challenge_response;

    self
}
1096
/// Sets `evaluate_on_new_document` and returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_evaluate_on_new_document(
    &mut self,
    evaluate_on_new_document: Option<Box<String>>,
) -> &mut Self {
    self.evaluate_on_new_document = evaluate_on_new_document;

    self
}
1106
1107 #[cfg(not(feature = "chrome"))]
1108 pub fn with_evaluate_on_new_document(
1110 &mut self,
1111 _evaluate_on_new_document: Option<Box<String>>,
1112 ) -> &mut Self {
1113 self
1114 }
1115
1116 #[cfg(not(feature = "chrome"))]
1117 pub fn with_auth_challenge_response(
1119 &mut self,
1120 _auth_challenge_response: Option<AuthChallengeResponse>,
1121 ) -> &mut Self {
1122 self
1123 }
1124
1125 pub fn with_depth(&mut self, depth: usize) -> &mut Self {
1127 self.depth = depth;
1128 self
1129 }
1130
/// Sets `cache` and returns `self` for chaining.
///
/// NOTE(review): the `cache` field also exists with only the `chrome`
/// feature, but this setter is compiled only for `cache_request` or
/// `chrome_remote_cache`. Confirm the narrower gate is intended.
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
pub fn with_caching(&mut self, cache: bool) -> &mut Self {
    self.cache = cache;

    self
}
1137
1138 #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1139 pub fn with_caching(&mut self, _cache: bool) -> &mut Self {
1141 self
1142 }
1143
/// Sets `cache_skip_browser` and returns `self` for chaining.
///
/// NOTE(review): the field also exists with only the `chrome` feature, but
/// this setter is compiled only for `cache_request` or
/// `chrome_remote_cache`. Confirm the narrower gate is intended.
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self {
    self.cache_skip_browser = skip;

    self
}
1152
1153 #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1154 pub fn with_cache_skip_browser(&mut self, _skip: bool) -> &mut Self {
1157 self
1158 }
1159
/// Sets `service_worker_enabled` and returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
    self.service_worker_enabled = enabled;

    self
}
1166
1167 #[cfg(not(feature = "chrome"))]
1168 pub fn with_service_worker_enabled(&mut self, _enabled: bool) -> &mut Self {
1170 self
1171 }
1172
1173 #[cfg(not(feature = "chrome"))]
1175 pub fn with_auto_geolocation(&mut self, _enabled: bool) -> &mut Self {
1176 self
1177 }
1178
/// Sets `auto_geolocation` and returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
    self.auto_geolocation = enabled;

    self
}
1185
1186 pub fn with_retry(&mut self, retry: u8) -> &mut Self {
1188 self.retry = retry;
1189 self
1190 }
1191
1192 pub fn with_default_http_connect_timeout(
1194 &mut self,
1195 default_http_connect_timeout: Option<Duration>,
1196 ) -> &mut Self {
1197 self.default_http_connect_timeout = default_http_connect_timeout;
1198 self
1199 }
1200
1201 pub fn with_default_http_read_timeout(
1203 &mut self,
1204 default_http_read_timeout: Option<Duration>,
1205 ) -> &mut Self {
1206 self.default_http_read_timeout = default_http_read_timeout;
1207 self
1208 }
1209
1210 pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
1212 self.no_control_thread = no_control_thread;
1213 self
1214 }
1215
1216 pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
1218 self.viewport = viewport.map(|vp| vp);
1219 self
1220 }
1221
/// Toggles stealth: `true` selects `Tier::Basic`, `false` selects
/// `Tier::None`. Returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
    use spider_fingerprint::configs::Tier;

    self.stealth_mode = if stealth_mode {
        Tier::Basic
    } else {
        Tier::None
    };

    self
}
1232
/// Sets `stealth_mode` to the given tier and returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_stealth_advanced(
    &mut self,
    stealth_mode: spider_fingerprint::configs::Tier,
) -> &mut Self {
    self.stealth_mode = stealth_mode;

    self
}
1242
1243 #[cfg(not(feature = "chrome"))]
1244 pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self {
1246 self
1247 }
1248
/// Sets `wait_for.idle_network`, creating a default `WaitFor` first when
/// `wait_for` is unset. Returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_wait_for_idle_network(
    &mut self,
    wait_for_idle_network: Option<WaitForIdleNetwork>,
) -> &mut Self {
    let settings = self.wait_for.get_or_insert_with(WaitFor::default);
    settings.idle_network = wait_for_idle_network;

    self
}
1265
/// Sets `wait_for.idle_network0`, creating a default `WaitFor` first when
/// `wait_for` is unset. Returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_wait_for_idle_network0(
    &mut self,
    wait_for_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self {
    let settings = self.wait_for.get_or_insert_with(WaitFor::default);
    settings.idle_network0 = wait_for_idle_network0;

    self
}
1282
/// Sets `wait_for.almost_idle_network0`, creating a default `WaitFor` first
/// when `wait_for` is unset. Returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_wait_for_almost_idle_network0(
    &mut self,
    wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self {
    let settings = self.wait_for.get_or_insert_with(WaitFor::default);
    settings.almost_idle_network0 = wait_for_almost_idle_network0;

    self
}
1299
1300 #[cfg(not(feature = "chrome"))]
1301 pub fn with_wait_for_almost_idle_network0(
1303 &mut self,
1304 _wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
1305 ) -> &mut Self {
1306 self
1307 }
1308
1309 #[cfg(not(feature = "chrome"))]
1310 pub fn with_wait_for_idle_network0(
1312 &mut self,
1313 _wait_for_idle_network0: Option<WaitForIdleNetwork>,
1314 ) -> &mut Self {
1315 self
1316 }
1317
1318 #[cfg(not(feature = "chrome"))]
1319 pub fn with_wait_for_idle_network(
1321 &mut self,
1322 _wait_for_idle_network: Option<WaitForIdleNetwork>,
1323 ) -> &mut Self {
1324 self
1325 }
1326
/// Sets `wait_for.dom`, creating a default `WaitFor` first when `wait_for`
/// is unset. Returns `self` for chaining.
#[cfg(feature = "chrome")]
pub fn with_wait_for_idle_dom(
    &mut self,
    wait_for_idle_dom: Option<WaitForSelector>,
) -> &mut Self {
    let settings = self.wait_for.get_or_insert_with(WaitFor::default);
    settings.dom = wait_for_idle_dom;

    self
}
1343
1344 #[cfg(not(feature = "chrome"))]
1345 pub fn with_wait_for_idle_dom(
1347 &mut self,
1348 _wait_for_idle_dom: Option<WaitForSelector>,
1349 ) -> &mut Self {
1350 self
1351 }
1352
/// Set the `selector` wait condition.
///
/// The value is written onto the existing `wait_for` settings; when none have
/// been configured yet a default [`WaitFor`] is created first. Passing `None`
/// clears the condition. Only compiled with the `chrome` feature.
#[cfg(feature = "chrome")]
pub fn with_wait_for_selector(
    &mut self,
    wait_for_selector: Option<WaitForSelector>,
) -> &mut Self {
    self.wait_for.get_or_insert_with(WaitFor::default).selector = wait_for_selector;
    self
}
1369
1370 #[cfg(not(feature = "chrome"))]
1371 pub fn with_wait_for_selector(
1373 &mut self,
1374 _wait_for_selector: Option<WaitForSelector>,
1375 ) -> &mut Self {
1376 self
1377 }
1378
/// Set the `delay` wait condition.
///
/// The value is written onto the existing `wait_for` settings; when none have
/// been configured yet a default [`WaitFor`] is created first. Passing `None`
/// clears the condition. Only compiled with the `chrome` feature.
#[cfg(feature = "chrome")]
pub fn with_wait_for_delay(&mut self, wait_for_delay: Option<WaitForDelay>) -> &mut Self {
    self.wait_for.get_or_insert_with(WaitFor::default).delay = wait_for_delay;
    self
}
1392
1393 #[cfg(not(feature = "chrome"))]
1394 pub fn with_wait_for_delay(&mut self, _wait_for_delay: Option<WaitForDelay>) -> &mut Self {
1396 self
1397 }
1398
/// Install the chrome request-interception configuration.
///
/// The configuration is stored first, then `setup_intercept_manager` is called
/// on the stored value with `url`. Only compiled with the `chrome_intercept`
/// feature.
#[cfg(feature = "chrome_intercept")]
pub fn with_chrome_intercept(
    &mut self,
    chrome_intercept: RequestInterceptConfiguration,
    url: &Option<Box<url::Url>>,
) -> &mut Self {
    self.chrome_intercept = chrome_intercept;
    // The manager is set up on the stored copy so the result lives on `self`.
    self.chrome_intercept.setup_intercept_manager(url);
    self
}
1410
1411 #[cfg(not(feature = "chrome_intercept"))]
1412 pub fn with_chrome_intercept(
1414 &mut self,
1415 _chrome_intercept: RequestInterceptConfiguration,
1416 _url: &Option<Box<url::Url>>,
1417 ) -> &mut Self {
1418 self
1419 }
1420
/// Set the chrome connection URL (`chrome_connection_url`).
///
/// `None` clears any previously configured URL. Only compiled with the
/// `chrome` feature.
#[cfg(feature = "chrome")]
pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
    self.chrome_connection_url = chrome_connection_url;
    self
}
1427
1428 #[cfg(not(feature = "chrome"))]
1429 pub fn with_chrome_connection(&mut self, _chrome_connection_url: Option<String>) -> &mut Self {
1431 self
1432 }
1433
1434 #[cfg(not(feature = "chrome"))]
1435 pub fn with_execution_scripts(
1437 &mut self,
1438 _execution_scripts: Option<ExecutionScriptsMap>,
1439 ) -> &mut Self {
1440 self
1441 }
1442
/// Set the execution scripts.
///
/// The supplied map is passed through `convert_to_trie_execution_scripts` and
/// the converted value is what gets stored. Only compiled with the `chrome`
/// feature.
#[cfg(feature = "chrome")]
pub fn with_execution_scripts(
    &mut self,
    execution_scripts: Option<ExecutionScriptsMap>,
) -> &mut Self {
    let converted =
        crate::features::chrome_common::convert_to_trie_execution_scripts(&execution_scripts);
    self.execution_scripts = converted;
    self
}
1453
1454 #[cfg(not(feature = "chrome"))]
1455 pub fn with_automation_scripts(
1457 &mut self,
1458 _automation_scripts: Option<AutomationScriptsMap>,
1459 ) -> &mut Self {
1460 self
1461 }
1462
/// Set the automation scripts.
///
/// The supplied map is passed through `convert_to_trie_automation_scripts` and
/// the converted value is what gets stored. Only compiled with the `chrome`
/// feature.
#[cfg(feature = "chrome")]
pub fn with_automation_scripts(
    &mut self,
    automation_scripts: Option<AutomationScriptsMap>,
) -> &mut Self {
    let converted =
        crate::features::chrome_common::convert_to_trie_automation_scripts(&automation_scripts);
    self.automation_scripts = converted;
    self
}
1473
1474 pub fn with_budget(&mut self, budget: Option<hashbrown::HashMap<&str, u32>>) -> &mut Self {
1476 self.budget = match budget {
1477 Some(budget) => {
1478 let mut crawl_budget: hashbrown::HashMap<
1479 case_insensitive_string::CaseInsensitiveString,
1480 u32,
1481 > = hashbrown::HashMap::new();
1482
1483 for b in budget.into_iter() {
1484 crawl_budget.insert(
1485 case_insensitive_string::CaseInsensitiveString::from(b.0),
1486 b.1,
1487 );
1488 }
1489
1490 Some(crawl_budget)
1491 }
1492 _ => None,
1493 };
1494 self
1495 }
1496
1497 pub fn with_external_domains<'a, 'b>(
1499 &mut self,
1500 external_domains: Option<impl Iterator<Item = String> + 'a>,
1501 ) -> &mut Self {
1502 match external_domains {
1503 Some(external_domains) => {
1504 self.external_domains_caseless = external_domains
1505 .into_iter()
1506 .filter_map(|d| {
1507 if d == "*" {
1508 Some("*".into())
1509 } else {
1510 let host = get_domain_from_url(&d);
1511
1512 if !host.is_empty() {
1513 Some(host.into())
1514 } else {
1515 None
1516 }
1517 }
1518 })
1519 .collect::<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>()
1520 .into();
1521 }
1522 _ => self.external_domains_caseless = Default::default(),
1523 }
1524
1525 self
1526 }
1527
1528 pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
1530 self.accept_invalid_certs = accept_invalid_certs;
1531 self
1532 }
1533
1534 pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
1536 self.normalize = normalize;
1537 self
1538 }
1539
1540 #[cfg(not(feature = "disk"))]
1541 pub fn with_shared_state(&mut self, _shared: bool) -> &mut Self {
1543 self
1544 }
1545
/// Set the `shared` flag.
///
/// Only compiled with the `disk` feature. Returns `self` for chaining.
#[cfg(feature = "disk")]
pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
    self.shared = shared;
    self
}
1552
1553 #[cfg(not(feature = "chrome"))]
1554 pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self {
1556 self
1557 }
1558
/// Set the timezone id.
///
/// The string is converted with `Into` to the field's storage type; `None`
/// clears the setting. Only compiled with the `chrome` feature.
#[cfg(feature = "chrome")]
pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
    self.timezone_id = timezone_id.map(Into::into);
    self
}
1565
1566 #[cfg(not(feature = "chrome"))]
1567 pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self {
1569 self
1570 }
1571
/// Set the locale.
///
/// The string is converted with `Into` to the field's storage type; `None`
/// clears the setting. Only compiled with the `chrome` feature.
#[cfg(feature = "chrome")]
pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
    self.locale = locale.map(Into::into);
    self
}
1578
/// Set the chrome event tracker configuration (`track_events`).
///
/// `None` clears it. Only compiled with the `chrome` feature.
#[cfg(feature = "chrome")]
pub fn with_event_tracker(&mut self, track_events: Option<ChromeEventTracker>) -> &mut Self {
    self.track_events = track_events;
    self
}
1585
1586 #[cfg(not(feature = "chrome"))]
1588 pub fn with_screenshot(&mut self, _screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
1589 self
1590 }
1591
/// Set the screenshot configuration (`screenshot`).
///
/// `None` clears it. Only compiled with the `chrome` feature.
#[cfg(feature = "chrome")]
pub fn with_screenshot(&mut self, screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
    self.screenshot = screenshot_config;
    self
}
1598
1599 pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
1601 self.max_page_bytes = max_page_bytes;
1602 self
1603 }
1604
1605 pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
1607 self.max_bytes_allowed = max_bytes_allowed;
1608 self
1609 }
1610
1611 pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
1613 self.only_html = only_html;
1614 self
1615 }
1616
1617 pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
1619 self.modify_headers = modify_headers;
1620 self
1621 }
1622
1623 pub fn with_modify_http_client_headers(
1625 &mut self,
1626 modify_http_client_headers: bool,
1627 ) -> &mut Self {
1628 self.modify_http_client_headers = modify_http_client_headers;
1629 self
1630 }
1631
1632 pub fn with_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) -> &mut Self {
1634 self.cache_policy = cache_policy;
1635 self
1636 }
1637
/// Set the WebDriver configuration.
///
/// A supplied configuration is boxed before being stored; `None` clears it.
/// Only compiled with the `webdriver` feature.
#[cfg(feature = "webdriver")]
pub fn with_webdriver_config(
    &mut self,
    webdriver_config: Option<WebDriverConfig>,
) -> &mut Self {
    self.webdriver_config = webdriver_config.map(Box::new);
    self
}
1647
1648 #[cfg(not(feature = "webdriver"))]
1649 pub fn with_webdriver_config(
1651 &mut self,
1652 _webdriver_config: Option<WebDriverConfig>,
1653 ) -> &mut Self {
1654 self
1655 }
1656
/// Derive the cache options from the current configuration.
///
/// Returns `None` when `cache` is off. Otherwise the result depends on two
/// inputs:
/// - whether a usable authorization header value exists (present under
///   `authorization`/`Authorization`, non-empty, and convertible by `to_str`);
/// - the `cache_skip_browser` flag.
///
/// | token | skip browser | result                        |
/// |-------|--------------|-------------------------------|
/// | yes   | yes          | `SkipBrowserAuthorized(token)`|
/// | yes   | no           | `Authorized(token)`           |
/// | no    | yes          | `SkipBrowser`                 |
/// | no    | no           | `Yes`                         |
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
    use crate::utils::CacheOptions;

    if !self.cache {
        return None;
    }

    // Borrow the header value rather than cloning it: it is only read here.
    let token = self
        .headers
        .as_ref()
        .and_then(|headers| {
            headers
                .0
                .get("authorization")
                .or_else(|| headers.0.get("Authorization"))
        })
        .filter(|value| !value.is_empty())
        // A value that cannot be viewed as a string is treated like no token.
        .and_then(|value| value.to_str().ok());

    Some(match (token, self.cache_skip_browser) {
        (Some(token), true) => CacheOptions::SkipBrowserAuthorized(token.into()),
        (Some(token), false) => CacheOptions::Authorized(token.into()),
        (None, true) => CacheOptions::SkipBrowser,
        (None, false) => CacheOptions::Yes,
    })
}
1700
/// Cache options when `chrome` is enabled but neither `cache_request` nor
/// `chrome_remote_cache` is: always `None`.
#[cfg(all(
    feature = "chrome",
    not(any(feature = "cache_request", feature = "chrome_remote_cache"))
))]
pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
    None
}
1709
1710 #[cfg(not(any(
1712 feature = "cache_request",
1713 feature = "chrome_remote_cache",
1714 feature = "chrome"
1715 )))]
1716 #[allow(dead_code)]
1717 pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
1718 None
1719 }
1720
1721 pub fn build(&self) -> Self {
1723 self.to_owned()
1724 }
1725
/// Set the search configuration.
///
/// A supplied configuration is boxed before being stored; `None` clears it.
/// Only compiled with the `search` feature.
#[cfg(feature = "search")]
pub fn with_search_config(&mut self, search_config: Option<SearchConfig>) -> &mut Self {
    self.search_config = search_config.map(Box::new);
    self
}
1732
1733 #[cfg(not(feature = "search"))]
1734 pub fn with_search_config(&mut self, _search_config: Option<()>) -> &mut Self {
1736 self
1737 }
1738
/// Enable Spider Cloud using only an API key.
///
/// A `SpiderCloudConfig` is created via `SpiderCloudConfig::new(api_key)`,
/// boxed and stored. Only compiled with the `spider_cloud` feature.
#[cfg(feature = "spider_cloud")]
pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
    let config = SpiderCloudConfig::new(api_key);
    self.spider_cloud = Some(Box::new(config));
    self
}
1745
1746 #[cfg(not(feature = "spider_cloud"))]
1748 pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
1749 self
1750 }
1751
/// Enable Spider Cloud with a fully specified configuration.
///
/// The configuration is boxed and stored. Only compiled with the
/// `spider_cloud` feature.
#[cfg(feature = "spider_cloud")]
pub fn with_spider_cloud_config(&mut self, config: SpiderCloudConfig) -> &mut Self {
    self.spider_cloud = Some(Box::new(config));
    self
}
1758
1759 #[cfg(not(feature = "spider_cloud"))]
1761 pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
1762 self
1763 }
1764
/// Set the hedge configuration (`hedge`).
///
/// Only compiled with the `hedge` feature. Returns `self` for chaining.
#[cfg(feature = "hedge")]
pub fn with_hedge(&mut self, config: crate::utils::hedge::HedgeConfig) -> &mut Self {
    self.hedge = Some(config);
    self
}
1771
1772 #[cfg(not(feature = "hedge"))]
1774 pub fn with_hedge(&mut self, _config: ()) -> &mut Self {
1775 self
1776 }
1777}
1778
/// Settings for the web-search integration (requires the `search` feature).
#[cfg(feature = "search")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SearchConfig {
    /// Which search provider to use.
    pub provider: SearchProviderType,
    /// API key for the provider; may be empty.
    pub api_key: String,
    /// Optional API URL. When set, `is_enabled` reports `true` even if
    /// `api_key` is empty.
    pub api_url: Option<String>,
    /// Optional default search options.
    pub default_options: Option<SearchOptions>,
}
1793
#[cfg(feature = "search")]
impl SearchConfig {
    /// Create a configuration for `provider` with the given API key.
    ///
    /// `api_url` and `default_options` start out unset.
    pub fn new(provider: SearchProviderType, api_key: impl Into<String>) -> Self {
        let api_key = api_key.into();

        Self {
            provider,
            api_key,
            api_url: None,
            default_options: None,
        }
    }

    /// Set the API URL, consuming and returning the configuration.
    pub fn with_api_url(mut self, url: impl Into<String>) -> Self {
        let url = url.into();
        self.api_url = Some(url);
        self
    }

    /// Set the default search options, consuming and returning the configuration.
    pub fn with_default_options(mut self, options: SearchOptions) -> Self {
        self.default_options = Some(options);
        self
    }

    /// Whether the configuration is usable: an API URL is set or the API key
    /// is non-empty.
    pub fn is_enabled(&self) -> bool {
        self.api_url.is_some() || !self.api_key.is_empty()
    }
}
1825
/// Search providers selectable in [`SearchConfig`] (requires the `search`
/// feature).
#[cfg(feature = "search")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SearchProviderType {
    /// Serper; this is the default provider.
    #[default]
    Serper,
    /// Brave.
    Brave,
    /// Bing.
    Bing,
    /// Tavily.
    Tavily,
}
1841
/// Operating mode of the Spider Cloud integration (requires the
/// `spider_cloud` feature).
///
/// The per-variant notes describe how `SpiderCloudConfig::uses_proxy`,
/// `should_fallback` and `fallback_route` treat each mode.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudMode {
    /// Default mode. Uses the proxy; never triggers a fallback.
    #[default]
    Proxy,
    /// Does not use the proxy and never triggers a fallback; the fallback
    /// route is `"crawl"`.
    Api,
    /// Does not use the proxy and never triggers a fallback; the fallback
    /// route is `"unblocker"`.
    Unblocker,
    /// Uses the proxy and falls back on status 403, 429 or any status >= 500;
    /// the fallback route is `"crawl"`.
    Fallback,
    /// Uses the proxy and falls back on the same statuses as `Fallback`, and
    /// additionally when the response body is empty or looks like a
    /// bot-protection page; the fallback route is `"unblocker"`.
    Smart,
}
1869
/// Settings for the Spider Cloud integration (requires the `spider_cloud`
/// feature).
///
/// With `serde`, every field except `api_key` has a deserialization default.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderCloudConfig {
    /// API key.
    pub api_key: String,
    /// Operating mode; defaults to `SpiderCloudMode::default()`.
    #[cfg_attr(feature = "serde", serde(default))]
    pub mode: SpiderCloudMode,
    /// API base URL; defaults to the value of `default_api_url`.
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_api_url")
    )]
    pub api_url: String,
    /// Proxy URL; defaults to the value of `default_proxy_url`.
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_proxy_url")
    )]
    pub proxy_url: String,
    /// Return format; defaults to the value of `default_return_format`.
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_return_format")
    )]
    pub return_format: String,
    /// Optional extra JSON parameters; omitted from serialized output when `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<hashbrown::HashMap<String, serde_json::Value>>,
}
1905
#[cfg(feature = "spider_cloud")]
impl Default for SpiderCloudConfig {
    /// Empty API key, default mode, the built-in default URLs and return
    /// format, and no extra parameters.
    fn default() -> Self {
        let api_url = Self::default_api_url();
        let proxy_url = Self::default_proxy_url();
        let return_format = Self::default_return_format();

        Self {
            api_key: String::new(),
            mode: Default::default(),
            api_url,
            proxy_url,
            return_format,
            extra_params: None,
        }
    }
}
1919
#[cfg(feature = "spider_cloud")]
impl SpiderCloudConfig {
    /// Create a configuration with the given API key and defaults for
    /// everything else.
    pub fn new(api_key: impl Into<String>) -> Self {
        Self {
            api_key: api_key.into(),
            ..Default::default()
        }
    }

    /// Set the operating mode, consuming and returning the configuration.
    pub fn with_mode(mut self, mode: SpiderCloudMode) -> Self {
        self.mode = mode;
        self
    }

    /// Set the API URL, consuming and returning the configuration.
    pub fn with_api_url(mut self, url: impl Into<String>) -> Self {
        self.api_url = url.into();
        self
    }

    /// Set the proxy URL, consuming and returning the configuration.
    pub fn with_proxy_url(mut self, url: impl Into<String>) -> Self {
        self.proxy_url = url.into();
        self
    }

    /// Set the return format, consuming and returning the configuration.
    pub fn with_return_format(mut self, fmt: impl Into<String>) -> Self {
        self.return_format = fmt.into();
        self
    }

    /// Set the extra parameters, consuming and returning the configuration.
    pub fn with_extra_params(
        mut self,
        params: hashbrown::HashMap<String, serde_json::Value>,
    ) -> Self {
        self.extra_params = Some(params);
        self
    }

    /// Decide whether a response should trigger the fallback route.
    ///
    /// - `Api`, `Unblocker`, `Proxy`: never.
    /// - `Fallback`: when `status_code` is 403, 429 or >= 500.
    /// - `Smart`: as `Fallback`, and additionally when `body` is `Some` and is
    ///   empty or its first 4096 bytes contain a known bot-protection marker.
    ///   A `None` body never triggers the content check.
    pub fn should_fallback(&self, status_code: u16, body: Option<&[u8]>) -> bool {
        // 503 and 520..=530 are already covered by the `>= 500` test.
        let blocked_status = matches!(status_code, 403 | 429) || status_code >= 500;

        match self.mode {
            SpiderCloudMode::Api | SpiderCloudMode::Unblocker | SpiderCloudMode::Proxy => false,
            SpiderCloudMode::Fallback => blocked_status,
            SpiderCloudMode::Smart => {
                blocked_status || body.map_or(false, Self::body_looks_blocked)
            }
        }
    }

    /// Return `true` when `body` is empty or its first 4096 bytes contain one
    /// of the known bot-protection markers (compared in lowercase).
    fn body_looks_blocked(body: &[u8]) -> bool {
        if body.is_empty() {
            return true;
        }

        // Only the head of the document is inspected. Lossy decoding means
        // invalid UTF-8 cannot make the check fail.
        let head = &body[..body.len().min(4096)];
        let lower = String::from_utf8_lossy(head).to_lowercase();
        let has = |needle: &str| lower.contains(needle);

        // Cloudflare challenge pages.
        has("cf-browser-verification")
            || (has("cloudflare") && has("challenge-platform"))
            // Generic CAPTCHA / bot-wall wording.
            || (has("captcha") && has("challenge"))
            || has("please verify you are a human")
            || (has("access denied") && has("automated"))
            || has("bot detection")
            // Vendor-specific markers.
            || has("distil_r_captcha")
            || has("_imperva")
            || (has("akamai") && has("bot manager"))
    }

    /// Name of the route used when falling back: `"unblocker"` for `Smart`
    /// and `Unblocker`, `"crawl"` for every other mode.
    pub fn fallback_route(&self) -> &'static str {
        match self.mode {
            SpiderCloudMode::Smart | SpiderCloudMode::Unblocker => "unblocker",
            SpiderCloudMode::Proxy | SpiderCloudMode::Api | SpiderCloudMode::Fallback => "crawl",
        }
    }

    /// Whether the current mode routes traffic through the proxy
    /// (`Proxy`, `Fallback` and `Smart` do).
    pub fn uses_proxy(&self) -> bool {
        matches!(
            self.mode,
            SpiderCloudMode::Proxy | SpiderCloudMode::Fallback | SpiderCloudMode::Smart
        )
    }

    /// Default for `api_url`.
    fn default_api_url() -> String {
        "https://api.spider.cloud".to_string()
    }

    /// Default for `proxy_url`.
    fn default_proxy_url() -> String {
        "https://proxy.spider.cloud".to_string()
    }

    /// Default for `return_format`.
    fn default_return_format() -> String {
        "raw".to_string()
    }
}
2066
#[cfg(test)]
mod tests {
    use super::*;
    use case_insensitive_string::CaseInsensitiveString;

    /// A default configuration has the checked switches off, no delay, and
    /// the checked optional settings unset.
    #[test]
    fn test_configuration_defaults() {
        let config = Configuration::default();

        // Boolean switches.
        assert!(!config.respect_robots_txt);
        assert!(!config.subdomains);
        assert!(!config.tld);
        assert!(!config.http2_prior_knowledge);

        // Numeric settings.
        assert_eq!(config.delay, 0);

        // Optional settings.
        assert!(config.user_agent.is_none());
        assert!(config.blacklist_url.is_none());
        assert!(config.whitelist_url.is_none());
        assert!(config.proxies.is_none());
    }

    /// `Loose` is the default redirect policy and all variants are distinct.
    #[test]
    fn test_redirect_policy_variants() {
        assert_eq!(RedirectPolicy::default(), RedirectPolicy::Loose);

        let (strict, none) = (RedirectPolicy::Strict, RedirectPolicy::None);
        assert_ne!(strict, RedirectPolicy::Loose);
        assert_ne!(none, RedirectPolicy::Loose);
        assert_ne!(strict, none);
    }

    /// `No` is the default proxy-ignore setting and all variants are distinct.
    #[test]
    fn test_proxy_ignore_variants() {
        assert_eq!(ProxyIgnore::default(), ProxyIgnore::No);

        let (chrome, http) = (ProxyIgnore::Chrome, ProxyIgnore::Http);
        assert_ne!(chrome, ProxyIgnore::No);
        assert_ne!(http, ProxyIgnore::No);
        assert_ne!(chrome, http);
    }

    /// Fields given to a `RequestProxy` literal are readable afterwards.
    #[test]
    fn test_request_proxy_construction() {
        let addr = "http://proxy.example.com:8080";
        let proxy = RequestProxy {
            addr: addr.to_string(),
            ignore: ProxyIgnore::No,
        };

        assert_eq!(proxy.addr, "http://proxy.example.com:8080");
        assert_eq!(proxy.ignore, ProxyIgnore::No);
    }

    /// The default `RequestProxy` has an empty address and ignores nothing.
    #[test]
    fn test_request_proxy_default() {
        let proxy = RequestProxy::default();

        assert!(proxy.addr.is_empty());
        assert_eq!(proxy.ignore, ProxyIgnore::No);
    }

    /// A blacklist assigned to the configuration keeps all of its entries.
    #[test]
    fn test_configuration_blacklist_setup() {
        let mut config = Configuration::default();
        config.blacklist_url = Some(vec![
            "https://example.com/private".into(),
            "https://example.com/admin".into(),
        ]);

        let blacklist = config.blacklist_url.as_ref().unwrap();
        assert_eq!(blacklist.len(), 2);
    }

    /// A whitelist assigned to the configuration keeps its entry.
    #[test]
    fn test_configuration_whitelist_setup() {
        let mut config = Configuration::default();
        config.whitelist_url = Some(vec!["https://example.com/public".into()]);

        let whitelist = config.whitelist_url.as_ref().unwrap();
        assert_eq!(whitelist.len(), 1);
    }

    /// External domains are matched without regard to letter case.
    #[test]
    fn test_configuration_external_domains() {
        let mut config = Configuration::default();
        config.external_domains_caseless = Arc::new(
            [
                CaseInsensitiveString::from("Example.Com"),
                CaseInsensitiveString::from("OTHER.org"),
            ]
            .into_iter()
            .collect(),
        );

        assert_eq!(config.external_domains_caseless.len(), 2);

        // Stored as "Example.Com", looked up in lowercase.
        let lowercase = CaseInsensitiveString::from("example.com");
        assert!(config.external_domains_caseless.contains(&lowercase));
    }

    /// A budget entry stored on the configuration can be read back by key.
    #[test]
    fn test_configuration_budget() {
        let mut config = Configuration::default();

        let mut budget = hashbrown::HashMap::new();
        budget.insert(CaseInsensitiveString::from("/path"), 100u32);
        config.budget = Some(budget);

        assert!(config.budget.is_some());

        let key = CaseInsensitiveString::from("/path");
        assert_eq!(config.budget.as_ref().unwrap().get(&key), Some(&100u32));
    }

    /// Without the `regex` feature the default allow list is empty.
    #[cfg(not(feature = "regex"))]
    #[test]
    fn test_allow_list_set_default() {
        let allow_list = AllowListSet::default();
        assert!(allow_list.0.is_empty());
    }

    /// Building the remote multimodal engine keeps the separate vision and
    /// text models as well as the routing mode.
    #[cfg(feature = "agent")]
    #[test]
    fn test_build_remote_multimodal_engine_preserves_dual_models() {
        use crate::features::automation::{
            ModelEndpoint, RemoteMultimodalConfigs, VisionRouteMode,
        };

        let vision_endpoint = ModelEndpoint::new("vision-model").with_api_key("vision-key");
        let text_endpoint = ModelEndpoint::new("text-model")
            .with_api_url("https://text.example.com/v1/chat/completions")
            .with_api_key("text-key");

        let mm = RemoteMultimodalConfigs::new(
            "https://api.example.com/v1/chat/completions",
            "primary-model",
        )
        .with_vision_model(vision_endpoint)
        .with_text_model(text_endpoint)
        .with_vision_route_mode(VisionRouteMode::TextFirst);

        let mut config = Configuration::default();
        config.remote_multimodal = Some(Box::new(mm));

        let engine = config
            .build_remote_multimodal_engine()
            .expect("engine should be built");

        let vision_name = engine.vision_model.as_ref().map(|m| m.model_name.as_str());
        let text_name = engine.text_model.as_ref().map(|m| m.model_name.as_str());

        assert_eq!(vision_name, Some("vision-model"));
        assert_eq!(text_name, Some("text-model"));
        assert_eq!(engine.vision_route_mode, VisionRouteMode::TextFirst);
    }
}