1use crate::compact_str::CompactString;
2use crate::features::chrome_common::RequestInterceptConfiguration;
3pub use crate::features::chrome_common::{
4 AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts, AutomationScriptsMap,
5 CaptureScreenshotFormat, CaptureScreenshotParams, ClipViewport, ExecutionScripts,
6 ExecutionScriptsMap, ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay,
7 WaitForIdleNetwork, WaitForSelector, WebAutomation,
8};
9pub use crate::features::gemini_common::GeminiConfigs;
10pub use crate::features::openai_common::GPTConfigs;
11#[cfg(feature = "search")]
12pub use crate::features::search::{
13 SearchError, SearchOptions, SearchResult, SearchResults, TimeRange,
14};
15pub use crate::features::webdriver_common::{WebDriverBrowser, WebDriverConfig};
16use crate::utils::get_domain_from_url;
17use crate::utils::BasicCachePolicy;
18use crate::website::CronType;
19use reqwest::header::{AsHeaderName, HeaderMap, HeaderName, HeaderValue, IntoHeaderName};
20use std::net::IpAddr;
21use std::sync::Arc;
22use std::time::Duration;
23
24#[cfg(feature = "chrome")]
25pub use spider_fingerprint::Fingerprint;
26
/// Returns `true` when `key` is blank or one of the well-known placeholder
/// API-key strings.
///
/// Surrounding whitespace is ignored and the comparison is ASCII
/// case-insensitive.
pub fn is_placeholder_api_key(key: &str) -> bool {
    const PLACEHOLDERS: [&str; 4] = ["YOUR_API_KEY", "YOUR-API-KEY", "API_KEY", "API-KEY"];

    let candidate = key.trim();
    if candidate.is_empty() {
        return true;
    }

    PLACEHOLDERS
        .iter()
        .any(|placeholder| candidate.eq_ignore_ascii_case(placeholder))
}
36
/// Redirect policy stored in [`Configuration::redirect_policy`].
///
/// With the `serde` feature each variant also accepts its capitalized,
/// lowercase and uppercase spelling as an alias when deserializing.
// NOTE(review): the exact redirect-following semantics of each variant are
// implemented by the consumer of this enum, not in this file.
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum RedirectPolicy {
    /// Loose policy. This is the default variant.
    #[default]
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Loose", alias = "loose", alias = "LOOSE",)
    )]
    Loose,
    /// Strict policy.
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Strict", alias = "strict", alias = "STRICT",)
    )]
    Strict,
    /// No policy.
    #[cfg_attr(
        feature = "serde",
        serde(alias = "None", alias = "none", alias = "NONE",)
    )]
    None,
}
61
/// Allow/deny list representation without the `regex` feature: a plain list
/// of strings.
#[cfg(not(feature = "regex"))]
pub type AllowList = Vec<CompactString>;

/// Allow/deny list representation with the `regex` feature: a boxed
/// compiled [`regex::RegexSet`].
#[cfg(feature = "regex")]
pub type AllowList = Box<regex::RegexSet>;
69
/// Newtype around [`AllowList`] so that custom serde implementations can be
/// attached to it.
///
/// `PartialEq`/`Eq` are only derived without the `regex` feature.
#[derive(Debug, Default, Clone)]
#[cfg_attr(not(feature = "regex"), derive(PartialEq, Eq))]
pub struct AllowListSet(pub AllowList);
74
/// Flags selecting which Chrome event categories are tracked.
#[cfg(feature = "chrome")]
#[derive(Debug, PartialEq, Eq, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ChromeEventTracker {
    /// Track responses.
    pub responses: bool,
    /// Track requests.
    pub requests: bool,
    /// Track automation events.
    pub automation: bool,
}
87
#[cfg(feature = "chrome")]
impl ChromeEventTracker {
    /// Builds a tracker with the given request/response flags.
    ///
    /// `automation` is always enabled by this constructor.
    pub fn new(requests: bool, responses: bool) -> Self {
        Self {
            responses,
            requests,
            automation: true,
        }
    }
}
99
/// Records which sitemap entries `Configuration::add_sitemap_to_whitelist`
/// pushed onto the whitelist, so they can be removed again afterwards.
#[cfg(feature = "sitemap")]
#[derive(Debug, Default)]
pub struct SitemapWhitelistChanges {
    /// The default `sitemap.xml` entry was added.
    pub added_default: bool,
    /// The custom sitemap URL was added.
    pub added_custom: bool,
}
109
#[cfg(feature = "sitemap")]
impl SitemapWhitelistChanges {
    /// Returns `true` when at least one entry was added to the whitelist.
    pub(crate) fn modified(&self) -> bool {
        self.added_default || self.added_custom
    }
}
117
/// Selects a client kind for which a proxy entry is ignored.
// NOTE(review): how each variant is honoured is decided by the code that
// consumes `RequestProxy::ignore`, which is not visible in this file.
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProxyIgnore {
    /// Ignore for Chrome.
    Chrome,
    /// Ignore for HTTP.
    Http,
    /// Do not ignore. This is the default variant.
    #[default]
    No,
}
130
/// A single proxy entry used by [`Configuration::proxies`].
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct RequestProxy {
    /// Proxy address.
    pub addr: String,
    /// Ignore setting for this proxy (defaults to [`ProxyIgnore::No`]).
    pub ignore: ProxyIgnore,
}
140
/// Protocol used to talk to a parallel backend.
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BackendProtocol {
    /// CDP protocol.
    Cdp,
    /// WebDriver protocol.
    WebDriver,
}
151
/// Engine kind of a parallel backend.
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BackendEngine {
    /// CDP engine. This is the default variant.
    #[default]
    Cdp,
    /// Servo engine.
    Servo,
    /// Custom engine.
    Custom,
}
166
/// Description of one parallel backend.
///
/// With `serde`, missing fields fall back to their `Default` values.
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct BackendEndpoint {
    /// Engine kind of this backend.
    pub engine: BackendEngine,
    /// Optional endpoint string for the backend.
    pub endpoint: Option<String>,
    /// Optional path to a backend binary.
    pub binary_path: Option<String>,
    /// Optional explicit protocol for the backend.
    pub protocol: Option<BackendProtocol>,
    /// Optional proxy for this backend.
    pub proxy: Option<String>,
}
199
/// Settings for running parallel backends.
///
/// With `serde`, missing fields fall back to the values of the `Default`
/// implementation.
// NOTE(review): field descriptions below restate the field names; the precise
// runtime semantics live in the backend implementation, not in this file.
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct ParallelBackendsConfig {
    /// Backends to use.
    pub backends: Vec<BackendEndpoint>,
    /// Grace period in milliseconds.
    pub grace_period_ms: u64,
    /// Whether parallel backends are enabled.
    pub enabled: bool,
    /// Threshold for fast acceptance.
    pub fast_accept_threshold: u16,
    /// Maximum number of consecutive errors.
    pub max_consecutive_errors: u16,
    /// Connect timeout in milliseconds.
    pub connect_timeout_ms: u64,
    /// Whether binary content types are skipped.
    pub skip_binary_content_types: bool,
    /// Maximum number of concurrent sessions.
    pub max_concurrent_sessions: usize,
    /// Extensions to skip.
    pub skip_extensions: Vec<CompactString>,
    /// Maximum backend bytes in flight.
    pub max_backend_bytes_in_flight: usize,
    /// Backend timeout in milliseconds.
    pub backend_timeout_ms: u64,
}
253
#[cfg(feature = "parallel_backends")]
impl Default for ParallelBackendsConfig {
    /// Enabled configuration with no backends and no skipped extensions.
    fn default() -> Self {
        Self {
            enabled: true,
            backends: Vec::new(),
            skip_extensions: Vec::new(),
            skip_binary_content_types: true,
            // Timing knobs (milliseconds).
            grace_period_ms: 500,
            connect_timeout_ms: 5_000,
            backend_timeout_ms: 30_000,
            // Thresholds and limits.
            fast_accept_threshold: 80,
            max_consecutive_errors: 10,
            max_concurrent_sessions: 8,
            // 256 MiB.
            max_backend_bytes_in_flight: 256 * 1024 * 1024,
        }
    }
}
272
/// User-supplied patterns for anti-bot detection, grouped by where they are
/// matched.
// NOTE(review): the matching strategy (substring, regex, ...) is defined by the
// consumer of this struct — confirm before documenting further.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct CustomAntibotPatterns {
    /// Patterns for the body.
    pub body: Vec<CompactString>,
    /// Patterns for the URL.
    pub url: Vec<CompactString>,
    /// Patterns for header keys.
    pub header_keys: Vec<CompactString>,
}
285
286#[derive(Debug, Default, Clone)]
296#[cfg_attr(
297 all(
298 not(feature = "regex"),
299 not(feature = "openai"),
300 not(feature = "cache_openai"),
301 not(feature = "gemini"),
302 not(feature = "cache_gemini")
303 ),
304 derive(PartialEq)
305)]
306#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
307#[cfg_attr(feature = "serde", serde(default))]
308pub struct Configuration {
309 pub respect_robots_txt: bool,
311 pub subdomains: bool,
313 pub tld: bool,
315 pub crawl_timeout: Option<Duration>,
317 pub preserve_host_header: bool,
319 pub blacklist_url: Option<Vec<CompactString>>,
321 pub whitelist_url: Option<Vec<CompactString>>,
323 pub user_agent: Option<Box<CompactString>>,
325 pub delay: u64,
327 pub request_timeout: Option<Duration>,
329 pub http2_prior_knowledge: bool,
331 pub proxies: Option<Vec<RequestProxy>>,
333 pub headers: Option<Box<SerializableHeaderMap>>,
335 #[cfg(feature = "sitemap")]
336 pub sitemap_url: Option<Box<CompactString>>,
338 #[cfg(feature = "sitemap")]
339 pub ignore_sitemap: bool,
341 pub redirect_limit: usize,
343 pub redirect_policy: RedirectPolicy,
345 #[cfg(feature = "cookies")]
346 pub cookie_str: String,
348 #[cfg(feature = "wreq")]
349 pub emulation: Option<wreq_util::Emulation>,
351 #[cfg(feature = "cron")]
352 pub cron_str: String,
354 #[cfg(feature = "cron")]
355 pub cron_type: CronType,
357 pub depth: usize,
359 pub depth_distance: usize,
361 pub stealth_mode: spider_fingerprint::configs::Tier,
363 pub viewport: Option<Viewport>,
365 pub budget: Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
367 pub wild_card_budgeting: bool,
369 pub external_domains_caseless:
371 Arc<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>,
372 pub full_resources: bool,
374 pub accept_invalid_certs: bool,
376 pub auth_challenge_response: Option<AuthChallengeResponse>,
378 pub openai_config: Option<Box<GPTConfigs>>,
380 pub gemini_config: Option<Box<GeminiConfigs>>,
382 pub remote_multimodal: Option<Box<crate::features::automation::RemoteMultimodalConfigs>>,
385 pub shared_queue: bool,
387 pub return_page_links: bool,
389 pub retry: u8,
391 pub custom_antibot: Option<CustomAntibotPatterns>,
394 pub no_control_thread: bool,
396 blacklist: AllowListSet,
398 whitelist: AllowListSet,
400 pub(crate) inner_budget:
402 Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
403 pub only_html: bool,
405 pub concurrency_limit: Option<usize>,
407 pub normalize: bool,
409 pub shared: bool,
411 pub modify_headers: bool,
413 pub modify_http_client_headers: bool,
415 #[cfg(any(
417 feature = "cache_request",
418 feature = "chrome",
419 feature = "chrome_remote_cache"
420 ))]
421 pub cache: bool,
422 #[cfg(any(
425 feature = "cache_request",
426 feature = "chrome",
427 feature = "chrome_remote_cache"
428 ))]
429 pub cache_skip_browser: bool,
430 pub cache_namespace: Option<Box<String>>,
437 #[cfg(feature = "chrome")]
438 pub service_worker_enabled: bool,
440 #[cfg(feature = "chrome")]
441 #[cfg(feature = "chrome")]
443 pub timezone_id: Option<Box<String>>,
444 #[cfg(feature = "chrome")]
446 pub locale: Option<Box<String>>,
447 #[cfg(feature = "chrome")]
449 pub evaluate_on_new_document: Option<Box<String>>,
450 #[cfg(feature = "chrome")]
451 pub dismiss_dialogs: Option<bool>,
453 #[cfg(feature = "chrome")]
454 pub wait_for: Option<WaitFor>,
456 #[cfg(feature = "chrome")]
457 pub screenshot: Option<ScreenShotConfig>,
459 #[cfg(feature = "chrome")]
460 pub track_events: Option<ChromeEventTracker>,
462 #[cfg(feature = "chrome")]
463 pub fingerprint: Fingerprint,
465 #[cfg(feature = "chrome")]
466 pub chrome_connection_url: Option<String>,
468 #[cfg(feature = "chrome")]
469 pub chrome_connection_urls: Option<Vec<String>>,
473 #[cfg(feature = "chrome")]
475 pub execution_scripts: Option<ExecutionScripts>,
476 #[cfg(feature = "chrome")]
478 pub automation_scripts: Option<AutomationScripts>,
479 #[cfg(feature = "chrome")]
481 pub chrome_intercept: RequestInterceptConfiguration,
482 pub referer: Option<String>,
484 pub max_page_bytes: Option<f64>,
486 pub max_bytes_allowed: Option<u64>,
488 #[cfg(feature = "chrome")]
489 pub disable_log: bool,
491 #[cfg(feature = "chrome")]
492 pub auto_geolocation: bool,
494 pub cache_policy: Option<BasicCachePolicy>,
496 #[cfg(feature = "chrome")]
497 pub bypass_csp: bool,
499 #[cfg(feature = "chrome")]
500 pub disable_javascript: bool,
502 pub network_interface: Option<String>,
504 pub local_address: Option<IpAddr>,
506 pub default_http_connect_timeout: Option<Duration>,
508 pub default_http_read_timeout: Option<Duration>,
510 #[cfg(feature = "webdriver")]
511 pub webdriver_config: Option<Box<WebDriverConfig>>,
513 #[cfg(feature = "search")]
514 pub search_config: Option<Box<SearchConfig>>,
516 #[cfg(feature = "spider_cloud")]
517 pub spider_cloud: Option<Box<SpiderCloudConfig>>,
519 #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
520 pub spider_browser: Option<Box<SpiderBrowserConfig>>,
522 #[cfg(feature = "hedge")]
523 pub hedge: Option<crate::utils::hedge::HedgeConfig>,
526 #[cfg(feature = "auto_throttle")]
527 pub auto_throttle: Option<crate::utils::auto_throttle::AutoThrottleConfig>,
530 #[cfg(feature = "etag_cache")]
531 pub etag_cache: bool,
536 #[cfg(feature = "warc")]
537 pub warc: Option<crate::utils::warc::WarcConfig>,
540 #[cfg(feature = "parallel_backends")]
541 pub parallel_backends: Option<ParallelBackendsConfig>,
544}
545
/// Newtype around [`HeaderMap`] that carries custom serde implementations.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct SerializableHeaderMap(pub HeaderMap);
549
550impl SerializableHeaderMap {
551 pub fn inner(&self) -> &HeaderMap {
553 &self.0
554 }
555 pub fn contains_key<K>(&self, key: K) -> bool
557 where
558 K: AsHeaderName,
559 {
560 self.0.contains_key(key)
561 }
562 pub fn insert<K>(
564 &mut self,
565 key: K,
566 val: reqwest::header::HeaderValue,
567 ) -> Option<reqwest::header::HeaderValue>
568 where
569 K: IntoHeaderName,
570 {
571 self.0.insert(key, val)
572 }
573 pub fn extend<I>(&mut self, iter: I)
575 where
576 I: IntoIterator<Item = (Option<HeaderName>, HeaderValue)>,
577 {
578 self.0.extend(iter);
579 }
580}
581
582pub fn get_referer(header_map: &Option<Box<SerializableHeaderMap>>) -> Option<String> {
584 match header_map {
585 Some(header_map) => {
586 header_map
587 .0
588 .get(crate::client::header::REFERER) .and_then(|value| value.to_str().ok()) .map(String::from) }
592 _ => None,
593 }
594}
595
596impl From<HeaderMap> for SerializableHeaderMap {
597 fn from(header_map: HeaderMap) -> Self {
598 SerializableHeaderMap(header_map)
599 }
600}
601
#[cfg(feature = "serde")]
impl serde::Serialize for SerializableHeaderMap {
    /// Serializes the headers as a sorted string-to-string map.
    ///
    /// Values that cannot be converted with `to_str` are written as empty
    /// strings; when a name occurs more than once only the last value is kept.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let mut flattened = std::collections::BTreeMap::<String, String>::new();
        for (name, value) in self.0.iter() {
            flattened.insert(name.to_string(), value.to_str().unwrap_or("").to_string());
        }
        serde::Serialize::serialize(&flattened, serializer)
    }
}
616
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for SerializableHeaderMap {
    /// Deserializes a string-to-string map into a header map.
    ///
    /// Fails with a custom serde error when a key is not a valid header name
    /// or a value is not a valid header value.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use reqwest::header::{HeaderName, HeaderValue};
        use std::collections::BTreeMap;

        let raw: BTreeMap<String, String> = serde::Deserialize::deserialize(deserializer)?;
        let mut parsed = HeaderMap::with_capacity(raw.len());
        for (raw_name, raw_value) in raw {
            let name =
                HeaderName::from_bytes(raw_name.as_bytes()).map_err(serde::de::Error::custom)?;
            let value = HeaderValue::from_str(&raw_value).map_err(serde::de::Error::custom)?;
            parsed.insert(name, value);
        }
        Ok(Self(parsed))
    }
}
635
#[cfg(feature = "serde")]
impl serde::Serialize for AllowListSet {
    /// Serializes the list as a sequence of strings.
    ///
    /// With the `regex` feature the source patterns of the compiled set are
    /// written out.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        #[cfg(not(feature = "regex"))]
        {
            serde::Serialize::serialize(&self.0, serializer)
        }

        #[cfg(feature = "regex")]
        {
            let patterns: Vec<&String> = self.0.patterns().iter().collect();
            serde::Serialize::serialize(&patterns, serializer)
        }
    }
}
657
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for AllowListSet {
    /// Deserializes a sequence of strings.
    ///
    /// With the `regex` feature the strings are compiled into a `RegexSet`;
    /// a compilation failure is reported as a custom serde error.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        #[cfg(not(feature = "regex"))]
        {
            let entries: Vec<CompactString> = serde::Deserialize::deserialize(deserializer)?;
            Ok(AllowListSet(entries))
        }

        #[cfg(feature = "regex")]
        {
            let sources: Vec<String> = serde::Deserialize::deserialize(deserializer)?;
            let compiled = regex::RegexSet::new(&sources).map_err(serde::de::Error::custom)?;
            Ok(AllowListSet(Box::new(compiled)))
        }
    }
}
678
/// Returns a spoofed User-Agent string from `ua_generator`.
///
/// `chrome` selects the Chrome-specific generator.
#[cfg(feature = "ua_generator")]
pub fn get_ua(chrome: bool) -> &'static str {
    match chrome {
        true => ua_generator::ua::spoof_chrome_ua(),
        false => ua_generator::ua::spoof_ua(),
    }
}
688
689#[cfg(not(feature = "ua_generator"))]
691pub fn get_ua(_chrome: bool) -> &'static str {
692 use std::env;
693
694 lazy_static! {
695 static ref AGENT: &'static str =
696 concat!(env!("CARGO_PKG_NAME"), '/', env!("CARGO_PKG_VERSION"));
697 };
698
699 AGENT.as_ref()
700}
701
702impl Configuration {
703 #[cfg(not(feature = "chrome"))]
705 pub fn new() -> Self {
706 Self {
707 delay: 0,
708 depth: 25,
709 redirect_limit: 7,
710 request_timeout: Some(Duration::from_secs(120)),
711 only_html: true,
712 modify_headers: true,
713 ..Default::default()
714 }
715 }
716
717 #[cfg(feature = "chrome")]
719 pub fn new() -> Self {
720 Self {
721 delay: 0,
722 depth: 25,
723 redirect_limit: 7,
724 request_timeout: Some(Duration::from_secs(120)),
725 chrome_intercept: RequestInterceptConfiguration::new(cfg!(
726 feature = "chrome_intercept"
727 )),
728 user_agent: Some(Box::new(get_ua(true).into())),
729 only_html: true,
730 cache: true,
731 modify_headers: true,
732 service_worker_enabled: true,
733 fingerprint: Fingerprint::Basic,
734 auto_geolocation: false,
735 ..Default::default()
736 }
737 }
738
    /// Builds a `RemoteMultimodalEngine` from `self.remote_multimodal`.
    ///
    /// Returns `None` when no remote multimodal configuration is set.
    #[cfg(feature = "agent")]
    pub fn build_remote_multimodal_engine(
        &self,
    ) -> Option<crate::features::automation::RemoteMultimodalEngine> {
        let cfgs = self.remote_multimodal.as_ref()?;
        // A semaphore is only created for a positive concurrency limit.
        let sem = cfgs
            .concurrency_limit
            .filter(|&n| n > 0)
            .map(|n| std::sync::Arc::new(tokio::sync::Semaphore::new(n)));

        // `mut` is only needed when `agent_skills` is enabled.
        #[allow(unused_mut)]
        let mut engine = crate::features::automation::RemoteMultimodalEngine::new(
            cfgs.api_url.clone(),
            cfgs.model_name.clone(),
            cfgs.system_prompt.clone(),
        )
        .with_api_key(cfgs.api_key.as_deref())
        .with_system_prompt_extra(cfgs.system_prompt_extra.as_deref())
        .with_user_message_extra(cfgs.user_message_extra.as_deref())
        .with_remote_multimodal_config(cfgs.cfg.clone())
        .with_prompt_url_gate(cfgs.prompt_url_gate.clone())
        .with_vision_model(cfgs.vision_model.clone())
        .with_text_model(cfgs.text_model.clone())
        .with_vision_route_mode(cfgs.vision_route_mode)
        .with_chrome_ai(cfgs.use_chrome_ai)
        .with_semaphore(sem)
        .to_owned();

        #[cfg(feature = "agent_skills")]
        if let Some(ref registry) = cfgs.skill_registry {
            engine.with_skill_registry(Some(registry.clone()));
        }

        // A model router is only installed when the pool holds at least three models.
        let model_pool = cfgs.model_pool.clone();
        if model_pool.len() >= 3 {
            let model_names: Vec<&str> =
                model_pool.iter().map(|ep| ep.model_name.as_str()).collect();
            let policy = crate::features::automation::auto_policy(&model_names);
            engine.model_router = Some(crate::features::automation::ModelRouter::with_policy(
                policy,
            ));
        }
        engine.model_pool = model_pool;

        Some(engine)
    }
788
    /// Always `false` without the `chrome` feature.
    #[cfg(not(feature = "chrome"))]
    pub(crate) fn only_chrome_agent(&self) -> bool {
        false
    }

    /// Returns `true` when any Chrome-specific setting is active: a connection
    /// URL, a wait-for setting, enabled request interception, a stealth tier
    /// reporting `stealth()`, or a fingerprint reporting `valid()`.
    #[cfg(feature = "chrome")]
    pub(crate) fn only_chrome_agent(&self) -> bool {
        self.chrome_connection_url.is_some()
            || self.wait_for.is_some()
            || self.chrome_intercept.enabled
            || self.stealth_mode.stealth()
            || self.fingerprint.valid()
    }
804
805 #[cfg(feature = "regex")]
806 pub fn get_blacklist(&self) -> Box<regex::RegexSet> {
808 match &self.blacklist_url {
809 Some(blacklist) => match regex::RegexSet::new(&**blacklist) {
810 Ok(s) => Box::new(s),
811 _ => Default::default(),
812 },
813 _ => Default::default(),
814 }
815 }
816
817 #[cfg(not(feature = "regex"))]
818 pub fn get_blacklist(&self) -> AllowList {
820 match &self.blacklist_url {
821 Some(blacklist) => blacklist.to_owned(),
822 _ => Default::default(),
823 }
824 }
825
    /// Rebuilds the compiled blacklist from `blacklist_url`.
    pub(crate) fn set_blacklist(&mut self) {
        self.blacklist = AllowListSet(self.get_blacklist());
    }
830
    /// Rebuilds the compiled whitelist from `whitelist_url`.
    pub fn set_whitelist(&mut self) {
        self.whitelist = AllowListSet(self.get_whitelist());
    }
835
    /// Rebuilds both the compiled whitelist and the compiled blacklist.
    pub fn configure_allowlist(&mut self) {
        self.set_whitelist();
        self.set_blacklist();
    }
841
    /// Borrows the compiled blacklist built by `set_blacklist`.
    pub fn get_blacklist_compiled(&self) -> &AllowList {
        &self.blacklist.0
    }
846
    /// Copies `budget` into the internal working budget.
    pub fn configure_budget(&mut self) {
        self.inner_budget.clone_from(&self.budget);
    }
851
    /// Borrows the compiled whitelist built by `set_whitelist`.
    pub fn get_whitelist_compiled(&self) -> &AllowList {
        &self.whitelist.0
    }
856
857 #[cfg(feature = "regex")]
858 pub fn get_whitelist(&self) -> Box<regex::RegexSet> {
860 match &self.whitelist_url {
861 Some(whitelist) => match regex::RegexSet::new(&**whitelist) {
862 Ok(s) => Box::new(s),
863 _ => Default::default(),
864 },
865 _ => Default::default(),
866 }
867 }
868
869 #[cfg(not(feature = "regex"))]
870 pub fn get_whitelist(&self) -> AllowList {
872 match &self.whitelist_url {
873 Some(whitelist) => whitelist.to_owned(),
874 _ => Default::default(),
875 }
876 }
877
    /// Adds sitemap entries to a non-empty whitelist and reports what was added.
    ///
    /// Pushes `sitemap.xml` and, when set, the custom `sitemap_url`, skipping
    /// entries that are already present. Nothing is added when the whitelist
    /// is unset or empty. Pass the result to `remove_sitemap_from_whitelist`
    /// to undo the additions.
    #[cfg(feature = "sitemap")]
    pub fn add_sitemap_to_whitelist(&mut self) -> SitemapWhitelistChanges {
        let mut changes = SitemapWhitelistChanges::default();

        // NOTE(review): this guard only fires when the whitelist is `None`, in
        // which case the `if let` below is skipped anyway, so `ignore_sitemap`
        // does not affect the result. Confirm whether sitemap entries should
        // still be added when `ignore_sitemap` is set and a whitelist exists.
        if self.ignore_sitemap && self.whitelist_url.is_none() {
            return changes;
        }

        if let Some(list) = self.whitelist_url.as_mut() {
            // An empty whitelist is left untouched.
            if list.is_empty() {
                return changes;
            }

            let default = CompactString::from("sitemap.xml");

            if !list.contains(&default) {
                list.push(default);
                changes.added_default = true;
            }

            if let Some(custom) = &self.sitemap_url {
                if !list.contains(custom) {
                    list.push(*custom.clone());
                    changes.added_custom = true;
                }
            }
        }

        changes
    }
909
    /// Removes the whitelist entries recorded in `changes`.
    ///
    /// Only the first matching occurrence of each entry is removed. When the
    /// whitelist ends up empty it is reset to `None`.
    #[cfg(feature = "sitemap")]
    pub fn remove_sitemap_from_whitelist(&mut self, changes: SitemapWhitelistChanges) {
        if let Some(list) = self.whitelist_url.as_mut() {
            if changes.added_default {
                let default = CompactString::from("sitemap.xml");
                if let Some(pos) = list.iter().position(|s| s == default) {
                    list.remove(pos);
                }
            }
            if changes.added_custom {
                if let Some(custom) = &self.sitemap_url {
                    if let Some(pos) = list.iter().position(|s| *s == **custom) {
                        list.remove(pos);
                    }
                }
            }
            if list.is_empty() {
                self.whitelist_url = None;
            }
        }
    }
932
    /// Sets `respect_robots_txt` and returns `self` for chaining.
    pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
        self.respect_robots_txt = respect_robots_txt;
        self
    }
938
    /// Sets `subdomains` and returns `self` for chaining.
    pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
        self.subdomains = subdomains;
        self
    }
944
    /// Sets `bypass_csp` and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
        self.bypass_csp = enabled;
        self
    }

    /// No-op without the `chrome` feature; returns `self` for chaining.
    #[cfg(not(feature = "chrome"))]
    pub fn with_csp_bypass(&mut self, _enabled: bool) -> &mut Self {
        self
    }
957
    /// Sets `disable_javascript` and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_disable_javascript(&mut self, disabled: bool) -> &mut Self {
        self.disable_javascript = disabled;
        self
    }

    /// No-op without the `chrome` feature; returns `self` for chaining.
    #[cfg(not(feature = "chrome"))]
    pub fn with_disable_javascript(&mut self, _disabled: bool) -> &mut Self {
        self
    }
970
    /// Sets `network_interface` and returns `self` for chaining.
    pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
        self.network_interface = network_interface;
        self
    }
976
    /// Sets `local_address` and returns `self` for chaining.
    pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
        self.local_address = local_address;
        self
    }
982
    /// Sets `tld` and returns `self` for chaining.
    pub fn with_tld(&mut self, tld: bool) -> &mut Self {
        self.tld = tld;
        self
    }
988
    /// Sets `crawl_timeout` and returns `self` for chaining.
    pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
        self.crawl_timeout = crawl_timeout;
        self
    }
994
    /// Sets `delay` and returns `self` for chaining.
    pub fn with_delay(&mut self, delay: u64) -> &mut Self {
        self.delay = delay;
        self
    }
1000
    /// Sets `http2_prior_knowledge` and returns `self` for chaining.
    pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
        self.http2_prior_knowledge = http2_prior_knowledge;
        self
    }
1006
1007 pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
1009 match request_timeout {
1010 Some(timeout) => self.request_timeout = Some(timeout),
1011 _ => self.request_timeout = None,
1012 };
1013
1014 self
1015 }
1016
1017 #[cfg(feature = "sitemap")]
1018 pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
1020 match sitemap_url {
1021 Some(sitemap_url) => {
1022 self.sitemap_url = Some(CompactString::new(sitemap_url.to_string()).into())
1023 }
1024 _ => self.sitemap_url = None,
1025 };
1026 self
1027 }
1028
1029 #[cfg(not(feature = "sitemap"))]
1030 pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
1032 self
1033 }
1034
    /// Sets `ignore_sitemap` and returns `self` for chaining.
    #[cfg(feature = "sitemap")]
    pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
        self.ignore_sitemap = ignore_sitemap;
        self
    }

    /// No-op without the `sitemap` feature; returns `self` for chaining.
    #[cfg(not(feature = "sitemap"))]
    pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self {
        self
    }
1047
1048 pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
1050 match user_agent {
1051 Some(agent) => self.user_agent = Some(CompactString::new(agent).into()),
1052 _ => self.user_agent = None,
1053 };
1054 self
1055 }
1056
    /// Sets `preserve_host_header` and returns `self` for chaining.
    pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
        self.preserve_host_header = preserve;
        self
    }
1062
1063 #[cfg(feature = "agent")]
1066 pub fn with_remote_multimodal(
1067 &mut self,
1068 remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
1069 ) -> &mut Self {
1070 self.remote_multimodal = remote_multimodal.map(Box::new);
1071 self
1072 }
1073
1074 #[cfg(not(feature = "agent"))]
1077 pub fn with_remote_multimodal(
1078 &mut self,
1079 remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
1080 ) -> &mut Self {
1081 self.remote_multimodal = remote_multimodal.map(Box::new);
1082 self
1083 }
1084
1085 #[cfg(not(feature = "openai"))]
1086 pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self {
1088 self
1089 }
1090
1091 #[cfg(feature = "openai")]
1093 pub fn with_openai(&mut self, openai_config: Option<GPTConfigs>) -> &mut Self {
1094 match openai_config {
1095 Some(openai_config) => self.openai_config = Some(Box::new(openai_config)),
1096 _ => self.openai_config = None,
1097 };
1098 self
1099 }
1100
1101 #[cfg(not(feature = "gemini"))]
1102 pub fn with_gemini(&mut self, _gemini_config: Option<GeminiConfigs>) -> &mut Self {
1104 self
1105 }
1106
1107 #[cfg(feature = "gemini")]
1109 pub fn with_gemini(&mut self, gemini_config: Option<GeminiConfigs>) -> &mut Self {
1110 match gemini_config {
1111 Some(gemini_config) => self.gemini_config = Some(Box::new(gemini_config)),
1112 _ => self.gemini_config = None,
1113 };
1114 self
1115 }
1116
    /// Sets the cookie string and returns `self` for chaining.
    #[cfg(feature = "cookies")]
    pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
        self.cookie_str = cookie_str.into();
        self
    }

    /// No-op without the `cookies` feature; returns `self` for chaining.
    #[cfg(not(feature = "cookies"))]
    pub fn with_cookies(&mut self, _cookie_str: &str) -> &mut Self {
        self
    }
1129
1130 #[cfg(feature = "chrome")]
1131 pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
1133 if fingerprint {
1134 self.fingerprint = Fingerprint::Basic;
1135 } else {
1136 self.fingerprint = Fingerprint::None;
1137 }
1138 self
1139 }
1140
    /// Sets an explicit `Fingerprint` value and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_fingerprint_advanced(&mut self, fingerprint: Fingerprint) -> &mut Self {
        self.fingerprint = fingerprint;
        self
    }
1147
    /// No-op without the `chrome` feature; returns `self` for chaining.
    #[cfg(not(feature = "chrome"))]
    pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self {
        self
    }
1153
1154 pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
1156 self.proxies = proxies.map(|p| {
1157 p.iter()
1158 .map(|addr| RequestProxy {
1159 addr: addr.to_owned(),
1160 ..Default::default()
1161 })
1162 .collect::<Vec<RequestProxy>>()
1163 });
1164 self
1165 }
1166
    /// Sets the proxies from ready-made `RequestProxy` values and returns
    /// `self` for chaining.
    pub fn with_proxies_direct(&mut self, proxies: Option<Vec<RequestProxy>>) -> &mut Self {
        self.proxies = proxies;
        self
    }
1172
    /// Sets `shared_queue` and returns `self` for chaining.
    pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
        self.shared_queue = shared_queue;
        self
    }
1178
1179 pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
1181 where
1182 Vec<CompactString>: From<Vec<T>>,
1183 {
1184 match blacklist_url {
1185 Some(p) => self.blacklist_url = Some(p.into()),
1186 _ => self.blacklist_url = None,
1187 };
1188 self
1189 }
1190
1191 pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
1193 where
1194 Vec<CompactString>: From<Vec<T>>,
1195 {
1196 match whitelist_url {
1197 Some(p) => self.whitelist_url = Some(p.into()),
1198 _ => self.whitelist_url = None,
1199 };
1200 self
1201 }
1202
    /// Sets `return_page_links` and returns `self` for chaining.
    pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
        self.return_page_links = return_page_links;
        self
    }
1208
1209 pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
1211 match headers {
1212 Some(m) => self.headers = Some(SerializableHeaderMap::from(m).into()),
1213 _ => self.headers = None,
1214 };
1215 self
1216 }
1217
    /// Sets `redirect_limit` and returns `self` for chaining.
    pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
        self.redirect_limit = redirect_limit;
        self
    }
1223
    /// Sets `redirect_policy` and returns `self` for chaining.
    pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
        self.redirect_policy = policy;
        self
    }
1229
    /// Sets `referer` and returns `self` for chaining.
    pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
        self.referer = referer;
        self
    }
1235
    /// Alternate spelling of `with_referer`; sets the same `referer` field and
    /// returns `self` for chaining.
    pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
        self.referer = referer;
        self
    }
1241
    /// Sets `full_resources` and returns `self` for chaining.
    pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
        self.full_resources = full_resources;
        self
    }
1247
    /// Stores `Some(dismiss_dialogs)` in `dismiss_dialogs` and returns `self`
    /// for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_dismiss_dialogs(&mut self, dismiss_dialogs: bool) -> &mut Self {
        self.dismiss_dialogs = Some(dismiss_dialogs);
        self
    }

    /// No-op without the `chrome` feature; returns `self` for chaining.
    #[cfg(not(feature = "chrome"))]
    pub fn with_dismiss_dialogs(&mut self, _dismiss_dialogs: bool) -> &mut Self {
        self
    }
1260
    /// Sets `emulation` and returns `self` for chaining.
    #[cfg(feature = "wreq")]
    pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
        self.emulation = emulation;
        self
    }
1267
    /// Sets the cron expression and cron type, then returns `self` for chaining.
    #[cfg(feature = "cron")]
    pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
        self.cron_str = cron_str.into();
        self.cron_type = cron_type;
        self
    }

    /// No-op without the `cron` feature; returns `self` for chaining.
    #[cfg(not(feature = "cron"))]
    pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self {
        self
    }
1281
    /// Convenience wrapper that passes a single `"*"` → `limit` budget entry
    /// to `with_budget`; returns `self` for chaining.
    pub fn with_limit(&mut self, limit: u32) -> &mut Self {
        self.with_budget(Some(hashbrown::HashMap::from([("*", limit)])));
        self
    }
1287
    /// Sets `concurrency_limit` and returns `self` for chaining.
    pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
        self.concurrency_limit = limit;
        self
    }
1293
    /// Sets `auth_challenge_response` and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_auth_challenge_response(
        &mut self,
        auth_challenge_response: Option<AuthChallengeResponse>,
    ) -> &mut Self {
        self.auth_challenge_response = auth_challenge_response;
        self
    }
1303
    /// Sets `evaluate_on_new_document` and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_evaluate_on_new_document(
        &mut self,
        evaluate_on_new_document: Option<Box<String>>,
    ) -> &mut Self {
        self.evaluate_on_new_document = evaluate_on_new_document;
        self
    }

    /// No-op without the `chrome` feature; returns `self` for chaining.
    #[cfg(not(feature = "chrome"))]
    pub fn with_evaluate_on_new_document(
        &mut self,
        _evaluate_on_new_document: Option<Box<String>>,
    ) -> &mut Self {
        self
    }
1322
    /// No-op without the `chrome` feature; returns `self` for chaining.
    #[cfg(not(feature = "chrome"))]
    pub fn with_auth_challenge_response(
        &mut self,
        _auth_challenge_response: Option<AuthChallengeResponse>,
    ) -> &mut Self {
        self
    }
1331
    /// Sets `depth` and returns `self` for chaining.
    pub fn with_depth(&mut self, depth: usize) -> &mut Self {
        self.depth = depth;
        self
    }
1337
    /// Sets `cache` and returns `self` for chaining.
    // NOTE(review): the `cache` field also exists when only `chrome` is enabled
    // (and `new()` sets it to `true` there), but this setter is gated on
    // `cache_request`/`chrome_remote_cache` only, so chrome-only builds get the
    // no-op variant below. Confirm whether that is intended.
    #[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
    pub fn with_caching(&mut self, cache: bool) -> &mut Self {
        self.cache = cache;
        self
    }

    /// No-op when neither `cache_request` nor `chrome_remote_cache` is
    /// enabled; returns `self` for chaining.
    #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
    pub fn with_caching(&mut self, _cache: bool) -> &mut Self {
        self
    }
1350
    /// Sets `cache_skip_browser` and returns `self` for chaining.
    // NOTE(review): the field also exists in chrome-only builds, where this
    // setter is compiled as the no-op variant below. Confirm whether that is
    // intended.
    #[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
    pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self {
        self.cache_skip_browser = skip;
        self
    }

    /// No-op when neither `cache_request` nor `chrome_remote_cache` is
    /// enabled; returns `self` for chaining.
    #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
    pub fn with_cache_skip_browser(&mut self, _skip: bool) -> &mut Self {
        self
    }
1366
    /// Sets the cache namespace from anything convertible into a `String`
    /// (`None` clears it) and returns `self` for chaining.
    pub fn with_cache_namespace<S: Into<String>>(&mut self, namespace: Option<S>) -> &mut Self {
        self.cache_namespace = namespace.map(|s| Box::new(s.into()));
        self
    }
1377
1378 #[inline]
1380 pub(crate) fn cache_namespace_str(&self) -> Option<&str> {
1381 self.cache_namespace.as_ref().map(|s| s.as_str())
1382 }
1383
    /// Sets `service_worker_enabled` and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
        self.service_worker_enabled = enabled;
        self
    }

    /// No-op without the `chrome` feature; returns `self` for chaining.
    #[cfg(not(feature = "chrome"))]
    pub fn with_service_worker_enabled(&mut self, _enabled: bool) -> &mut Self {
        self
    }
1396
1397 #[cfg(not(feature = "chrome"))]
1399 pub fn with_auto_geolocation(&mut self, _enabled: bool) -> &mut Self {
1400 self
1401 }
1402
#[cfg(feature = "chrome")]
/// Set the `auto_geolocation` flag (compiled with the `chrome` feature).
pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
    self.auto_geolocation = enabled;
    self
}
1409
1410 pub fn with_retry(&mut self, retry: u8) -> &mut Self {
1412 self.retry = retry;
1413 self
1414 }
1415
1416 pub fn with_default_http_connect_timeout(
1418 &mut self,
1419 default_http_connect_timeout: Option<Duration>,
1420 ) -> &mut Self {
1421 self.default_http_connect_timeout = default_http_connect_timeout;
1422 self
1423 }
1424
1425 pub fn with_default_http_read_timeout(
1427 &mut self,
1428 default_http_read_timeout: Option<Duration>,
1429 ) -> &mut Self {
1430 self.default_http_read_timeout = default_http_read_timeout;
1431 self
1432 }
1433
1434 pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
1436 self.no_control_thread = no_control_thread;
1437 self
1438 }
1439
1440 pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
1442 self.viewport = viewport.map(|vp| vp);
1443 self
1444 }
1445
#[cfg(feature = "chrome")]
/// Toggle stealth mode: `true` selects `Tier::Basic`, `false` selects `Tier::None`.
pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
    use spider_fingerprint::configs::Tier;

    self.stealth_mode = if stealth_mode {
        Tier::Basic
    } else {
        Tier::None
    };
    self
}
1456
#[cfg(feature = "chrome")]
/// Set the stealth tier explicitly (compiled with the `chrome` feature).
pub fn with_stealth_advanced(
    &mut self,
    stealth_mode: spider_fingerprint::configs::Tier,
) -> &mut Self {
    self.stealth_mode = stealth_mode;
    self
}
1466
1467 #[cfg(not(feature = "chrome"))]
1468 pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self {
1470 self
1471 }
1472
#[cfg(feature = "chrome")]
/// Set the `idle_network` entry of the wait-for configuration.
///
/// A default `WaitFor` is created first when none has been configured yet.
pub fn with_wait_for_idle_network(
    &mut self,
    wait_for_idle_network: Option<WaitForIdleNetwork>,
) -> &mut Self {
    let wait_for = self.wait_for.get_or_insert_with(WaitFor::default);
    wait_for.idle_network = wait_for_idle_network;
    self
}
1489
#[cfg(feature = "chrome")]
/// Set the `idle_network0` entry of the wait-for configuration.
///
/// A default `WaitFor` is created first when none has been configured yet.
pub fn with_wait_for_idle_network0(
    &mut self,
    wait_for_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self {
    let wait_for = self.wait_for.get_or_insert_with(WaitFor::default);
    wait_for.idle_network0 = wait_for_idle_network0;
    self
}
1506
#[cfg(feature = "chrome")]
/// Set the `almost_idle_network0` entry of the wait-for configuration.
///
/// A default `WaitFor` is created first when none has been configured yet.
pub fn with_wait_for_almost_idle_network0(
    &mut self,
    wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self {
    let wait_for = self.wait_for.get_or_insert_with(WaitFor::default);
    wait_for.almost_idle_network0 = wait_for_almost_idle_network0;
    self
}
1523
1524 #[cfg(not(feature = "chrome"))]
1525 pub fn with_wait_for_almost_idle_network0(
1527 &mut self,
1528 _wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
1529 ) -> &mut Self {
1530 self
1531 }
1532
1533 #[cfg(not(feature = "chrome"))]
1534 pub fn with_wait_for_idle_network0(
1536 &mut self,
1537 _wait_for_idle_network0: Option<WaitForIdleNetwork>,
1538 ) -> &mut Self {
1539 self
1540 }
1541
1542 #[cfg(not(feature = "chrome"))]
1543 pub fn with_wait_for_idle_network(
1545 &mut self,
1546 _wait_for_idle_network: Option<WaitForIdleNetwork>,
1547 ) -> &mut Self {
1548 self
1549 }
1550
#[cfg(feature = "chrome")]
/// Set the `dom` entry of the wait-for configuration.
///
/// A default `WaitFor` is created first when none has been configured yet.
pub fn with_wait_for_idle_dom(
    &mut self,
    wait_for_idle_dom: Option<WaitForSelector>,
) -> &mut Self {
    let wait_for = self.wait_for.get_or_insert_with(WaitFor::default);
    wait_for.dom = wait_for_idle_dom;
    self
}
1567
1568 #[cfg(not(feature = "chrome"))]
1569 pub fn with_wait_for_idle_dom(
1571 &mut self,
1572 _wait_for_idle_dom: Option<WaitForSelector>,
1573 ) -> &mut Self {
1574 self
1575 }
1576
#[cfg(feature = "chrome")]
/// Set the `selector` entry of the wait-for configuration.
///
/// A default `WaitFor` is created first when none has been configured yet.
pub fn with_wait_for_selector(
    &mut self,
    wait_for_selector: Option<WaitForSelector>,
) -> &mut Self {
    let wait_for = self.wait_for.get_or_insert_with(WaitFor::default);
    wait_for.selector = wait_for_selector;
    self
}
1593
1594 #[cfg(not(feature = "chrome"))]
1595 pub fn with_wait_for_selector(
1597 &mut self,
1598 _wait_for_selector: Option<WaitForSelector>,
1599 ) -> &mut Self {
1600 self
1601 }
1602
#[cfg(feature = "chrome")]
/// Set the `delay` entry of the wait-for configuration.
///
/// A default `WaitFor` is created first when none has been configured yet.
pub fn with_wait_for_delay(&mut self, wait_for_delay: Option<WaitForDelay>) -> &mut Self {
    let wait_for = self.wait_for.get_or_insert_with(WaitFor::default);
    wait_for.delay = wait_for_delay;
    self
}
1616
1617 #[cfg(not(feature = "chrome"))]
1618 pub fn with_wait_for_delay(&mut self, _wait_for_delay: Option<WaitForDelay>) -> &mut Self {
1620 self
1621 }
1622
#[cfg(feature = "chrome_intercept")]
/// Install a request-intercept configuration and prepare its intercept manager.
///
/// The configuration is stored first, then `setup_intercept_manager` is invoked
/// on the stored value with `url`.
pub fn with_chrome_intercept(
    &mut self,
    chrome_intercept: RequestInterceptConfiguration,
    url: &Option<Box<url::Url>>,
) -> &mut Self {
    self.chrome_intercept = chrome_intercept;
    self.chrome_intercept.setup_intercept_manager(url);
    self
}
1634
1635 #[cfg(not(feature = "chrome_intercept"))]
1636 pub fn with_chrome_intercept(
1638 &mut self,
1639 _chrome_intercept: RequestInterceptConfiguration,
1640 _url: &Option<Box<url::Url>>,
1641 ) -> &mut Self {
1642 self
1643 }
1644
#[cfg(feature = "chrome")]
/// Set the Chrome connection URL; `None` clears it.
pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
    self.chrome_connection_url = chrome_connection_url;
    self
}
1651
1652 #[cfg(not(feature = "chrome"))]
1653 pub fn with_chrome_connection(&mut self, _chrome_connection_url: Option<String>) -> &mut Self {
1655 self
1656 }
1657
#[cfg(feature = "chrome")]
/// Set the list of Chrome connection URLs.
///
/// An empty list is stored as `None` rather than `Some(vec![])`.
pub fn with_chrome_connections(&mut self, urls: Vec<String>) -> &mut Self {
    self.chrome_connection_urls = match urls.is_empty() {
        true => None,
        false => Some(urls),
    };
    self
}
1666
1667 #[cfg(not(feature = "chrome"))]
1668 pub fn with_chrome_connections(&mut self, _urls: Vec<String>) -> &mut Self {
1670 self
1671 }
1672
1673 #[cfg(not(feature = "chrome"))]
1674 pub fn with_execution_scripts(
1676 &mut self,
1677 _execution_scripts: Option<ExecutionScriptsMap>,
1678 ) -> &mut Self {
1679 self
1680 }
1681
#[cfg(feature = "chrome")]
/// Set the execution scripts.
///
/// The map is passed through `convert_to_trie_execution_scripts` and the
/// converted result is what gets stored.
pub fn with_execution_scripts(
    &mut self,
    execution_scripts: Option<ExecutionScriptsMap>,
) -> &mut Self {
    use crate::features::chrome_common::convert_to_trie_execution_scripts;

    self.execution_scripts = convert_to_trie_execution_scripts(&execution_scripts);
    self
}
1692
1693 #[cfg(not(feature = "chrome"))]
1694 pub fn with_automation_scripts(
1696 &mut self,
1697 _automation_scripts: Option<AutomationScriptsMap>,
1698 ) -> &mut Self {
1699 self
1700 }
1701
#[cfg(feature = "chrome")]
/// Set the automation scripts.
///
/// The map is passed through `convert_to_trie_automation_scripts` and the
/// converted result is what gets stored.
pub fn with_automation_scripts(
    &mut self,
    automation_scripts: Option<AutomationScriptsMap>,
) -> &mut Self {
    use crate::features::chrome_common::convert_to_trie_automation_scripts;

    self.automation_scripts = convert_to_trie_automation_scripts(&automation_scripts);
    self
}
1712
1713 pub fn with_budget(&mut self, budget: Option<hashbrown::HashMap<&str, u32>>) -> &mut Self {
1715 self.budget = match budget {
1716 Some(budget) => {
1717 let mut crawl_budget: hashbrown::HashMap<
1718 case_insensitive_string::CaseInsensitiveString,
1719 u32,
1720 > = hashbrown::HashMap::new();
1721
1722 for b in budget.into_iter() {
1723 crawl_budget.insert(
1724 case_insensitive_string::CaseInsensitiveString::from(b.0),
1725 b.1,
1726 );
1727 }
1728
1729 Some(crawl_budget)
1730 }
1731 _ => None,
1732 };
1733 self
1734 }
1735
1736 pub fn with_external_domains<'a, 'b>(
1738 &mut self,
1739 external_domains: Option<impl Iterator<Item = String> + 'a>,
1740 ) -> &mut Self {
1741 match external_domains {
1742 Some(external_domains) => {
1743 self.external_domains_caseless = external_domains
1744 .into_iter()
1745 .filter_map(|d| {
1746 if d == "*" {
1747 Some("*".into())
1748 } else {
1749 let host = get_domain_from_url(&d);
1750
1751 if !host.is_empty() {
1752 Some(host.into())
1753 } else {
1754 None
1755 }
1756 }
1757 })
1758 .collect::<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>()
1759 .into();
1760 }
1761 _ => self.external_domains_caseless = Default::default(),
1762 }
1763
1764 self
1765 }
1766
1767 pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
1769 self.accept_invalid_certs = accept_invalid_certs;
1770 self
1771 }
1772
1773 pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
1775 self.normalize = normalize;
1776 self
1777 }
1778
1779 #[cfg(not(feature = "disk"))]
1780 pub fn with_shared_state(&mut self, _shared: bool) -> &mut Self {
1782 self
1783 }
1784
#[cfg(feature = "disk")]
/// Set the `shared` flag (compiled with the `disk` feature).
pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
    self.shared = shared;
    self
}
1791
1792 #[cfg(not(feature = "chrome"))]
1793 pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self {
1795 self
1796 }
1797
#[cfg(feature = "chrome")]
/// Set the timezone id; `None` clears it.
///
/// The string is converted with `Into` into the field's storage type.
pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
    self.timezone_id = timezone_id.map(Into::into);
    self
}
1804
1805 #[cfg(not(feature = "chrome"))]
1806 pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self {
1808 self
1809 }
1810
#[cfg(feature = "chrome")]
/// Set the locale; `None` clears it.
///
/// The string is converted with `Into` into the field's storage type.
pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
    self.locale = locale.map(Into::into);
    self
}
1817
#[cfg(feature = "chrome")]
/// Set the Chrome event tracker configuration; `None` clears it.
pub fn with_event_tracker(&mut self, track_events: Option<ChromeEventTracker>) -> &mut Self {
    self.track_events = track_events;
    self
}
1824
1825 #[cfg(not(feature = "chrome"))]
1827 pub fn with_screenshot(&mut self, _screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
1828 self
1829 }
1830
#[cfg(feature = "chrome")]
/// Set the screenshot configuration; `None` clears it.
pub fn with_screenshot(&mut self, screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
    self.screenshot = screenshot_config;
    self
}
1837
1838 pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
1840 self.max_page_bytes = max_page_bytes;
1841 self
1842 }
1843
1844 pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
1846 self.max_bytes_allowed = max_bytes_allowed;
1847 self
1848 }
1849
1850 pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
1852 self.only_html = only_html;
1853 self
1854 }
1855
1856 pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
1858 self.modify_headers = modify_headers;
1859 self
1860 }
1861
1862 pub fn with_modify_http_client_headers(
1864 &mut self,
1865 modify_http_client_headers: bool,
1866 ) -> &mut Self {
1867 self.modify_http_client_headers = modify_http_client_headers;
1868 self
1869 }
1870
1871 pub fn with_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) -> &mut Self {
1873 self.cache_policy = cache_policy;
1874 self
1875 }
1876
#[cfg(feature = "webdriver")]
/// Set the WebDriver configuration; `None` clears it.
///
/// A provided configuration is boxed before being stored.
pub fn with_webdriver_config(
    &mut self,
    webdriver_config: Option<WebDriverConfig>,
) -> &mut Self {
    self.webdriver_config = webdriver_config.map(|config| Box::new(config));
    self
}
1886
1887 #[cfg(not(feature = "webdriver"))]
1888 pub fn with_webdriver_config(
1890 &mut self,
1891 _webdriver_config: Option<WebDriverConfig>,
1892 ) -> &mut Self {
1893 self
1894 }
1895
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
/// Derive the `CacheOptions` implied by this configuration.
///
/// Returns `None` when `cache` is off. Otherwise the result depends on two inputs:
///
/// * whether a non-empty `authorization` header that is valid as a string is
///   configured — if so, an `Authorized` variant carrying the header value is used;
/// * whether the browser should be skipped — always `true` with the `cache_mem`
///   feature, otherwise the `cache_skip_browser` flag.
pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
    use crate::utils::CacheOptions;

    if !self.cache {
        return None;
    }

    #[cfg(feature = "cache_mem")]
    let skip_browser = true;
    #[cfg(not(feature = "cache_mem"))]
    let skip_browser = self.cache_skip_browser;

    let auth_header = self
        .headers
        .as_ref()
        .and_then(|headers| {
            headers
                .0
                .get("authorization")
                .or_else(|| headers.0.get("Authorization"))
        })
        .cloned();

    // An empty header, or one that is not representable as a string, is
    // treated exactly like a missing header.
    let token = auth_header
        .as_ref()
        .filter(|value| !value.is_empty())
        .and_then(|value| value.to_str().ok());

    Some(match (token, skip_browser) {
        (Some(token), true) => CacheOptions::SkipBrowserAuthorized(token.into()),
        (Some(token), false) => CacheOptions::Authorized(token.into()),
        (None, true) => CacheOptions::SkipBrowser,
        (None, false) => CacheOptions::Yes,
    })
}
1946
#[cfg(all(
    feature = "chrome",
    not(any(feature = "cache_request", feature = "chrome_remote_cache"))
))]
/// Stub used with `chrome` but without any cache feature: always returns `None`.
pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
    None
}
1955
1956 #[cfg(not(any(
1958 feature = "cache_request",
1959 feature = "chrome_remote_cache",
1960 feature = "chrome"
1961 )))]
1962 #[allow(dead_code)]
1963 pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
1964 None
1965 }
1966
1967 pub fn build(&self) -> Self {
1969 self.to_owned()
1970 }
1971
#[cfg(feature = "search")]
/// Set the search configuration; `None` clears it.
///
/// A provided configuration is boxed before being stored.
pub fn with_search_config(&mut self, search_config: Option<SearchConfig>) -> &mut Self {
    self.search_config = search_config.map(|config| Box::new(config));
    self
}
1978
1979 #[cfg(not(feature = "search"))]
1980 pub fn with_search_config(&mut self, _search_config: Option<()>) -> &mut Self {
1982 self
1983 }
1984
#[cfg(feature = "spider_cloud")]
/// Enable Spider Cloud with the given API key and default settings.
///
/// Keys that `is_placeholder_api_key` recognises as placeholders are rejected:
/// a warning is logged and the configuration is left untouched.
pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
    if !is_placeholder_api_key(api_key) {
        self.spider_cloud = Some(Box::new(SpiderCloudConfig::new(api_key)));
    } else {
        log::warn!("Spider Cloud API key looks like a placeholder — skipping. Get a real key at https://spider.cloud");
    }
    self
}
1995
1996 #[cfg(not(feature = "spider_cloud"))]
1998 pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
1999 self
2000 }
2001
#[cfg(feature = "spider_cloud")]
/// Enable Spider Cloud with a fully specified configuration.
///
/// The configuration is boxed and stored as given; no placeholder-key check is done here.
pub fn with_spider_cloud_config(&mut self, config: SpiderCloudConfig) -> &mut Self {
    let boxed = Box::new(config);
    self.spider_cloud = Some(boxed);
    self
}
2008
2009 #[cfg(not(feature = "spider_cloud"))]
2011 pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
2012 self
2013 }
2014
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
/// Enable the Spider browser with the given API key and default settings.
///
/// On success both `chrome_connection_url` (set to the config's
/// `connection_url()`) and `spider_browser` are updated. Keys that
/// `is_placeholder_api_key` recognises as placeholders are rejected: a warning
/// is logged and the configuration is left untouched.
pub fn with_spider_browser(&mut self, api_key: &str) -> &mut Self {
    if is_placeholder_api_key(api_key) {
        log::warn!("Spider Browser Cloud API key looks like a placeholder — skipping. Get a real key at https://spider.cloud");
    } else {
        let browser_config = SpiderBrowserConfig::new(api_key);
        self.chrome_connection_url = Some(browser_config.connection_url());
        self.spider_browser = Some(Box::new(browser_config));
    }
    self
}
2030
2031 #[cfg(not(all(feature = "spider_cloud", feature = "chrome")))]
2033 pub fn with_spider_browser(&mut self, _api_key: &str) -> &mut Self {
2034 self
2035 }
2036
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
/// Enable the Spider browser with a fully specified configuration.
///
/// `chrome_connection_url` is set to the config's `connection_url()` and the
/// config itself is boxed into `spider_browser`.
pub fn with_spider_browser_config(&mut self, config: SpiderBrowserConfig) -> &mut Self {
    let connection_url = config.connection_url();
    self.chrome_connection_url = Some(connection_url);
    self.spider_browser = Some(Box::new(config));
    self
}
2045
2046 #[cfg(not(all(feature = "spider_cloud", feature = "chrome")))]
2048 pub fn with_spider_browser_config(&mut self, _config: ()) -> &mut Self {
2049 self
2050 }
2051
#[cfg(feature = "hedge")]
/// Store the given hedge configuration (compiled with the `hedge` feature).
pub fn with_hedge(&mut self, config: crate::utils::hedge::HedgeConfig) -> &mut Self {
    self.hedge = Some(config);
    self
}
2058
2059 #[cfg(not(feature = "hedge"))]
2061 pub fn with_hedge(&mut self, _config: ()) -> &mut Self {
2062 self
2063 }
2064
#[cfg(feature = "auto_throttle")]
/// Store the given auto-throttle configuration (compiled with the `auto_throttle` feature).
pub fn with_auto_throttle(
    &mut self,
    config: crate::utils::auto_throttle::AutoThrottleConfig,
) -> &mut Self {
    self.auto_throttle = Some(config);
    self
}
2074
2075 #[cfg(not(feature = "auto_throttle"))]
2077 pub fn with_auto_throttle(&mut self, _config: ()) -> &mut Self {
2078 self
2079 }
2080
#[cfg(feature = "etag_cache")]
/// Set the `etag_cache` flag (compiled with the `etag_cache` feature).
pub fn with_etag_cache(&mut self, enabled: bool) -> &mut Self {
    self.etag_cache = enabled;
    self
}
2087
2088 #[cfg(not(feature = "etag_cache"))]
2090 pub fn with_etag_cache(&mut self, _enabled: bool) -> &mut Self {
2091 self
2092 }
2093
#[cfg(feature = "warc")]
/// Store the given WARC configuration (compiled with the `warc` feature).
pub fn with_warc(&mut self, config: crate::utils::warc::WarcConfig) -> &mut Self {
    self.warc = Some(config);
    self
}
2100
2101 #[cfg(not(feature = "warc"))]
2103 pub fn with_warc(&mut self, _config: ()) -> &mut Self {
2104 self
2105 }
2106}
2107
#[cfg(feature = "search")]
/// Settings for a search provider (available with the `search` feature).
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SearchConfig {
    /// Which search provider this configuration targets.
    pub provider: SearchProviderType,
    /// API key for the provider; may be empty (see `is_enabled`).
    pub api_key: String,
    /// Optional API URL; `None` when not set.
    pub api_url: Option<String>,
    /// Optional default search options; `None` when not set.
    pub default_options: Option<SearchOptions>,
}
2122
#[cfg(feature = "search")]
impl SearchConfig {
    /// Create a configuration for `provider` with the given API key.
    ///
    /// `api_url` and `default_options` start out as `None`.
    pub fn new(provider: SearchProviderType, api_key: impl Into<String>) -> Self {
        let api_key = api_key.into();
        Self {
            provider,
            api_key,
            api_url: None,
            default_options: None,
        }
    }

    /// Return the configuration with `api_url` set to `url`.
    pub fn with_api_url(self, url: impl Into<String>) -> Self {
        Self {
            api_url: Some(url.into()),
            ..self
        }
    }

    /// Return the configuration with `default_options` set to `options`.
    pub fn with_default_options(self, options: SearchOptions) -> Self {
        Self {
            default_options: Some(options),
            ..self
        }
    }

    /// Whether the configuration is usable: true when the API key is non-empty
    /// or an API URL has been set.
    pub fn is_enabled(&self) -> bool {
        let unconfigured = self.api_key.is_empty() && self.api_url.is_none();
        !unconfigured
    }
}
2154
#[cfg(feature = "search")]
/// The search providers a `SearchConfig` can target.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SearchProviderType {
    /// Serper — the default provider.
    #[default]
    Serper,
    /// Brave.
    Brave,
    /// Bing.
    Bing,
    /// Tavily.
    Tavily,
}
2170
#[cfg(feature = "spider_cloud")]
/// Operating mode of the Spider Cloud integration.
///
/// The per-variant notes describe how `SpiderCloudConfig::uses_proxy`,
/// `SpiderCloudConfig::should_fallback` and `SpiderCloudConfig::fallback_route`
/// treat each mode.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudMode {
    /// Default mode. Uses the proxy; never falls back.
    #[default]
    Proxy,
    /// Does not use the proxy; never falls back.
    Api,
    /// Does not use the proxy; never falls back; fallback route is `"unblocker"`.
    Unblocker,
    /// Uses the proxy; falls back on 403, 429 and any status >= 500, via the `"crawl"` route.
    Fallback,
    /// Uses the proxy; falls back like `Fallback` and additionally on empty or
    /// challenge-looking bodies, via the `"unblocker"` route.
    Smart,
}
2198
#[cfg(feature = "spider_cloud")]
/// Content format requested from Spider Cloud.
///
/// With the `serde` feature every variant (de)serializes as its lowercase name.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudReturnFormat {
    /// `"raw"` — the default format.
    #[default]
    #[cfg_attr(feature = "serde", serde(rename = "raw"))]
    Raw,
    /// `"markdown"`.
    #[cfg_attr(feature = "serde", serde(rename = "markdown"))]
    Markdown,
    /// `"commonmark"`.
    #[cfg_attr(feature = "serde", serde(rename = "commonmark"))]
    CommonMark,
    /// `"text"`.
    #[cfg_attr(feature = "serde", serde(rename = "text"))]
    Text,
    /// `"bytes"`.
    #[cfg_attr(feature = "serde", serde(rename = "bytes"))]
    Bytes,
}
2221
#[cfg(feature = "spider_cloud")]
impl SpiderCloudReturnFormat {
    /// The lowercase name of this format (`"raw"`, `"markdown"`, `"commonmark"`,
    /// `"text"` or `"bytes"`).
    pub fn as_str(&self) -> &'static str {
        match *self {
            SpiderCloudReturnFormat::Raw => "raw",
            SpiderCloudReturnFormat::Markdown => "markdown",
            SpiderCloudReturnFormat::CommonMark => "commonmark",
            SpiderCloudReturnFormat::Text => "text",
            SpiderCloudReturnFormat::Bytes => "bytes",
        }
    }
}
2235
#[cfg(feature = "spider_cloud")]
impl std::fmt::Display for SpiderCloudReturnFormat {
    /// Writes exactly the name returned by `as_str`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = self.as_str();
        f.write_str(name)
    }
}
2242
#[cfg(feature = "spider_cloud")]
impl From<&str> for SpiderCloudReturnFormat {
    /// Parse a format name, ignoring ASCII case.
    ///
    /// Recognised names are `markdown`, `commonmark`, `text` and `bytes`.
    /// Anything else — including `raw` and the empty string — yields `Raw`.
    ///
    /// Earlier only three fixed spellings per name were accepted (for example
    /// `markdown`, `Markdown`, `MARKDOWN`), so a mixed-case input such as
    /// `MarkDown` silently became `Raw`. Every previously accepted spelling
    /// still maps to the same variant.
    fn from(s: &str) -> Self {
        if s.eq_ignore_ascii_case("markdown") {
            Self::Markdown
        } else if s.eq_ignore_ascii_case("commonmark") {
            Self::CommonMark
        } else if s.eq_ignore_ascii_case("text") {
            Self::Text
        } else if s.eq_ignore_ascii_case("bytes") {
            Self::Bytes
        } else {
            // Unknown names deliberately fall back to the default format.
            Self::Raw
        }
    }
}
2255
#[cfg(feature = "spider_cloud")]
/// Settings for the Spider Cloud integration (available with the `spider_cloud` feature).
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderCloudConfig {
    /// API key.
    pub api_key: String,
    /// Operating mode; defaults to `SpiderCloudMode::default()` when deserializing.
    #[cfg_attr(feature = "serde", serde(default))]
    pub mode: SpiderCloudMode,
    /// API base URL; defaults to `https://api.spider.cloud`.
    #[cfg_attr(feature = "serde", serde(default = "SpiderCloudConfig::default_api_url"))]
    pub api_url: String,
    /// Proxy URL; defaults to `https://proxy.spider.cloud`.
    #[cfg_attr(feature = "serde", serde(default = "SpiderCloudConfig::default_proxy_url"))]
    pub proxy_url: String,
    /// Requested return format; defaults to `SpiderCloudReturnFormat::default()`.
    #[cfg_attr(feature = "serde", serde(default))]
    pub return_format: SpiderCloudReturnFormat,
    /// Optional list of return formats; omitted from serialized output when `None`.
    #[cfg_attr(feature = "serde", serde(default, skip_serializing_if = "Option::is_none"))]
    pub return_formats: Option<Vec<SpiderCloudReturnFormat>>,
    /// Optional extra JSON parameters; omitted from serialized output when `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<hashbrown::HashMap<String, serde_json::Value>>,
}
2299
#[cfg(feature = "spider_cloud")]
impl Default for SpiderCloudConfig {
    /// Empty API key, default mode and return format, the default API and proxy
    /// URLs, and no return-format list or extra parameters.
    fn default() -> Self {
        let api_url = Self::default_api_url();
        let proxy_url = Self::default_proxy_url();

        Self {
            api_key: String::new(),
            mode: Default::default(),
            api_url,
            proxy_url,
            return_format: Default::default(),
            return_formats: None,
            extra_params: None,
        }
    }
}
2314
#[cfg(feature = "spider_cloud")]
impl SpiderCloudConfig {
    /// Create a configuration with `api_key` and defaults for every other field.
    pub fn new(api_key: impl Into<String>) -> Self {
        Self {
            api_key: api_key.into(),
            ..Self::default()
        }
    }

    /// Return the configuration with `mode` replaced.
    pub fn with_mode(self, mode: SpiderCloudMode) -> Self {
        Self { mode, ..self }
    }

    /// Return the configuration with `api_url` replaced.
    pub fn with_api_url(self, url: impl Into<String>) -> Self {
        Self {
            api_url: url.into(),
            ..self
        }
    }

    /// Return the configuration with `proxy_url` replaced.
    pub fn with_proxy_url(self, url: impl Into<String>) -> Self {
        Self {
            proxy_url: url.into(),
            ..self
        }
    }

    /// Return the configuration with `return_format` replaced.
    pub fn with_return_format(self, fmt: impl Into<SpiderCloudReturnFormat>) -> Self {
        Self {
            return_format: fmt.into(),
            ..self
        }
    }

    /// Return the configuration with `return_formats` set to `formats`,
    /// de-duplicated with first-occurrence order preserved.
    ///
    /// When the list is non-empty its first entry also becomes `return_format`.
    /// An empty list is stored as `Some(vec![])` and leaves `return_format` untouched.
    pub fn with_return_formats(mut self, formats: Vec<SpiderCloudReturnFormat>) -> Self {
        let capacity = formats.len();
        let unique = formats
            .into_iter()
            .fold(Vec::with_capacity(capacity), |mut kept, format| {
                if !kept.contains(&format) {
                    kept.push(format);
                }
                kept
            });

        if let Some(primary) = unique.first() {
            self.return_format = primary.clone();
        }
        self.return_formats = Some(unique);
        self
    }

    /// Whether more than one return format has been configured.
    pub fn has_multiple_formats(&self) -> bool {
        matches!(&self.return_formats, Some(formats) if formats.len() > 1)
    }

    /// Return the configuration with `extra_params` set to `params`.
    pub fn with_extra_params(
        self,
        params: hashbrown::HashMap<String, serde_json::Value>,
    ) -> Self {
        Self {
            extra_params: Some(params),
            ..self
        }
    }

    /// Decide whether a response should trigger a fallback request.
    ///
    /// * `Api`, `Unblocker` and `Proxy` never fall back.
    /// * `Fallback` and `Smart` fall back on status 403, 429 or anything >= 500.
    /// * `Smart` additionally falls back when a body is supplied and is either
    ///   empty or its first 4096 bytes (lossily decoded, lowercased) contain
    ///   one of the challenge / bot-detection markers checked below.
    pub fn should_fallback(&self, status_code: u16, body: Option<&[u8]>) -> bool {
        match self.mode {
            SpiderCloudMode::Api | SpiderCloudMode::Unblocker | SpiderCloudMode::Proxy => {
                return false;
            }
            SpiderCloudMode::Fallback | SpiderCloudMode::Smart => {}
        }

        // `>= 500` also covers 503 and the 520..=530 range.
        if status_code == 403 || status_code == 429 || status_code >= 500 {
            return true;
        }

        if self.mode != SpiderCloudMode::Smart {
            return false;
        }

        let Some(body) = body else {
            return false;
        };

        if body.is_empty() {
            return true;
        }

        // Only the head of the body is inspected.
        let head = &body[..body.len().min(4096)];
        let lower = String::from_utf8_lossy(head).to_lowercase();
        let has = |needle: &str| lower.contains(needle);

        let cloudflare_challenge = has("cf-browser-verification")
            || (has("cloudflare") && has("challenge-platform"));

        let generic_challenge = (has("captcha") && has("challenge"))
            || has("please verify you are a human")
            || (has("access denied") && has("automated"))
            || has("bot detection");

        let vendor_challenge = has("distil_r_captcha")
            || has("_imperva")
            || (has("akamai") && has("bot manager"));

        cloudflare_challenge || generic_challenge || vendor_challenge
    }

    /// Route name used for fallback requests: `"unblocker"` for `Smart` and
    /// `Unblocker`, `"crawl"` for every other mode.
    pub fn fallback_route(&self) -> &'static str {
        if matches!(
            self.mode,
            SpiderCloudMode::Smart | SpiderCloudMode::Unblocker
        ) {
            "unblocker"
        } else {
            "crawl"
        }
    }

    /// Whether the current mode uses the proxy (`Proxy`, `Fallback` and `Smart` do).
    pub fn uses_proxy(&self) -> bool {
        match self.mode {
            SpiderCloudMode::Proxy | SpiderCloudMode::Fallback | SpiderCloudMode::Smart => true,
            SpiderCloudMode::Api | SpiderCloudMode::Unblocker => false,
        }
    }

    /// Default value for `api_url`.
    fn default_api_url() -> String {
        String::from("https://api.spider.cloud")
    }

    /// Default value for `proxy_url`.
    fn default_proxy_url() -> String {
        String::from("https://proxy.spider.cloud")
    }
}
2495
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
/// Settings for the Spider browser connection (needs both `spider_cloud` and `chrome`).
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderBrowserConfig {
    /// API key; sent as the `token` query parameter by `connection_url`.
    pub api_key: String,
    /// WebSocket base URL; defaults to `wss://browser.spider.cloud/v1/browser`.
    #[cfg_attr(feature = "serde", serde(default = "SpiderBrowserConfig::default_wss_url"))]
    pub wss_url: String,
    /// When true, `connection_url` appends `stealth=true`.
    #[cfg_attr(feature = "serde", serde(default))]
    pub stealth: bool,
    /// Optional `browser` query parameter; omitted from serialized output when `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub browser: Option<String>,
    /// Optional `country` query parameter; omitted from serialized output when `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub country: Option<String>,
    /// Optional extra `(key, value)` query parameters; omitted from serialized output when `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<Vec<(String, String)>>,
}
2530
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
impl Default for SpiderBrowserConfig {
    /// Empty API key, the default WebSocket URL, stealth off and no optional parameters.
    fn default() -> Self {
        let wss_url = Self::default_wss_url();

        Self {
            api_key: String::new(),
            wss_url,
            stealth: false,
            browser: None,
            country: None,
            extra_params: None,
        }
    }
}
2544
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
impl SpiderBrowserConfig {
    /// Create a configuration with `api_key` and defaults for every other field.
    pub fn new(api_key: impl Into<String>) -> Self {
        Self {
            api_key: api_key.into(),
            ..Self::default()
        }
    }

    /// Return the configuration with `wss_url` replaced.
    pub fn with_wss_url(self, url: impl Into<String>) -> Self {
        Self {
            wss_url: url.into(),
            ..self
        }
    }

    /// Return the configuration with the `stealth` flag replaced.
    pub fn with_stealth(self, stealth: bool) -> Self {
        Self { stealth, ..self }
    }

    /// Return the configuration with `browser` set.
    pub fn with_browser(self, browser: impl Into<String>) -> Self {
        Self {
            browser: Some(browser.into()),
            ..self
        }
    }

    /// Return the configuration with `country` set.
    pub fn with_country(self, country: impl Into<String>) -> Self {
        Self {
            country: Some(country.into()),
            ..self
        }
    }

    /// Return the configuration with `extra_params` set.
    pub fn with_extra_params(self, params: Vec<(String, String)>) -> Self {
        Self {
            extra_params: Some(params),
            ..self
        }
    }

    /// Build the connection URL from `wss_url` plus query parameters.
    ///
    /// Parameters are appended in this order: `token` (the API key),
    /// `stealth=true` (only when enabled), `browser`, `country`, then every
    /// extra parameter. The first parameter is joined with `?`, or with `&`
    /// when `wss_url` already contains a `?`.
    ///
    /// Keys and values are appended verbatim — no URL encoding is applied.
    pub fn connection_url(&self) -> String {
        /// Append `&key=value` to `url` as-is.
        fn push_param(url: &mut String, key: &str, value: &str) {
            url.push('&');
            url.push_str(key);
            url.push('=');
            url.push_str(value);
        }

        let separator = if self.wss_url.contains('?') { '&' } else { '?' };

        let mut url = self.wss_url.clone();
        url.push(separator);
        url.push_str("token=");
        url.push_str(&self.api_key);

        if self.stealth {
            url.push_str("&stealth=true");
        }
        if let Some(browser) = self.browser.as_deref() {
            push_param(&mut url, "browser", browser);
        }
        if let Some(country) = self.country.as_deref() {
            push_param(&mut url, "country", country);
        }
        for (key, value) in self.extra_params.iter().flatten() {
            push_param(&mut url, key, value);
        }

        url
    }

    /// Default value for `wss_url`.
    fn default_wss_url() -> String {
        String::from("wss://browser.spider.cloud/v1/browser")
    }
}
2628
2629#[cfg(test)]
2630mod tests {
2631 use super::*;
2632
2633 #[test]
2634 fn test_configuration_defaults() {
2635 let config = Configuration::default();
2636 assert!(!config.respect_robots_txt);
2637 assert!(!config.subdomains);
2638 assert!(!config.tld);
2639 assert_eq!(config.delay, 0);
2640 assert!(config.user_agent.is_none());
2641 assert!(config.blacklist_url.is_none());
2642 assert!(config.whitelist_url.is_none());
2643 assert!(config.proxies.is_none());
2644 assert!(!config.http2_prior_knowledge);
2645 }
2646
2647 #[test]
2648 fn test_redirect_policy_variants() {
2649 assert_eq!(RedirectPolicy::default(), RedirectPolicy::Loose);
2650 let strict = RedirectPolicy::Strict;
2651 let none = RedirectPolicy::None;
2652 assert_ne!(strict, RedirectPolicy::Loose);
2653 assert_ne!(none, RedirectPolicy::Loose);
2654 assert_ne!(strict, none);
2655 }
2656
2657 #[test]
2658 fn test_proxy_ignore_variants() {
2659 assert_eq!(ProxyIgnore::default(), ProxyIgnore::No);
2660 let chrome = ProxyIgnore::Chrome;
2661 let http = ProxyIgnore::Http;
2662 assert_ne!(chrome, ProxyIgnore::No);
2663 assert_ne!(http, ProxyIgnore::No);
2664 assert_ne!(chrome, http);
2665 }
2666
2667 #[test]
2668 fn test_request_proxy_construction() {
2669 let proxy = RequestProxy {
2670 addr: "http://proxy.example.com:8080".to_string(),
2671 ignore: ProxyIgnore::No,
2672 };
2673 assert_eq!(proxy.addr, "http://proxy.example.com:8080");
2674 assert_eq!(proxy.ignore, ProxyIgnore::No);
2675 }
2676
2677 #[test]
2678 fn test_request_proxy_default() {
2679 let proxy = RequestProxy::default();
2680 assert!(proxy.addr.is_empty());
2681 assert_eq!(proxy.ignore, ProxyIgnore::No);
2682 }
2683
2684 #[test]
2685 fn test_configuration_blacklist_setup() {
2686 let mut config = Configuration::default();
2687 config.blacklist_url = Some(vec![
2688 "https://example.com/private".into(),
2689 "https://example.com/admin".into(),
2690 ]);
2691 assert_eq!(config.blacklist_url.as_ref().unwrap().len(), 2);
2692 }
2693
2694 #[test]
2695 fn test_configuration_whitelist_setup() {
2696 let mut config = Configuration::default();
2697 config.whitelist_url = Some(vec!["https://example.com/public".into()]);
2698 assert_eq!(config.whitelist_url.as_ref().unwrap().len(), 1);
2699 }
2700
2701 #[test]
2702 fn test_configuration_external_domains() {
2703 let mut config = Configuration::default();
2704 config.external_domains_caseless = Arc::new(
2705 [
2706 case_insensitive_string::CaseInsensitiveString::from("Example.Com"),
2707 case_insensitive_string::CaseInsensitiveString::from("OTHER.org"),
2708 ]
2709 .into_iter()
2710 .collect(),
2711 );
2712 assert_eq!(config.external_domains_caseless.len(), 2);
2713 assert!(config.external_domains_caseless.contains(
2714 &case_insensitive_string::CaseInsensitiveString::from("example.com")
2715 ));
2716 }
2717
2718 #[test]
2719 fn test_configuration_budget() {
2720 let mut config = Configuration::default();
2721 let mut budget = hashbrown::HashMap::new();
2722 budget.insert(
2723 case_insensitive_string::CaseInsensitiveString::from("/path"),
2724 100u32,
2725 );
2726 config.budget = Some(budget);
2727 assert!(config.budget.is_some());
2728 assert_eq!(
2729 config.budget.as_ref().unwrap().get(
2730 &case_insensitive_string::CaseInsensitiveString::from("/path")
2731 ),
2732 Some(&100u32)
2733 );
2734 }
2735
2736 #[cfg(not(feature = "regex"))]
2737 #[test]
2738 fn test_allow_list_set_default() {
2739 let allow_list = AllowListSet::default();
2740 assert!(allow_list.0.is_empty());
2741 }
2742
2743 #[cfg(feature = "agent")]
2744 #[test]
2745 fn test_build_remote_multimodal_engine_preserves_dual_models() {
2746 use crate::features::automation::{
2747 ModelEndpoint, RemoteMultimodalConfigs, VisionRouteMode,
2748 };
2749
2750 let mut config = Configuration::default();
2751 let mm = RemoteMultimodalConfigs::new(
2752 "https://api.example.com/v1/chat/completions",
2753 "primary-model",
2754 )
2755 .with_vision_model(ModelEndpoint::new("vision-model").with_api_key("vision-key"))
2756 .with_text_model(
2757 ModelEndpoint::new("text-model")
2758 .with_api_url("https://text.example.com/v1/chat/completions")
2759 .with_api_key("text-key"),
2760 )
2761 .with_vision_route_mode(VisionRouteMode::TextFirst);
2762 config.remote_multimodal = Some(Box::new(mm));
2763
2764 let engine = config
2765 .build_remote_multimodal_engine()
2766 .expect("engine should be built");
2767
2768 assert_eq!(
2769 engine.vision_model.as_ref().map(|m| m.model_name.as_str()),
2770 Some("vision-model")
2771 );
2772 assert_eq!(
2773 engine.text_model.as_ref().map(|m| m.model_name.as_str()),
2774 Some("text-model")
2775 );
2776 assert_eq!(engine.vision_route_mode, VisionRouteMode::TextFirst);
2777 }
2778
2779 #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
2780 #[test]
2781 fn test_spider_browser_config_defaults() {
2782 let cfg = SpiderBrowserConfig::new("test-key");
2783 assert_eq!(cfg.api_key, "test-key");
2784 assert_eq!(cfg.wss_url, "wss://browser.spider.cloud/v1/browser");
2785 assert!(!cfg.stealth);
2786 assert!(cfg.browser.is_none());
2787 assert!(cfg.country.is_none());
2788 assert!(cfg.extra_params.is_none());
2789 }
2790
2791 #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
2792 #[test]
2793 fn test_spider_browser_connection_url_basic() {
2794 let cfg = SpiderBrowserConfig::new("sk-abc123");
2795 assert_eq!(
2796 cfg.connection_url(),
2797 "wss://browser.spider.cloud/v1/browser?token=sk-abc123"
2798 );
2799 }
2800
2801 #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
2802 #[test]
2803 fn test_spider_browser_connection_url_full() {
2804 let cfg = SpiderBrowserConfig::new("sk-abc123")
2805 .with_stealth(true)
2806 .with_browser("chrome")
2807 .with_country("us")
2808 .with_extra_params(vec![("timeout".into(), "30000".into())]);
2809 assert_eq!(
2810 cfg.connection_url(),
2811 "wss://browser.spider.cloud/v1/browser?token=sk-abc123&stealth=true&browser=chrome&country=us&timeout=30000"
2812 );
2813 }
2814
2815 #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
2816 #[test]
2817 fn test_spider_browser_connection_url_custom_wss() {
2818 let cfg = SpiderBrowserConfig::new("key")
2819 .with_wss_url("wss://custom.browser.example.com/v1/browser");
2820 assert_eq!(
2821 cfg.connection_url(),
2822 "wss://custom.browser.example.com/v1/browser?token=key"
2823 );
2824 }
2825
2826 #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
2827 #[test]
2828 fn test_with_spider_browser_sets_chrome_connection() {
2829 let mut config = Configuration::default();
2830 config.with_spider_browser("my-api-key");
2831 assert_eq!(
2832 config.chrome_connection_url.as_deref(),
2833 Some("wss://browser.spider.cloud/v1/browser?token=my-api-key")
2834 );
2835 assert!(config.spider_browser.is_some());
2836 }
2837
2838 #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
2839 #[test]
2840 fn test_with_spider_browser_config_stealth() {
2841 let mut config = Configuration::default();
2842 let browser_cfg = SpiderBrowserConfig::new("key")
2843 .with_stealth(true)
2844 .with_country("gb");
2845 config.with_spider_browser_config(browser_cfg);
2846 assert_eq!(
2847 config.chrome_connection_url.as_deref(),
2848 Some("wss://browser.spider.cloud/v1/browser?token=key&stealth=true&country=gb")
2849 );
2850 }
2851}