1use crate::compact_str::CompactString;
2use crate::features::chrome_common::RequestInterceptConfiguration;
3pub use crate::features::chrome_common::{
4 AuthChallengeResponse, AuthChallengeResponseResponse, AutomationScripts, AutomationScriptsMap,
5 CaptureScreenshotFormat, CaptureScreenshotParams, ClipViewport, ExecutionScripts,
6 ExecutionScriptsMap, ScreenShotConfig, ScreenshotParams, Viewport, WaitFor, WaitForDelay,
7 WaitForIdleNetwork, WaitForSelector, WebAutomation,
8};
9pub use crate::features::gemini_common::GeminiConfigs;
10pub use crate::features::openai_common::GPTConfigs;
11#[cfg(feature = "search")]
12pub use crate::features::search::{
13 SearchError, SearchOptions, SearchResult, SearchResults, TimeRange,
14};
15pub use crate::features::webdriver_common::{WebDriverBrowser, WebDriverConfig};
16use crate::utils::get_domain_from_url;
17use crate::utils::BasicCachePolicy;
18use crate::website::CronType;
19use reqwest::header::{AsHeaderName, HeaderMap, HeaderName, HeaderValue, IntoHeaderName};
20use std::net::IpAddr;
21use std::sync::Arc;
22use std::time::Duration;
23
24#[cfg(feature = "chrome")]
25pub use spider_fingerprint::Fingerprint;
26
/// Returns `true` when `key` is blank or one of the known placeholder API keys.
///
/// Surrounding whitespace is trimmed first and the comparison is ASCII
/// case-insensitive. Recognized placeholders are `YOUR_API_KEY`,
/// `YOUR-API-KEY`, `API_KEY`, and `API-KEY`.
pub fn is_placeholder_api_key(key: &str) -> bool {
    const PLACEHOLDERS: [&str; 4] = ["YOUR_API_KEY", "YOUR-API-KEY", "API_KEY", "API-KEY"];

    let candidate = key.trim();
    if candidate.is_empty() {
        return true;
    }

    PLACEHOLDERS
        .iter()
        .any(|placeholder| candidate.eq_ignore_ascii_case(placeholder))
}
36
/// Redirect handling policy stored in [`Configuration::redirect_policy`].
///
/// With the `serde` feature each variant also accepts the capitalized,
/// lowercase, and uppercase spellings listed in its `alias` attributes.
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum RedirectPolicy {
    /// The default policy.
    #[default]
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Loose", alias = "loose", alias = "LOOSE",)
    )]
    Loose,
    /// Strict policy (exact semantics are defined by the consumer of this enum).
    #[cfg_attr(
        feature = "serde",
        serde(alias = "Strict", alias = "strict", alias = "STRICT",)
    )]
    Strict,
    /// No policy; presumably redirects are not followed — TODO confirm with the client builder.
    #[cfg_attr(
        feature = "serde",
        serde(alias = "None", alias = "none", alias = "NONE",)
    )]
    None,
}
61
/// Allow/deny list storage without the `regex` feature: a plain list of strings.
#[cfg(not(feature = "regex"))]
pub type AllowList = Vec<CompactString>;

/// Allow/deny list storage with the `regex` feature: a boxed [`regex::RegexSet`].
#[cfg(feature = "regex")]
pub type AllowList = Box<regex::RegexSet>;
69
/// Newtype around [`AllowList`] so serde support can be implemented by hand.
///
/// `PartialEq`/`Eq` are only derived without the `regex` feature.
#[derive(Debug, Default, Clone)]
#[cfg_attr(not(feature = "regex"), derive(PartialEq, Eq))]
pub struct AllowListSet(pub AllowList);
74
/// Flags selecting which Chrome events are tracked (only with the `chrome` feature).
#[cfg(feature = "chrome")]
#[derive(Debug, PartialEq, Eq, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ChromeEventTracker {
    /// Track responses.
    pub responses: bool,
    /// Track requests.
    pub requests: bool,
    /// Track automation; `ChromeEventTracker::new` always sets this to `true`.
    pub automation: bool,
}
87
#[cfg(feature = "chrome")]
impl ChromeEventTracker {
    /// Creates a tracker from the `requests` and `responses` flags.
    ///
    /// Note the argument order (`requests` first); `automation` is always enabled.
    pub fn new(requests: bool, responses: bool) -> Self {
        Self {
            responses,
            requests,
            automation: true,
        }
    }
}
99
/// Records which sitemap entries `Configuration::add_sitemap_to_whitelist`
/// pushed onto the whitelist, so they can be removed again afterwards.
#[cfg(feature = "sitemap")]
#[derive(Debug, Default)]
pub struct SitemapWhitelistChanges {
    /// The default `sitemap.xml` entry was added.
    pub added_default: bool,
    /// The custom `sitemap_url` entry was added.
    pub added_custom: bool,
}
109
#[cfg(feature = "sitemap")]
impl SitemapWhitelistChanges {
    /// Returns `true` when at least one entry was added to the whitelist.
    pub(crate) fn modified(&self) -> bool {
        self.added_default || self.added_custom
    }
}
117
/// Selects a transport for which a [`RequestProxy`] should be ignored.
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProxyIgnore {
    /// Presumably: do not use the proxy for Chrome — TODO confirm at the use site.
    Chrome,
    /// Presumably: do not use the proxy for plain HTTP requests — TODO confirm at the use site.
    Http,
    /// The default: nothing is ignored.
    #[default]
    No,
}
130
/// A proxy entry: its address plus the [`ProxyIgnore`] rule attached to it.
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct RequestProxy {
    /// The proxy address.
    pub addr: String,
    /// Which transport, if any, should ignore this proxy (defaults to [`ProxyIgnore::No`]).
    pub ignore: ProxyIgnore,
}
140
141#[derive(Debug, Clone, PartialEq, Eq, Hash)]
159#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
160pub enum ProxyKind {
161 Default,
164 MediaAsset,
169 Custom(CompactString),
171}
172
173impl Default for ProxyKind {
174 #[inline]
175 fn default() -> Self {
176 ProxyKind::Default
177 }
178}
179
/// Protocol used to talk to a parallel backend (only with the `parallel_backends` feature).
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BackendProtocol {
    /// Chrome DevTools Protocol.
    Cdp,
    /// WebDriver.
    WebDriver,
}
190
/// Engine behind a parallel backend (only with the `parallel_backends` feature).
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum BackendEngine {
    /// The default engine.
    #[default]
    Cdp,
    /// Servo engine.
    Servo,
    /// A custom engine.
    Custom,
}
205
/// Description of one parallel backend (only with the `parallel_backends` feature).
///
/// With `serde`, missing fields fall back to `Default` (`serde(default)`).
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct BackendEndpoint {
    /// Engine of this backend.
    pub engine: BackendEngine,
    /// Optional endpoint address.
    pub endpoint: Option<String>,
    /// Optional path to a binary (presumably launched locally — TODO confirm).
    pub binary_path: Option<String>,
    /// Optional protocol override.
    pub protocol: Option<BackendProtocol>,
    /// Optional proxy for this backend.
    pub proxy: Option<String>,
}
238
/// Settings for the parallel backends (only with the `parallel_backends` feature).
///
/// Default values come from the `Default` impl for this type; with `serde`,
/// missing fields fall back to those defaults (`serde(default)`).
#[cfg(feature = "parallel_backends")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct ParallelBackendsConfig {
    /// The configured backends.
    pub backends: Vec<BackendEndpoint>,
    /// Grace period in milliseconds.
    pub grace_period_ms: u64,
    /// Whether parallel backends are enabled.
    pub enabled: bool,
    /// Fast-accept threshold (scale not visible here).
    pub fast_accept_threshold: u16,
    /// Maximum number of consecutive errors.
    pub max_consecutive_errors: u16,
    /// Connect timeout in milliseconds.
    pub connect_timeout_ms: u64,
    /// Whether binary content types are skipped.
    pub skip_binary_content_types: bool,
    /// Maximum number of concurrent sessions.
    pub max_concurrent_sessions: usize,
    /// Extensions to skip.
    pub skip_extensions: Vec<CompactString>,
    /// Maximum number of backend bytes in flight.
    pub max_backend_bytes_in_flight: usize,
    /// Backend timeout in milliseconds.
    pub backend_timeout_ms: u64,
}
292
#[cfg(feature = "parallel_backends")]
impl Default for ParallelBackendsConfig {
    /// Enabled, with no backends or skipped extensions and the numeric defaults below.
    fn default() -> Self {
        /// 256 MiB.
        const MAX_BYTES_IN_FLIGHT: usize = 256 * 1024 * 1024;

        Self {
            enabled: true,
            backends: Vec::new(),
            skip_extensions: Vec::new(),
            skip_binary_content_types: true,
            grace_period_ms: 500,
            connect_timeout_ms: 5_000,
            backend_timeout_ms: 30_000,
            fast_accept_threshold: 80,
            max_consecutive_errors: 10,
            max_concurrent_sessions: 8,
            max_backend_bytes_in_flight: MAX_BYTES_IN_FLIGHT,
        }
    }
}
311
/// User-supplied anti-bot detection patterns, grouped by what they are matched against.
///
/// With `serde`, missing fields default to empty lists (`serde(default)`).
#[derive(Debug, Default, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct CustomAntibotPatterns {
    /// Patterns for the response body.
    pub body: Vec<CompactString>,
    /// Patterns for the URL.
    pub url: Vec<CompactString>,
    /// Patterns for header keys.
    pub header_keys: Vec<CompactString>,
}
324
/// Crawl configuration.
///
/// Fields are normally set through the `with_*` builder methods on this type.
/// Many fields only exist when the matching Cargo feature is enabled (see the
/// `cfg` attribute on each one). `PartialEq` is derived only when none of the
/// `regex`, `openai`, `cache_openai`, `gemini`, and `cache_gemini` features
/// are enabled. With `serde`, missing fields fall back to `Default`.
#[derive(Debug, Default, Clone)]
#[cfg_attr(
    all(
        not(feature = "regex"),
        not(feature = "openai"),
        not(feature = "cache_openai"),
        not(feature = "gemini"),
        not(feature = "cache_gemini")
    ),
    derive(PartialEq)
)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(default))]
pub struct Configuration {
    /// Whether robots.txt is respected.
    pub respect_robots_txt: bool,
    /// Subdomain handling flag.
    pub subdomains: bool,
    /// TLD handling flag.
    pub tld: bool,
    /// Optional timeout for the whole crawl.
    pub crawl_timeout: Option<Duration>,
    /// Whether the host header is preserved.
    pub preserve_host_header: bool,
    /// Raw blacklist entries; compiled into `blacklist` by `set_blacklist`.
    pub blacklist_url: Option<Vec<CompactString>>,
    /// Raw whitelist entries; compiled into `whitelist` by `set_whitelist`.
    pub whitelist_url: Option<Vec<CompactString>>,
    /// Optional user agent.
    pub user_agent: Option<Box<CompactString>>,
    /// Delay value (unit not visible in this file; `new` sets 0).
    pub delay: u64,
    /// Optional per-request timeout (`new` sets 120 seconds).
    pub request_timeout: Option<Duration>,
    /// HTTP/2 prior-knowledge flag.
    pub http2_prior_knowledge: bool,
    /// Optional proxy list.
    pub proxies: Option<Vec<RequestProxy>>,
    /// Optional proxies grouped by [`ProxyKind`]; maintained by `with_proxies_for_kind`.
    pub proxies_by_kind: Option<hashbrown::HashMap<ProxyKind, Vec<RequestProxy>>>,
    /// Optional extra headers.
    pub headers: Option<Box<SerializableHeaderMap>>,
    /// Optional custom sitemap URL.
    #[cfg(feature = "sitemap")]
    pub sitemap_url: Option<Box<CompactString>>,
    /// Whether the sitemap is ignored.
    #[cfg(feature = "sitemap")]
    pub ignore_sitemap: bool,
    /// Redirect limit (`new` sets 7).
    pub redirect_limit: usize,
    /// Redirect policy.
    pub redirect_policy: RedirectPolicy,
    /// Set to `true` by `with_redirect_limit`; never (de)serialized.
    #[cfg_attr(feature = "serde", serde(skip))]
    pub redirect_limit_set: bool,
    /// Optional cap on main-frame navigations.
    pub max_main_frame_navigations: Option<u32>,
    /// Cookie string.
    #[cfg(feature = "cookies")]
    pub cookie_str: String,
    /// Optional `wreq` emulation.
    #[cfg(feature = "wreq")]
    pub emulation: Option<wreq_util::Emulation>,
    /// Cron expression string.
    #[cfg(feature = "cron")]
    pub cron_str: String,
    /// Cron type.
    #[cfg(feature = "cron")]
    pub cron_type: CronType,
    /// Crawl depth (`new` sets 25).
    pub depth: usize,
    /// Depth distance.
    pub depth_distance: usize,
    /// Stealth tier.
    pub stealth_mode: spider_fingerprint::configs::Tier,
    /// Optional viewport.
    pub viewport: Option<Viewport>,
    /// Optional crawl budget; copied into `inner_budget` by `configure_budget`.
    pub budget: Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
    /// Wildcard budgeting flag.
    pub wild_card_budgeting: bool,
    /// Shared, case-insensitive set of external domains.
    pub external_domains_caseless:
        Arc<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>,
    /// Full-resources flag.
    pub full_resources: bool,
    /// Whether invalid certificates are accepted.
    pub accept_invalid_certs: bool,
    /// Optional auth-challenge response.
    pub auth_challenge_response: Option<AuthChallengeResponse>,
    /// Optional OpenAI configuration.
    pub openai_config: Option<Box<GPTConfigs>>,
    /// Optional Gemini configuration.
    pub gemini_config: Option<Box<GeminiConfigs>>,
    /// Optional remote multimodal configuration; read by `build_remote_multimodal_engine`.
    pub remote_multimodal: Option<Box<crate::features::automation::RemoteMultimodalConfigs>>,
    /// Shared-queue flag.
    pub shared_queue: bool,
    /// Whether page links are returned.
    pub return_page_links: bool,
    /// Retry count.
    pub retry: u8,
    /// Optional custom anti-bot patterns.
    pub custom_antibot: Option<CustomAntibotPatterns>,
    /// Disables the control thread when set.
    pub no_control_thread: bool,
    /// Compiled blacklist; private, exposed via `get_blacklist_compiled`.
    blacklist: AllowListSet,
    /// Compiled whitelist; private, exposed via `get_whitelist_compiled`.
    whitelist: AllowListSet,
    /// Working copy of `budget`, refreshed by `configure_budget`.
    pub(crate) inner_budget:
        Option<hashbrown::HashMap<case_insensitive_string::CaseInsensitiveString, u32>>,
    /// HTML-only flag (`new` sets `true`).
    pub only_html: bool,
    /// Optional concurrency limit.
    pub concurrency_limit: Option<usize>,
    /// Normalize flag.
    pub normalize: bool,
    /// Shared flag.
    pub shared: bool,
    /// Header-modification flag (`new` sets `true`).
    pub modify_headers: bool,
    /// Header-modification flag for the HTTP client.
    pub modify_http_client_headers: bool,
    /// Caching flag.
    #[cfg(any(
        feature = "cache_request",
        feature = "chrome",
        feature = "chrome_remote_cache"
    ))]
    pub cache: bool,
    /// Flag set by `with_cache_skip_browser`.
    #[cfg(any(
        feature = "cache_request",
        feature = "chrome",
        feature = "chrome_remote_cache"
    ))]
    pub cache_skip_browser: bool,
    /// Optional cache namespace.
    pub cache_namespace: Option<Box<String>>,
    /// Read-only flag for the Chrome remote cache.
    #[cfg(feature = "chrome_remote_cache")]
    pub chrome_remote_cache_read_only: bool,
    /// Skip-browser flag for the remote cache.
    #[cfg(feature = "chrome_remote_cache")]
    pub remote_cache_skip_browser: bool,
    /// Main-document-only flag for the Chrome remote cache.
    #[cfg(feature = "chrome_remote_cache")]
    pub chrome_remote_cache_main_doc_only: bool,
    /// Service-worker flag (`new` sets `true` with `chrome`).
    #[cfg(feature = "chrome")]
    pub service_worker_enabled: bool,
    // NOTE(review): the `cfg` attribute below is duplicated; harmless, but one copy can be dropped.
    /// Optional timezone id.
    #[cfg(feature = "chrome")]
    #[cfg(feature = "chrome")]
    pub timezone_id: Option<Box<String>>,
    /// Optional locale.
    #[cfg(feature = "chrome")]
    pub locale: Option<Box<String>>,
    /// Optional script evaluated on new documents.
    #[cfg(feature = "chrome")]
    pub evaluate_on_new_document: Option<Box<String>>,
    /// Optional dialog-dismissal setting.
    #[cfg(feature = "chrome")]
    pub dismiss_dialogs: Option<bool>,
    /// Optional wait-for settings.
    #[cfg(feature = "chrome")]
    pub wait_for: Option<WaitFor>,
    /// Optional screenshot settings.
    #[cfg(feature = "chrome")]
    pub screenshot: Option<ScreenShotConfig>,
    /// Optional event tracking settings.
    #[cfg(feature = "chrome")]
    pub track_events: Option<ChromeEventTracker>,
    /// Fingerprint setting (`new` sets `Fingerprint::Basic`).
    #[cfg(feature = "chrome")]
    pub fingerprint: Fingerprint,
    /// Optional Chrome connection URL.
    #[cfg(feature = "chrome")]
    pub chrome_connection_url: Option<String>,
    /// Optional list of Chrome connection URLs.
    #[cfg(feature = "chrome")]
    pub chrome_connection_urls: Option<Vec<String>>,
    /// Crate-internal failover state; never (de)serialized.
    #[cfg(feature = "chrome")]
    #[cfg_attr(feature = "serde", serde(skip))]
    pub(crate) chrome_failover: crate::features::chrome::LazyChromeFailover,
    /// Optional Chrome first-byte timeout.
    #[cfg(feature = "chrome")]
    pub chrome_first_byte_timeout: Option<Duration>,
    /// Optional jitter for the Chrome first-byte timeout.
    #[cfg(feature = "chrome")]
    pub chrome_first_byte_timeout_jitter: Option<Duration>,
    /// Optional HTTP first-byte timeout.
    pub http_first_byte_timeout: Option<Duration>,
    /// Optional jitter for the HTTP first-byte timeout.
    pub http_first_byte_timeout_jitter: Option<Duration>,
    /// Optional execution scripts.
    #[cfg(feature = "chrome")]
    pub execution_scripts: Option<ExecutionScripts>,
    /// Optional automation scripts.
    #[cfg(feature = "chrome")]
    pub automation_scripts: Option<AutomationScripts>,
    /// Request interception settings.
    #[cfg(feature = "chrome")]
    pub chrome_intercept: RequestInterceptConfiguration,
    /// Optional referer.
    pub referer: Option<String>,
    /// Optional maximum page size in bytes.
    pub max_page_bytes: Option<f64>,
    /// Optional maximum number of bytes allowed.
    pub max_bytes_allowed: Option<u64>,
    /// Log-disabling flag.
    #[cfg(feature = "chrome")]
    pub disable_log: bool,
    /// Automatic geolocation flag (`new` sets `false`).
    #[cfg(feature = "chrome")]
    pub auto_geolocation: bool,
    /// Optional cache policy.
    pub cache_policy: Option<BasicCachePolicy>,
    /// CSP bypass flag.
    #[cfg(feature = "chrome")]
    pub bypass_csp: bool,
    /// JavaScript-disabling flag.
    #[cfg(feature = "chrome")]
    pub disable_javascript: bool,
    /// Optional network interface name.
    pub network_interface: Option<String>,
    /// Optional local address.
    pub local_address: Option<IpAddr>,
    /// Optional default HTTP connect timeout.
    pub default_http_connect_timeout: Option<Duration>,
    /// Optional default HTTP read timeout.
    pub default_http_read_timeout: Option<Duration>,
    /// Optional WebDriver configuration.
    #[cfg(feature = "webdriver")]
    pub webdriver_config: Option<Box<WebDriverConfig>>,
    /// Optional search configuration.
    #[cfg(feature = "search")]
    pub search_config: Option<Box<SearchConfig>>,
    /// Optional Spider Cloud configuration.
    #[cfg(feature = "spider_cloud")]
    pub spider_cloud: Option<Box<SpiderCloudConfig>>,
    /// Optional Spider Browser configuration.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    pub spider_browser: Option<Box<SpiderBrowserConfig>>,
    /// Optional hedging configuration.
    #[cfg(feature = "hedge")]
    pub hedge: Option<crate::utils::hedge::HedgeConfig>,
    /// Optional auto-throttle configuration.
    #[cfg(feature = "auto_throttle")]
    pub auto_throttle: Option<crate::utils::auto_throttle::AutoThrottleConfig>,
    /// ETag cache flag.
    #[cfg(feature = "etag_cache")]
    pub etag_cache: bool,
    /// Optional WARC configuration.
    #[cfg(feature = "warc")]
    pub warc: Option<crate::utils::warc::WarcConfig>,
    /// Optional parallel backends configuration.
    #[cfg(feature = "parallel_backends")]
    pub parallel_backends: Option<ParallelBackendsConfig>,
    /// Optional worker connection URLs.
    #[cfg(feature = "decentralized")]
    pub worker_connection_urls: Option<Vec<String>>,
    /// Optional scraper worker connection URLs.
    #[cfg(feature = "decentralized")]
    pub scraper_worker_connection_urls: Option<Vec<String>>,
}
702
/// Newtype around [`HeaderMap`] that gets hand-written serde impls when the
/// `serde` feature is enabled.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub struct SerializableHeaderMap(pub HeaderMap);
706
707impl SerializableHeaderMap {
708 pub fn inner(&self) -> &HeaderMap {
710 &self.0
711 }
712 pub fn contains_key<K>(&self, key: K) -> bool
714 where
715 K: AsHeaderName,
716 {
717 self.0.contains_key(key)
718 }
719 pub fn insert<K>(
721 &mut self,
722 key: K,
723 val: reqwest::header::HeaderValue,
724 ) -> Option<reqwest::header::HeaderValue>
725 where
726 K: IntoHeaderName,
727 {
728 self.0.insert(key, val)
729 }
730 pub fn extend<I>(&mut self, iter: I)
732 where
733 I: IntoIterator<Item = (Option<HeaderName>, HeaderValue)>,
734 {
735 self.0.extend(iter);
736 }
737}
738
739pub fn get_referer(header_map: &Option<Box<SerializableHeaderMap>>) -> Option<String> {
741 match header_map {
742 Some(header_map) => {
743 header_map
744 .0
745 .get(crate::client::header::REFERER) .and_then(|value| value.to_str().ok()) .map(String::from) }
749 _ => None,
750 }
751}
752
753impl From<HeaderMap> for SerializableHeaderMap {
754 fn from(header_map: HeaderMap) -> Self {
755 SerializableHeaderMap(header_map)
756 }
757}
758
#[cfg(feature = "serde")]
impl serde::Serialize for SerializableHeaderMap {
    /// Serializes the headers as a name → value string map, ordered by name.
    ///
    /// A value for which `HeaderValue::to_str` fails is written as an empty
    /// string. When a name has several values only the last one is kept,
    /// because the intermediate map holds one value per name.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let mut flattened = std::collections::BTreeMap::<String, String>::new();

        for (name, value) in self.0.iter() {
            flattened.insert(name.to_string(), value.to_str().unwrap_or("").to_string());
        }

        serde::Serialize::serialize(&flattened, serializer)
    }
}
773
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for SerializableHeaderMap {
    /// Deserializes a name → value string map into a header map.
    ///
    /// Fails with a custom serde error on the first entry whose name or value
    /// is rejected by `HeaderName::from_bytes` / `HeaderValue::from_str`.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let entries =
            <std::collections::BTreeMap<String, String> as serde::Deserialize>::deserialize(
                deserializer,
            )?;
        let mut headers = HeaderMap::with_capacity(entries.len());

        for (raw_name, raw_value) in entries {
            let name = reqwest::header::HeaderName::from_bytes(raw_name.as_bytes())
                .map_err(serde::de::Error::custom)?;
            let value = reqwest::header::HeaderValue::from_str(&raw_value)
                .map_err(serde::de::Error::custom)?;
            headers.insert(name, value);
        }

        Ok(Self(headers))
    }
}
792
#[cfg(feature = "serde")]
impl serde::Serialize for AllowListSet {
    /// Serializes the list as a sequence of strings.
    ///
    /// With the `regex` feature the sequence holds the set's source patterns.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        #[cfg(not(feature = "regex"))]
        {
            serde::Serialize::serialize(&self.0, serializer)
        }

        #[cfg(feature = "regex")]
        {
            let patterns: &[String] = self.0.patterns();
            serde::Serialize::serialize(patterns, serializer)
        }
    }
}
814
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for AllowListSet {
    /// Deserializes a sequence of strings into the list.
    ///
    /// With the `regex` feature the strings are compiled into a `RegexSet`;
    /// a compile failure is reported as a custom serde error.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        #[cfg(not(feature = "regex"))]
        {
            <Vec<CompactString> as serde::Deserialize>::deserialize(deserializer).map(AllowListSet)
        }

        #[cfg(feature = "regex")]
        {
            let patterns = <Vec<String> as serde::Deserialize>::deserialize(deserializer)?;

            regex::RegexSet::new(&patterns)
                .map(|set| AllowListSet(Box::new(set)))
                .map_err(serde::de::Error::custom)
        }
    }
}
835
/// Returns a spoofed user agent from `ua_generator`: a Chrome one when
/// `chrome` is `true`, otherwise a generic one.
#[cfg(feature = "ua_generator")]
pub fn get_ua(chrome: bool) -> &'static str {
    match chrome {
        true => ua_generator::ua::spoof_chrome_ua(),
        false => ua_generator::ua::spoof_ua(),
    }
}
845
846#[cfg(not(feature = "ua_generator"))]
848pub fn get_ua(_chrome: bool) -> &'static str {
849 use std::env;
850
851 lazy_static! {
852 static ref AGENT: &'static str =
853 concat!(env!("CARGO_PKG_NAME"), '/', env!("CARGO_PKG_VERSION"));
854 };
855
856 AGENT.as_ref()
857}
858
859impl Configuration {
    /// Creates a configuration with the crate's starting values (no `chrome` feature).
    ///
    /// Sets `delay` 0, `depth` 25, `redirect_limit` 7, a 120 second
    /// `request_timeout`, `only_html` and `modify_headers`; every other field
    /// comes from `Default`.
    #[cfg(not(feature = "chrome"))]
    pub fn new() -> Self {
        Self {
            delay: 0,
            depth: 25,
            redirect_limit: 7,
            request_timeout: Some(Duration::from_secs(120)),
            only_html: true,
            modify_headers: true,
            ..Default::default()
        }
    }

    /// Creates a configuration with the crate's starting values (`chrome` feature).
    ///
    /// On top of the non-chrome values this enables `cache` and
    /// `service_worker_enabled`, uses `Fingerprint::Basic`, disables
    /// `auto_geolocation`, and sets the user agent from `get_ua(true)`.
    #[cfg(feature = "chrome")]
    pub fn new() -> Self {
        Self {
            delay: 0,
            depth: 25,
            redirect_limit: 7,
            request_timeout: Some(Duration::from_secs(120)),
            // Interception is on only when the `chrome_intercept` feature is compiled in.
            chrome_intercept: RequestInterceptConfiguration::new(cfg!(
                feature = "chrome_intercept"
            )),
            user_agent: Some(Box::new(get_ua(true).into())),
            only_html: true,
            cache: true,
            modify_headers: true,
            service_worker_enabled: true,
            fingerprint: Fingerprint::Basic,
            auto_geolocation: false,
            ..Default::default()
        }
    }
895
    /// Builds a `RemoteMultimodalEngine` from `remote_multimodal`.
    ///
    /// Returns `None` when `remote_multimodal` is unset. Only available with
    /// the `agent` feature.
    #[cfg(feature = "agent")]
    pub fn build_remote_multimodal_engine(
        &self,
    ) -> Option<crate::features::automation::RemoteMultimodalEngine> {
        let cfgs = self.remote_multimodal.as_ref()?;
        // A semaphore is only created for a positive concurrency limit.
        let sem = cfgs
            .concurrency_limit
            .filter(|&n| n > 0)
            .map(|n| std::sync::Arc::new(tokio::sync::Semaphore::new(n)));

        // `mut` is only needed when the `agent_skills` block below is compiled in.
        #[allow(unused_mut)]
        let mut engine = crate::features::automation::RemoteMultimodalEngine::new(
            cfgs.api_url.clone(),
            cfgs.model_name.clone(),
            cfgs.system_prompt.clone(),
        )
        .with_api_key(cfgs.api_key.as_deref())
        .with_system_prompt_extra(cfgs.system_prompt_extra.as_deref())
        .with_user_message_extra(cfgs.user_message_extra.as_deref())
        .with_remote_multimodal_config(cfgs.cfg.clone())
        .with_prompt_url_gate(cfgs.prompt_url_gate.clone())
        .with_vision_model(cfgs.vision_model.clone())
        .with_text_model(cfgs.text_model.clone())
        .with_vision_route_mode(cfgs.vision_route_mode)
        .with_chrome_ai(cfgs.use_chrome_ai)
        .with_semaphore(sem)
        .to_owned();

        #[cfg(feature = "agent_skills")]
        if let Some(ref registry) = cfgs.skill_registry {
            engine.with_skill_registry(Some(registry.clone()));
        }

        let model_pool = cfgs.model_pool.clone();
        // A router is only installed when the pool has at least three entries.
        if model_pool.len() >= 3 {
            let model_names: Vec<&str> =
                model_pool.iter().map(|ep| ep.model_name.as_str()).collect();
            let policy = crate::features::automation::auto_policy(&model_names);
            engine.model_router = Some(crate::features::automation::ModelRouter::with_policy(
                policy,
            ));
        }
        engine.model_pool = model_pool;

        Some(engine)
    }
945
    /// Always `false` without the `chrome` feature.
    #[cfg(not(feature = "chrome"))]
    pub(crate) fn only_chrome_agent(&self) -> bool {
        false
    }

    /// Returns `true` when any Chrome-specific setting is active: a connection
    /// URL, a wait-for rule, request interception, stealth mode, or a valid
    /// fingerprint.
    #[cfg(feature = "chrome")]
    pub(crate) fn only_chrome_agent(&self) -> bool {
        self.chrome_connection_url.is_some()
            || self.wait_for.is_some()
            || self.chrome_intercept.enabled
            || self.stealth_mode.stealth()
            || self.fingerprint.valid()
    }
961
962 #[cfg(feature = "regex")]
963 pub fn get_blacklist(&self) -> Box<regex::RegexSet> {
965 match &self.blacklist_url {
966 Some(blacklist) => match regex::RegexSet::new(&**blacklist) {
967 Ok(s) => Box::new(s),
968 _ => Default::default(),
969 },
970 _ => Default::default(),
971 }
972 }
973
974 #[cfg(not(feature = "regex"))]
975 pub fn get_blacklist(&self) -> AllowList {
977 match &self.blacklist_url {
978 Some(blacklist) => blacklist.to_owned(),
979 _ => Default::default(),
980 }
981 }
982
    /// Rebuilds the private compiled blacklist from `blacklist_url` via `get_blacklist`.
    pub(crate) fn set_blacklist(&mut self) {
        self.blacklist = AllowListSet(self.get_blacklist());
    }
987
    /// Rebuilds the private compiled whitelist from `whitelist_url` via `get_whitelist`.
    pub fn set_whitelist(&mut self) {
        self.whitelist = AllowListSet(self.get_whitelist());
    }
992
    /// Rebuilds both compiled lists: the whitelist first, then the blacklist.
    pub fn configure_allowlist(&mut self) {
        self.set_whitelist();
        self.set_blacklist();
    }
998
    /// Borrows the compiled blacklist last built by `set_blacklist`.
    pub fn get_blacklist_compiled(&self) -> &AllowList {
        &self.blacklist.0
    }
1003
    /// Copies `budget` into the crate-internal `inner_budget`.
    pub fn configure_budget(&mut self) {
        self.inner_budget.clone_from(&self.budget);
    }
1008
    /// Borrows the compiled whitelist last built by `set_whitelist`.
    pub fn get_whitelist_compiled(&self) -> &AllowList {
        &self.whitelist.0
    }
1013
1014 #[cfg(feature = "regex")]
1015 pub fn get_whitelist(&self) -> Box<regex::RegexSet> {
1017 match &self.whitelist_url {
1018 Some(whitelist) => match regex::RegexSet::new(&**whitelist) {
1019 Ok(s) => Box::new(s),
1020 _ => Default::default(),
1021 },
1022 _ => Default::default(),
1023 }
1024 }
1025
1026 #[cfg(not(feature = "regex"))]
1027 pub fn get_whitelist(&self) -> AllowList {
1029 match &self.whitelist_url {
1030 Some(whitelist) => whitelist.to_owned(),
1031 _ => Default::default(),
1032 }
1033 }
1034
1035 #[cfg(feature = "sitemap")]
1036 pub fn add_sitemap_to_whitelist(&mut self) -> SitemapWhitelistChanges {
1038 let mut changes = SitemapWhitelistChanges::default();
1039
1040 if self.ignore_sitemap && self.whitelist_url.is_none() {
1041 return changes;
1042 }
1043
1044 if let Some(list) = self.whitelist_url.as_mut() {
1045 if list.is_empty() {
1046 return changes;
1047 }
1048
1049 let default = CompactString::from("sitemap.xml");
1050
1051 if !list.contains(&default) {
1052 list.push(default);
1053 changes.added_default = true;
1054 }
1055
1056 if let Some(custom) = &self.sitemap_url {
1057 if !list.contains(custom) {
1058 list.push((**custom).clone());
1061 changes.added_custom = true;
1062 }
1063 }
1064 }
1065
1066 changes
1067 }
1068
    /// Undoes `add_sitemap_to_whitelist` using the `changes` it returned.
    ///
    /// Removes the first occurrence of each entry that was added, and resets
    /// `whitelist_url` to `None` if the list ends up empty.
    #[cfg(feature = "sitemap")]
    pub fn remove_sitemap_from_whitelist(&mut self, changes: SitemapWhitelistChanges) {
        if let Some(list) = self.whitelist_url.as_mut() {
            if changes.added_default {
                let default = CompactString::from("sitemap.xml");
                if let Some(pos) = list.iter().position(|s| s == default) {
                    list.remove(pos);
                }
            }
            if changes.added_custom {
                if let Some(custom) = &self.sitemap_url {
                    if let Some(pos) = list.iter().position(|s| *s == **custom) {
                        list.remove(pos);
                    }
                }
            }
            if list.is_empty() {
                self.whitelist_url = None;
            }
        }
    }
1091
    /// Sets `respect_robots_txt` and returns `self` for chaining.
    pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self {
        self.respect_robots_txt = respect_robots_txt;
        self
    }
1097
    /// Sets `subdomains` and returns `self` for chaining.
    pub fn with_subdomains(&mut self, subdomains: bool) -> &mut Self {
        self.subdomains = subdomains;
        self
    }
1103
    /// Sets `bypass_csp` and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_csp_bypass(&mut self, enabled: bool) -> &mut Self {
        self.bypass_csp = enabled;
        self
    }

    /// No-op without the `chrome` feature; returns `self` unchanged.
    #[cfg(not(feature = "chrome"))]
    pub fn with_csp_bypass(&mut self, _enabled: bool) -> &mut Self {
        self
    }
1116
    /// Sets `disable_javascript` and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_disable_javascript(&mut self, disabled: bool) -> &mut Self {
        self.disable_javascript = disabled;
        self
    }

    /// No-op without the `chrome` feature; returns `self` unchanged.
    #[cfg(not(feature = "chrome"))]
    pub fn with_disable_javascript(&mut self, _disabled: bool) -> &mut Self {
        self
    }
1129
    /// Sets `network_interface` (`None` clears it) and returns `self` for chaining.
    pub fn with_network_interface(&mut self, network_interface: Option<String>) -> &mut Self {
        self.network_interface = network_interface;
        self
    }
1135
    /// Sets `local_address` (`None` clears it) and returns `self` for chaining.
    pub fn with_local_address(&mut self, local_address: Option<IpAddr>) -> &mut Self {
        self.local_address = local_address;
        self
    }
1141
    /// Sets `tld` and returns `self` for chaining.
    pub fn with_tld(&mut self, tld: bool) -> &mut Self {
        self.tld = tld;
        self
    }
1147
    /// Sets `crawl_timeout` (`None` clears it) and returns `self` for chaining.
    pub fn with_crawl_timeout(&mut self, crawl_timeout: Option<Duration>) -> &mut Self {
        self.crawl_timeout = crawl_timeout;
        self
    }
1153
    /// Sets `delay` and returns `self` for chaining.
    pub fn with_delay(&mut self, delay: u64) -> &mut Self {
        self.delay = delay;
        self
    }
1159
    /// Sets `http2_prior_knowledge` and returns `self` for chaining.
    pub fn with_http2_prior_knowledge(&mut self, http2_prior_knowledge: bool) -> &mut Self {
        self.http2_prior_knowledge = http2_prior_knowledge;
        self
    }
1165
1166 pub fn with_request_timeout(&mut self, request_timeout: Option<Duration>) -> &mut Self {
1168 match request_timeout {
1169 Some(timeout) => self.request_timeout = Some(timeout),
1170 _ => self.request_timeout = None,
1171 };
1172
1173 self
1174 }
1175
1176 #[cfg(feature = "sitemap")]
1177 pub fn with_sitemap(&mut self, sitemap_url: Option<&str>) -> &mut Self {
1179 match sitemap_url {
1180 Some(sitemap_url) => {
1181 self.sitemap_url = Some(CompactString::new(sitemap_url.to_string()).into())
1182 }
1183 _ => self.sitemap_url = None,
1184 };
1185 self
1186 }
1187
1188 #[cfg(not(feature = "sitemap"))]
1189 pub fn with_sitemap(&mut self, _sitemap_url: Option<&str>) -> &mut Self {
1191 self
1192 }
1193
    /// Sets `ignore_sitemap` and returns `self` for chaining.
    #[cfg(feature = "sitemap")]
    pub fn with_ignore_sitemap(&mut self, ignore_sitemap: bool) -> &mut Self {
        self.ignore_sitemap = ignore_sitemap;
        self
    }

    /// No-op without the `sitemap` feature; returns `self` unchanged.
    #[cfg(not(feature = "sitemap"))]
    pub fn with_ignore_sitemap(&mut self, _ignore_sitemap: bool) -> &mut Self {
        self
    }
1206
1207 pub fn with_user_agent(&mut self, user_agent: Option<&str>) -> &mut Self {
1209 match user_agent {
1210 Some(agent) => self.user_agent = Some(CompactString::new(agent).into()),
1211 _ => self.user_agent = None,
1212 };
1213 self
1214 }
1215
    /// Sets `preserve_host_header` and returns `self` for chaining.
    pub fn with_preserve_host_header(&mut self, preserve: bool) -> &mut Self {
        self.preserve_host_header = preserve;
        self
    }
1221
1222 #[cfg(feature = "agent")]
1225 pub fn with_remote_multimodal(
1226 &mut self,
1227 remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
1228 ) -> &mut Self {
1229 self.remote_multimodal = remote_multimodal.map(Box::new);
1230 self
1231 }
1232
1233 #[cfg(not(feature = "agent"))]
1236 pub fn with_remote_multimodal(
1237 &mut self,
1238 remote_multimodal: Option<crate::features::automation::RemoteMultimodalConfigs>,
1239 ) -> &mut Self {
1240 self.remote_multimodal = remote_multimodal.map(Box::new);
1241 self
1242 }
1243
1244 #[cfg(not(feature = "openai"))]
1245 pub fn with_openai(&mut self, _openai_config: Option<GPTConfigs>) -> &mut Self {
1247 self
1248 }
1249
1250 #[cfg(feature = "openai")]
1252 pub fn with_openai(&mut self, openai_config: Option<GPTConfigs>) -> &mut Self {
1253 match openai_config {
1254 Some(openai_config) => self.openai_config = Some(Box::new(openai_config)),
1255 _ => self.openai_config = None,
1256 };
1257 self
1258 }
1259
1260 #[cfg(not(feature = "gemini"))]
1261 pub fn with_gemini(&mut self, _gemini_config: Option<GeminiConfigs>) -> &mut Self {
1263 self
1264 }
1265
1266 #[cfg(feature = "gemini")]
1268 pub fn with_gemini(&mut self, gemini_config: Option<GeminiConfigs>) -> &mut Self {
1269 match gemini_config {
1270 Some(gemini_config) => self.gemini_config = Some(Box::new(gemini_config)),
1271 _ => self.gemini_config = None,
1272 };
1273 self
1274 }
1275
1276 #[cfg(feature = "cookies")]
1277 pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
1279 self.cookie_str = cookie_str.into();
1280 self
1281 }
1282
1283 #[cfg(not(feature = "cookies"))]
1284 pub fn with_cookies(&mut self, _cookie_str: &str) -> &mut Self {
1286 self
1287 }
1288
1289 #[cfg(feature = "chrome")]
1290 pub fn with_fingerprint(&mut self, fingerprint: bool) -> &mut Self {
1292 if fingerprint {
1293 self.fingerprint = Fingerprint::Basic;
1294 } else {
1295 self.fingerprint = Fingerprint::None;
1296 }
1297 self
1298 }
1299
    /// Sets `fingerprint` to an explicit [`Fingerprint`] value and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_fingerprint_advanced(&mut self, fingerprint: Fingerprint) -> &mut Self {
        self.fingerprint = fingerprint;
        self
    }
1306
    /// No-op without the `chrome` feature; returns `self` unchanged.
    #[cfg(not(feature = "chrome"))]
    pub fn with_fingerprint(&mut self, _fingerprint: bool) -> &mut Self {
        self
    }
1312
1313 pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
1315 self.proxies = proxies.map(|p| {
1316 p.iter()
1317 .map(|addr| RequestProxy {
1318 addr: addr.to_owned(),
1319 ..Default::default()
1320 })
1321 .collect::<Vec<RequestProxy>>()
1322 });
1323 self
1324 }
1325
    /// Sets `proxies` from ready-made [`RequestProxy`] entries (`None` clears
    /// it) and returns `self` for chaining.
    pub fn with_proxies_direct(&mut self, proxies: Option<Vec<RequestProxy>>) -> &mut Self {
        self.proxies = proxies;
        self
    }
1331
1332 pub fn with_proxies_for_kind(
1344 &mut self,
1345 kind: ProxyKind,
1346 proxies: Option<Vec<RequestProxy>>,
1347 ) -> &mut Self {
1348 match (proxies, self.proxies_by_kind.as_mut()) {
1349 (Some(p), Some(map)) => {
1350 map.insert(kind, p);
1351 }
1352 (Some(p), None) => {
1353 let mut map = hashbrown::HashMap::new();
1354 map.insert(kind, p);
1355 self.proxies_by_kind = Some(map);
1356 }
1357 (None, Some(map)) => {
1358 map.remove(&kind);
1359 if map.is_empty() {
1360 self.proxies_by_kind = None;
1361 }
1362 }
1363 (None, None) => {}
1364 }
1365 self
1366 }
1367
    /// Sets `shared_queue` and returns `self` for chaining.
    pub fn with_shared_queue(&mut self, shared_queue: bool) -> &mut Self {
        self.shared_queue = shared_queue;
        self
    }
1373
1374 pub fn with_blacklist_url<T>(&mut self, blacklist_url: Option<Vec<T>>) -> &mut Self
1376 where
1377 Vec<CompactString>: From<Vec<T>>,
1378 {
1379 match blacklist_url {
1380 Some(p) => self.blacklist_url = Some(p.into()),
1381 _ => self.blacklist_url = None,
1382 };
1383 self
1384 }
1385
1386 pub fn with_whitelist_url<T>(&mut self, whitelist_url: Option<Vec<T>>) -> &mut Self
1388 where
1389 Vec<CompactString>: From<Vec<T>>,
1390 {
1391 match whitelist_url {
1392 Some(p) => self.whitelist_url = Some(p.into()),
1393 _ => self.whitelist_url = None,
1394 };
1395 self
1396 }
1397
    /// Sets `return_page_links` and returns `self` for chaining.
    pub fn with_return_page_links(&mut self, return_page_links: bool) -> &mut Self {
        self.return_page_links = return_page_links;
        self
    }
1403
1404 pub fn with_headers(&mut self, headers: Option<reqwest::header::HeaderMap>) -> &mut Self {
1406 match headers {
1407 Some(m) => self.headers = Some(SerializableHeaderMap::from(m).into()),
1408 _ => self.headers = None,
1409 };
1410 self
1411 }
1412
    /// Sets `redirect_limit`, marks it as explicitly chosen via
    /// `redirect_limit_set`, and returns `self` for chaining.
    pub fn with_redirect_limit(&mut self, redirect_limit: usize) -> &mut Self {
        self.redirect_limit = redirect_limit;
        self.redirect_limit_set = true;
        self
    }
1423
    /// Sets `max_main_frame_navigations` (`None` clears it) and returns `self` for chaining.
    pub fn with_max_main_frame_navigations(&mut self, cap: Option<u32>) -> &mut Self {
        self.max_main_frame_navigations = cap;
        self
    }
1435
    /// Sets `redirect_policy` and returns `self` for chaining.
    pub fn with_redirect_policy(&mut self, policy: RedirectPolicy) -> &mut Self {
        self.redirect_policy = policy;
        self
    }
1441
    /// Sets `referer` (`None` clears it) and returns `self` for chaining.
    pub fn with_referer(&mut self, referer: Option<String>) -> &mut Self {
        self.referer = referer;
        self
    }
1447
1448 pub fn with_referrer(&mut self, referer: Option<String>) -> &mut Self {
1450 self.referer = referer;
1451 self
1452 }
1453
    /// Sets `full_resources` and returns `self` for chaining.
    pub fn with_full_resources(&mut self, full_resources: bool) -> &mut Self {
        self.full_resources = full_resources;
        self
    }
1459
    /// Sets `dismiss_dialogs` to `Some(dismiss_dialogs)` and returns `self` for chaining.
    #[cfg(feature = "chrome")]
    pub fn with_dismiss_dialogs(&mut self, dismiss_dialogs: bool) -> &mut Self {
        self.dismiss_dialogs = Some(dismiss_dialogs);
        self
    }

    /// No-op without the `chrome` feature; returns `self` unchanged.
    #[cfg(not(feature = "chrome"))]
    pub fn with_dismiss_dialogs(&mut self, _dismiss_dialogs: bool) -> &mut Self {
        self
    }
1472
    /// Sets `emulation` (`None` clears it) and returns `self` for chaining.
    /// Only available with the `wreq` feature.
    #[cfg(feature = "wreq")]
    pub fn with_emulation(&mut self, emulation: Option<wreq_util::Emulation>) -> &mut Self {
        self.emulation = emulation;
        self
    }
1479
1480 #[cfg(feature = "cron")]
1481 pub fn with_cron(&mut self, cron_str: &str, cron_type: CronType) -> &mut Self {
1483 self.cron_str = cron_str.into();
1484 self.cron_type = cron_type;
1485 self
1486 }
1487
1488 #[cfg(not(feature = "cron"))]
1489 pub fn with_cron(&mut self, _cron_str: &str, _cron_type: CronType) -> &mut Self {
1491 self
1492 }
1493
    /// Sets a budget of `limit` under the `"*"` key via `with_budget` and
    /// returns `self` for chaining.
    pub fn with_limit(&mut self, limit: u32) -> &mut Self {
        self.with_budget(Some(hashbrown::HashMap::from([("*", limit)])));
        self
    }
1499
    /// Sets `concurrency_limit` (`None` clears it) and returns `self` for chaining.
    pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
        self.concurrency_limit = limit;
        self
    }
1505
/// Store the optional [`AuthChallengeResponse`] used by the Chrome integration.
#[cfg(feature = "chrome")]
pub fn with_auth_challenge_response(
    &mut self,
    auth_challenge_response: Option<AuthChallengeResponse>,
) -> &mut Self {
    self.auth_challenge_response = auth_challenge_response;
    self
}
1515
/// Store the optional script text kept in `evaluate_on_new_document`.
#[cfg(feature = "chrome")]
pub fn with_evaluate_on_new_document(
    &mut self,
    evaluate_on_new_document: Option<Box<String>>,
) -> &mut Self {
    self.evaluate_on_new_document = evaluate_on_new_document;
    self
}
1525
1526 #[cfg(not(feature = "chrome"))]
1527 pub fn with_evaluate_on_new_document(
1529 &mut self,
1530 _evaluate_on_new_document: Option<Box<String>>,
1531 ) -> &mut Self {
1532 self
1533 }
1534
1535 #[cfg(not(feature = "chrome"))]
1536 pub fn with_auth_challenge_response(
1538 &mut self,
1539 _auth_challenge_response: Option<AuthChallengeResponse>,
1540 ) -> &mut Self {
1541 self
1542 }
1543
1544 pub fn with_depth(&mut self, depth: usize) -> &mut Self {
1546 self.depth = depth;
1547 self
1548 }
1549
/// Turn caching on or off (available with `cache_request` or `chrome_remote_cache`).
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
pub fn with_caching(&mut self, cache: bool) -> &mut Self {
    self.cache = cache;
    self
}
1556
1557 #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1558 pub fn with_caching(&mut self, _cache: bool) -> &mut Self {
1560 self
1561 }
1562
/// Store the `cache_skip_browser` flag.
///
/// `get_cache_options` reads this flag to choose between the `SkipBrowser*`
/// and the plain cache option variants.
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
pub fn with_cache_skip_browser(&mut self, skip: bool) -> &mut Self {
    self.cache_skip_browser = skip;
    self
}
1571
1572 #[cfg(not(any(feature = "cache_request", feature = "chrome_remote_cache")))]
1573 pub fn with_cache_skip_browser(&mut self, _skip: bool) -> &mut Self {
1576 self
1577 }
1578
1579 pub fn with_cache_namespace<S: Into<String>>(&mut self, namespace: Option<S>) -> &mut Self {
1586 self.cache_namespace = namespace.map(|s| Box::new(s.into()));
1587 self
1588 }
1589
1590 #[inline]
1594 #[allow(dead_code)]
1595 pub(crate) fn cache_namespace_str(&self) -> Option<&str> {
1596 self.cache_namespace.as_ref().map(|s| s.as_str())
1597 }
1598
/// Store the read-only flag for the Chrome remote cache.
#[cfg(feature = "chrome_remote_cache")]
pub fn with_chrome_remote_cache_read_only(&mut self, read_only: bool) -> &mut Self {
    self.chrome_remote_cache_read_only = read_only;
    self
}
1608
1609 #[cfg(not(feature = "chrome_remote_cache"))]
1612 pub fn with_chrome_remote_cache_read_only(&mut self, _read_only: bool) -> &mut Self {
1613 self
1614 }
1615
1616 #[inline]
1621 #[allow(dead_code)]
1622 pub(crate) fn chrome_remote_cache_read_only_enabled(&self) -> bool {
1623 #[cfg(feature = "chrome_remote_cache")]
1624 {
1625 self.chrome_remote_cache_read_only
1626 }
1627 #[cfg(not(feature = "chrome_remote_cache"))]
1628 {
1629 false
1630 }
1631 }
1632
/// Store the remote-cache skip-browser flag and forward it to `spider_remote_cache`.
///
/// The same value is passed to `set_skip_browser_dumps_enabled` and
/// `set_spool_enabled`.
/// NOTE(review): those setters look process-wide rather than per-configuration — confirm.
#[cfg(feature = "chrome_remote_cache")]
pub fn with_remote_cache_skip_browser(&mut self, enabled: bool) -> &mut Self {
    self.remote_cache_skip_browser = enabled;
    spider_remote_cache::set_skip_browser_dumps_enabled(enabled);
    spider_remote_cache::set_spool_enabled(enabled);
    self
}
1647
1648 #[cfg(not(feature = "chrome_remote_cache"))]
1652 pub fn with_remote_cache_skip_browser(&mut self, _enabled: bool) -> &mut Self {
1653 self
1654 }
1655
1656 #[inline]
1663 #[allow(dead_code)]
1664 pub(crate) fn remote_cache_skip_browser_enabled(&self) -> bool {
1665 #[cfg(feature = "chrome_remote_cache")]
1666 {
1667 self.remote_cache_skip_browser
1668 }
1669 #[cfg(not(feature = "chrome_remote_cache"))]
1670 {
1671 false
1672 }
1673 }
1674
/// Store the main-document-only flag for the Chrome remote cache.
#[cfg(feature = "chrome_remote_cache")]
pub fn with_chrome_remote_cache_main_doc_only(&mut self, enabled: bool) -> &mut Self {
    self.chrome_remote_cache_main_doc_only = enabled;
    self
}
1688
1689 #[cfg(not(feature = "chrome_remote_cache"))]
1693 pub fn with_chrome_remote_cache_main_doc_only(&mut self, _enabled: bool) -> &mut Self {
1694 self
1695 }
1696
1697 #[inline]
1702 #[allow(dead_code)]
1703 pub(crate) fn chrome_remote_cache_main_doc_only_enabled(&self) -> bool {
1704 #[cfg(feature = "chrome_remote_cache")]
1705 {
1706 self.chrome_remote_cache_main_doc_only
1707 }
1708 #[cfg(not(feature = "chrome_remote_cache"))]
1709 {
1710 false
1711 }
1712 }
1713
/// Store the `service_worker_enabled` flag.
#[cfg(feature = "chrome")]
pub fn with_service_worker_enabled(&mut self, enabled: bool) -> &mut Self {
    self.service_worker_enabled = enabled;
    self
}
1720
1721 #[cfg(not(feature = "chrome"))]
1722 pub fn with_service_worker_enabled(&mut self, _enabled: bool) -> &mut Self {
1724 self
1725 }
1726
1727 #[cfg(not(feature = "chrome"))]
1729 pub fn with_auto_geolocation(&mut self, _enabled: bool) -> &mut Self {
1730 self
1731 }
1732
/// Store the `auto_geolocation` flag.
#[cfg(feature = "chrome")]
pub fn with_auto_geolocation(&mut self, enabled: bool) -> &mut Self {
    self.auto_geolocation = enabled;
    self
}
1739
1740 pub fn with_retry(&mut self, retry: u8) -> &mut Self {
1742 self.retry = retry;
1743 self
1744 }
1745
1746 pub fn with_default_http_connect_timeout(
1748 &mut self,
1749 default_http_connect_timeout: Option<Duration>,
1750 ) -> &mut Self {
1751 self.default_http_connect_timeout = default_http_connect_timeout;
1752 self
1753 }
1754
1755 pub fn with_default_http_read_timeout(
1757 &mut self,
1758 default_http_read_timeout: Option<Duration>,
1759 ) -> &mut Self {
1760 self.default_http_read_timeout = default_http_read_timeout;
1761 self
1762 }
1763
1764 pub fn with_no_control_thread(&mut self, no_control_thread: bool) -> &mut Self {
1766 self.no_control_thread = no_control_thread;
1767 self
1768 }
1769
1770 pub fn with_viewport(&mut self, viewport: Option<crate::configuration::Viewport>) -> &mut Self {
1772 self.viewport = viewport.map(|vp| vp);
1773 self
1774 }
1775
/// Toggle stealth mode with a boolean.
///
/// `true` selects `Tier::Basic` and `false` selects `Tier::None`; use
/// [`Self::with_stealth_advanced`] to pick any other tier.
#[cfg(feature = "chrome")]
pub fn with_stealth(&mut self, stealth_mode: bool) -> &mut Self {
    use spider_fingerprint::configs::Tier;

    self.stealth_mode = if stealth_mode {
        Tier::Basic
    } else {
        Tier::None
    };
    self
}
1786
/// Store an explicit stealth `Tier`.
#[cfg(feature = "chrome")]
pub fn with_stealth_advanced(
    &mut self,
    stealth_mode: spider_fingerprint::configs::Tier,
) -> &mut Self {
    self.stealth_mode = stealth_mode;
    self
}
1796
1797 #[cfg(not(feature = "chrome"))]
1798 pub fn with_stealth(&mut self, _stealth_mode: bool) -> &mut Self {
1800 self
1801 }
1802
/// Set the `idle_network` entry of the wait-for settings.
///
/// A default [`WaitFor`] is created first when none is stored yet; the other
/// wait-for entries are left untouched.
#[cfg(feature = "chrome")]
pub fn with_wait_for_idle_network(
    &mut self,
    wait_for_idle_network: Option<WaitForIdleNetwork>,
) -> &mut Self {
    self.wait_for
        .get_or_insert_with(WaitFor::default)
        .idle_network = wait_for_idle_network;
    self
}
1819
/// Set the `idle_network0` entry of the wait-for settings.
///
/// A default [`WaitFor`] is created first when none is stored yet; the other
/// wait-for entries are left untouched.
#[cfg(feature = "chrome")]
pub fn with_wait_for_idle_network0(
    &mut self,
    wait_for_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self {
    self.wait_for
        .get_or_insert_with(WaitFor::default)
        .idle_network0 = wait_for_idle_network0;
    self
}
1836
/// Set the `almost_idle_network0` entry of the wait-for settings.
///
/// A default [`WaitFor`] is created first when none is stored yet; the other
/// wait-for entries are left untouched.
#[cfg(feature = "chrome")]
pub fn with_wait_for_almost_idle_network0(
    &mut self,
    wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
) -> &mut Self {
    self.wait_for
        .get_or_insert_with(WaitFor::default)
        .almost_idle_network0 = wait_for_almost_idle_network0;
    self
}
1853
1854 #[cfg(not(feature = "chrome"))]
1855 pub fn with_wait_for_almost_idle_network0(
1857 &mut self,
1858 _wait_for_almost_idle_network0: Option<WaitForIdleNetwork>,
1859 ) -> &mut Self {
1860 self
1861 }
1862
1863 #[cfg(not(feature = "chrome"))]
1864 pub fn with_wait_for_idle_network0(
1866 &mut self,
1867 _wait_for_idle_network0: Option<WaitForIdleNetwork>,
1868 ) -> &mut Self {
1869 self
1870 }
1871
1872 #[cfg(not(feature = "chrome"))]
1873 pub fn with_wait_for_idle_network(
1875 &mut self,
1876 _wait_for_idle_network: Option<WaitForIdleNetwork>,
1877 ) -> &mut Self {
1878 self
1879 }
1880
/// Set the `dom` entry of the wait-for settings.
///
/// A default [`WaitFor`] is created first when none is stored yet; the other
/// wait-for entries are left untouched.
#[cfg(feature = "chrome")]
pub fn with_wait_for_idle_dom(
    &mut self,
    wait_for_idle_dom: Option<WaitForSelector>,
) -> &mut Self {
    self.wait_for.get_or_insert_with(WaitFor::default).dom = wait_for_idle_dom;
    self
}
1897
1898 #[cfg(not(feature = "chrome"))]
1899 pub fn with_wait_for_idle_dom(
1901 &mut self,
1902 _wait_for_idle_dom: Option<WaitForSelector>,
1903 ) -> &mut Self {
1904 self
1905 }
1906
/// Set the `selector` entry of the wait-for settings.
///
/// A default [`WaitFor`] is created first when none is stored yet; the other
/// wait-for entries are left untouched.
#[cfg(feature = "chrome")]
pub fn with_wait_for_selector(
    &mut self,
    wait_for_selector: Option<WaitForSelector>,
) -> &mut Self {
    self.wait_for.get_or_insert_with(WaitFor::default).selector = wait_for_selector;
    self
}
1923
1924 #[cfg(not(feature = "chrome"))]
1925 pub fn with_wait_for_selector(
1927 &mut self,
1928 _wait_for_selector: Option<WaitForSelector>,
1929 ) -> &mut Self {
1930 self
1931 }
1932
/// Set the `delay` entry of the wait-for settings.
///
/// A default [`WaitFor`] is created first when none is stored yet; the other
/// wait-for entries are left untouched.
#[cfg(feature = "chrome")]
pub fn with_wait_for_delay(&mut self, wait_for_delay: Option<WaitForDelay>) -> &mut Self {
    self.wait_for.get_or_insert_with(WaitFor::default).delay = wait_for_delay;
    self
}
1946
1947 #[cfg(not(feature = "chrome"))]
1948 pub fn with_wait_for_delay(&mut self, _wait_for_delay: Option<WaitForDelay>) -> &mut Self {
1950 self
1951 }
1952
/// Replace the request-intercept configuration and prepare it for `url`.
///
/// After storing `chrome_intercept`, its `setup_intercept_manager` is called
/// with the given `url`.
#[cfg(feature = "chrome_intercept")]
pub fn with_chrome_intercept(
    &mut self,
    chrome_intercept: RequestInterceptConfiguration,
    url: &Option<Box<url::Url>>,
) -> &mut Self {
    self.chrome_intercept = chrome_intercept;
    self.chrome_intercept.setup_intercept_manager(url);
    self
}
1964
1965 #[cfg(not(feature = "chrome_intercept"))]
1966 pub fn with_chrome_intercept(
1968 &mut self,
1969 _chrome_intercept: RequestInterceptConfiguration,
1970 _url: &Option<Box<url::Url>>,
1971 ) -> &mut Self {
1972 self
1973 }
1974
/// Toggle the remote/local policy on the intercept configuration.
///
/// Enabling the policy also switches interception on (`enabled = true`);
/// disabling the policy leaves the interception switch as it was.
#[cfg(feature = "chrome_intercept")]
pub fn with_remote_local_policy(&mut self, enabled: bool) -> &mut Self {
    let intercept = &mut self.chrome_intercept;
    if enabled {
        intercept.enabled = true;
    }
    intercept.set_remote_local_policy(enabled);
    self
}
1989
1990 #[cfg(not(feature = "chrome_intercept"))]
1991 pub fn with_remote_local_policy(&mut self, _enabled: bool) -> &mut Self {
1994 self
1995 }
1996
/// Store the optional single Chrome connection URL.
#[cfg(feature = "chrome")]
pub fn with_chrome_connection(&mut self, chrome_connection_url: Option<String>) -> &mut Self {
    self.chrome_connection_url = chrome_connection_url;
    self
}
2003
2004 #[cfg(not(feature = "chrome"))]
2005 pub fn with_chrome_connection(&mut self, _chrome_connection_url: Option<String>) -> &mut Self {
2007 self
2008 }
2009
/// Configure one or more Chrome connection URLs.
///
/// * two or more URLs: stored as the URL list; the single-URL field is untouched.
/// * exactly one URL: stored in the single-URL field and the list is cleared.
/// * no URLs: the list is cleared; the single-URL field is untouched.
///
/// In every case the failover state is reset to its default.
#[cfg(feature = "chrome")]
pub fn with_chrome_connections(&mut self, urls: Vec<String>) -> &mut Self {
    if urls.len() > 1 {
        self.chrome_connection_urls = Some(urls);
    } else {
        // Zero or one entry: only overwrite the single URL when one was given.
        if let Some(only) = urls.into_iter().next() {
            self.chrome_connection_url = Some(only);
        }
        self.chrome_connection_urls = None;
    }
    self.chrome_failover = crate::features::chrome::LazyChromeFailover::default();
    self
}
2037
2038 #[cfg(not(feature = "chrome"))]
2039 pub fn with_chrome_connections(&mut self, _urls: Vec<String>) -> &mut Self {
2041 self
2042 }
2043
/// Configure a single worker connection URL.
///
/// The URL is trimmed. `Some` with a blank string stores an empty list,
/// `Some` with content stores a one-element list, and `None` stores `None`.
#[cfg(feature = "decentralized")]
pub fn with_worker_connection(&mut self, worker_connection_url: Option<String>) -> &mut Self {
    self.worker_connection_urls = worker_connection_url.map(|raw| match raw.trim() {
        "" => Vec::new(),
        trimmed => vec![trimmed.to_string()],
    });
    self
}
2062
2063 #[cfg(not(feature = "decentralized"))]
2064 pub fn with_worker_connection(&mut self, _worker_connection_url: Option<String>) -> &mut Self {
2067 self
2068 }
2069
/// Configure the list of worker connection URLs.
///
/// Each URL is trimmed and blank entries are dropped; the result is always
/// stored as `Some(list)`, even when the list ends up empty.
#[cfg(feature = "decentralized")]
pub fn with_worker_connections(&mut self, urls: Vec<String>) -> &mut Self {
    let cleaned: Vec<String> = urls
        .into_iter()
        .filter_map(|raw| {
            let trimmed = raw.trim();
            (!trimmed.is_empty()).then(|| trimmed.to_string())
        })
        .collect();
    self.worker_connection_urls = Some(cleaned);
    self
}
2083
2084 #[cfg(not(feature = "decentralized"))]
2085 pub fn with_worker_connections(&mut self, _urls: Vec<String>) -> &mut Self {
2088 self
2089 }
2090
/// Configure a single scraper-worker connection URL.
///
/// The URL is trimmed. `Some` with a blank string stores an empty list,
/// `Some` with content stores a one-element list, and `None` stores `None`.
#[cfg(feature = "decentralized")]
pub fn with_scraper_worker_connection(
    &mut self,
    scraper_worker_connection_url: Option<String>,
) -> &mut Self {
    self.scraper_worker_connection_urls =
        scraper_worker_connection_url.map(|raw| match raw.trim() {
            "" => Vec::new(),
            trimmed => vec![trimmed.to_string()],
        });
    self
}
2111
2112 #[cfg(not(feature = "decentralized"))]
2113 pub fn with_scraper_worker_connection(
2116 &mut self,
2117 _scraper_worker_connection_url: Option<String>,
2118 ) -> &mut Self {
2119 self
2120 }
2121
/// Configure the list of scraper-worker connection URLs.
///
/// Each URL is trimmed and blank entries are dropped; the result is always
/// stored as `Some(list)`, even when the list ends up empty.
#[cfg(feature = "decentralized")]
pub fn with_scraper_worker_connections(&mut self, urls: Vec<String>) -> &mut Self {
    let cleaned: Vec<String> = urls
        .into_iter()
        .filter_map(|raw| {
            let trimmed = raw.trim();
            (!trimmed.is_empty()).then(|| trimmed.to_string())
        })
        .collect();
    self.scraper_worker_connection_urls = Some(cleaned);
    self
}
2135
2136 #[cfg(not(feature = "decentralized"))]
2137 pub fn with_scraper_worker_connections(&mut self, _urls: Vec<String>) -> &mut Self {
2140 self
2141 }
2142
/// Store the optional Chrome first-byte timeout.
#[cfg(feature = "chrome")]
pub fn with_chrome_first_byte_timeout(&mut self, timeout: Option<Duration>) -> &mut Self {
    self.chrome_first_byte_timeout = timeout;
    self
}
2152
2153 #[cfg(not(feature = "chrome"))]
2154 pub fn with_chrome_first_byte_timeout(&mut self, _timeout: Option<Duration>) -> &mut Self {
2156 self
2157 }
2158
/// Store the optional jitter that accompanies the Chrome first-byte timeout.
#[cfg(feature = "chrome")]
pub fn with_chrome_first_byte_timeout_jitter(&mut self, jitter: Option<Duration>) -> &mut Self {
    self.chrome_first_byte_timeout_jitter = jitter;
    self
}
2167
2168 #[cfg(not(feature = "chrome"))]
2169 pub fn with_chrome_first_byte_timeout_jitter(
2171 &mut self,
2172 _jitter: Option<Duration>,
2173 ) -> &mut Self {
2174 self
2175 }
2176
2177 pub fn with_http_first_byte_timeout(&mut self, timeout: Option<Duration>) -> &mut Self {
2186 self.http_first_byte_timeout = timeout;
2187 self
2188 }
2189
2190 pub fn with_http_first_byte_timeout_jitter(&mut self, jitter: Option<Duration>) -> &mut Self {
2194 self.http_first_byte_timeout_jitter = jitter;
2195 self
2196 }
2197
2198 #[cfg(not(feature = "chrome"))]
2199 pub fn with_execution_scripts(
2201 &mut self,
2202 _execution_scripts: Option<ExecutionScriptsMap>,
2203 ) -> &mut Self {
2204 self
2205 }
2206
/// Store the execution scripts.
///
/// The map is passed through `convert_to_trie_execution_scripts` and the
/// converted result is what gets stored.
#[cfg(feature = "chrome")]
pub fn with_execution_scripts(
    &mut self,
    execution_scripts: Option<ExecutionScriptsMap>,
) -> &mut Self {
    use crate::features::chrome_common::convert_to_trie_execution_scripts;

    self.execution_scripts = convert_to_trie_execution_scripts(&execution_scripts);
    self
}
2217
2218 #[cfg(not(feature = "chrome"))]
2219 pub fn with_automation_scripts(
2221 &mut self,
2222 _automation_scripts: Option<AutomationScriptsMap>,
2223 ) -> &mut Self {
2224 self
2225 }
2226
/// Store the automation scripts.
///
/// The map is passed through `convert_to_trie_automation_scripts` and the
/// converted result is what gets stored.
#[cfg(feature = "chrome")]
pub fn with_automation_scripts(
    &mut self,
    automation_scripts: Option<AutomationScriptsMap>,
) -> &mut Self {
    use crate::features::chrome_common::convert_to_trie_automation_scripts;

    self.automation_scripts = convert_to_trie_automation_scripts(&automation_scripts);
    self
}
2237
2238 pub fn with_budget(&mut self, budget: Option<hashbrown::HashMap<&str, u32>>) -> &mut Self {
2240 self.budget = match budget {
2241 Some(budget) => {
2242 let mut crawl_budget: hashbrown::HashMap<
2243 case_insensitive_string::CaseInsensitiveString,
2244 u32,
2245 > = hashbrown::HashMap::new();
2246
2247 for b in budget.into_iter() {
2248 crawl_budget.insert(
2249 case_insensitive_string::CaseInsensitiveString::from(b.0),
2250 b.1,
2251 );
2252 }
2253
2254 Some(crawl_budget)
2255 }
2256 _ => None,
2257 };
2258 self
2259 }
2260
2261 pub fn with_external_domains<'a, 'b>(
2263 &mut self,
2264 external_domains: Option<impl Iterator<Item = String> + 'a>,
2265 ) -> &mut Self {
2266 match external_domains {
2267 Some(external_domains) => {
2268 self.external_domains_caseless = external_domains
2269 .into_iter()
2270 .filter_map(|d| {
2271 if d == "*" {
2272 Some("*".into())
2273 } else {
2274 let host = get_domain_from_url(&d);
2275
2276 if !host.is_empty() {
2277 Some(host.into())
2278 } else {
2279 None
2280 }
2281 }
2282 })
2283 .collect::<hashbrown::HashSet<case_insensitive_string::CaseInsensitiveString>>()
2284 .into();
2285 }
2286 _ => self.external_domains_caseless = Default::default(),
2287 }
2288
2289 self
2290 }
2291
2292 pub fn with_danger_accept_invalid_certs(&mut self, accept_invalid_certs: bool) -> &mut Self {
2294 self.accept_invalid_certs = accept_invalid_certs;
2295 self
2296 }
2297
2298 pub fn with_normalize(&mut self, normalize: bool) -> &mut Self {
2300 self.normalize = normalize;
2301 self
2302 }
2303
2304 #[cfg(not(feature = "disk"))]
2305 pub fn with_shared_state(&mut self, _shared: bool) -> &mut Self {
2307 self
2308 }
2309
/// Store the `shared` flag (only with the `disk` feature).
#[cfg(feature = "disk")]
pub fn with_shared_state(&mut self, shared: bool) -> &mut Self {
    self.shared = shared;
    self
}
2316
2317 #[cfg(not(feature = "chrome"))]
2318 pub fn with_timezone_id(&mut self, _timezone_id: Option<String>) -> &mut Self {
2320 self
2321 }
2322
/// Store the optional timezone id, converting it into the field's own type via `Into`.
#[cfg(feature = "chrome")]
pub fn with_timezone_id(&mut self, timezone_id: Option<String>) -> &mut Self {
    self.timezone_id = timezone_id.map(Into::into);
    self
}
2329
2330 #[cfg(not(feature = "chrome"))]
2331 pub fn with_locale(&mut self, _locale: Option<String>) -> &mut Self {
2333 self
2334 }
2335
/// Store the optional locale, converting it into the field's own type via `Into`.
#[cfg(feature = "chrome")]
pub fn with_locale(&mut self, locale: Option<String>) -> &mut Self {
    self.locale = locale.map(Into::into);
    self
}
2342
/// Store the optional `ChromeEventTracker`.
#[cfg(feature = "chrome")]
pub fn with_event_tracker(&mut self, track_events: Option<ChromeEventTracker>) -> &mut Self {
    self.track_events = track_events;
    self
}
2349
2350 #[cfg(not(feature = "chrome"))]
2352 pub fn with_screenshot(&mut self, _screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
2353 self
2354 }
2355
/// Store the optional [`ScreenShotConfig`].
#[cfg(feature = "chrome")]
pub fn with_screenshot(&mut self, screenshot_config: Option<ScreenShotConfig>) -> &mut Self {
    self.screenshot = screenshot_config;
    self
}
2362
2363 pub fn with_max_page_bytes(&mut self, max_page_bytes: Option<f64>) -> &mut Self {
2365 self.max_page_bytes = max_page_bytes;
2366 self
2367 }
2368
2369 pub fn with_max_bytes_allowed(&mut self, max_bytes_allowed: Option<u64>) -> &mut Self {
2371 self.max_bytes_allowed = max_bytes_allowed;
2372 self
2373 }
2374
2375 pub fn with_block_assets(&mut self, only_html: bool) -> &mut Self {
2377 self.only_html = only_html;
2378 self
2379 }
2380
2381 pub fn with_modify_headers(&mut self, modify_headers: bool) -> &mut Self {
2383 self.modify_headers = modify_headers;
2384 self
2385 }
2386
2387 pub fn with_modify_http_client_headers(
2389 &mut self,
2390 modify_http_client_headers: bool,
2391 ) -> &mut Self {
2392 self.modify_http_client_headers = modify_http_client_headers;
2393 self
2394 }
2395
2396 pub fn with_cache_policy(&mut self, cache_policy: Option<BasicCachePolicy>) -> &mut Self {
2398 self.cache_policy = cache_policy;
2399 self
2400 }
2401
/// Store the optional [`WebDriverConfig`], boxing it for storage.
#[cfg(feature = "webdriver")]
pub fn with_webdriver_config(
    &mut self,
    webdriver_config: Option<WebDriverConfig>,
) -> &mut Self {
    self.webdriver_config = webdriver_config.map(Box::new);
    self
}
2411
2412 #[cfg(not(feature = "webdriver"))]
2413 pub fn with_webdriver_config(
2415 &mut self,
2416 _webdriver_config: Option<WebDriverConfig>,
2417 ) -> &mut Self {
2418 self
2419 }
2420
2421 #[inline]
2442 pub fn auto_http_first_byte_args(&self) -> (Option<Duration>, Option<Duration>) {
2443 match self.http_first_byte_timeout {
2444 Some(_) => (
2445 self.http_first_byte_timeout,
2446 self.http_first_byte_timeout_jitter,
2447 ),
2448 None => (None, None),
2449 }
2450 }
2451
/// Build a `ChromeFetchParams` view that borrows the Chrome-related settings.
///
/// The endpoint URL prefers the failover state's last connected URL and falls
/// back to the configured `chrome_connection_url`. `browser_dead` is always
/// `None` here.
#[cfg(feature = "chrome")]
#[inline]
pub fn chrome_fetch_params(&self) -> crate::utils::ChromeFetchParams<'_> {
    let endpoint_url = self
        .chrome_failover
        .last_connected_url()
        .or(self.chrome_connection_url.as_deref());

    crate::utils::ChromeFetchParams {
        wait_for: &self.wait_for,
        screenshot: &self.screenshot,
        openai_config: &self.openai_config,
        execution_scripts: &self.execution_scripts,
        automation_scripts: &self.automation_scripts,
        viewport: &self.viewport,
        request_timeout: &self.request_timeout,
        track_events: &self.track_events,
        cache_policy: &self.cache_policy,
        remote_multimodal: &self.remote_multimodal,
        remote_cache_read_only: self.chrome_remote_cache_read_only_enabled(),
        remote_cache_main_doc_only: self.chrome_remote_cache_main_doc_only_enabled(),
        first_byte_timeout: &self.chrome_first_byte_timeout,
        first_byte_timeout_jitter: &self.chrome_first_byte_timeout_jitter,
        browser_dead: None,
        chrome_failover: Some(&self.chrome_failover),
        chrome_endpoint_url: endpoint_url,
    }
}
2489
/// Derive the `CacheOptions` for a request from the current configuration.
///
/// Returns `None` when caching is disabled. Otherwise the variant depends on
/// two inputs:
/// * whether a non-empty `authorization` header that is valid as a string is
///   configured (selects the `*Authorized` variants, carrying the header text);
/// * whether the browser should be skipped — always `true` with the
///   `cache_mem` feature, else the stored `cache_skip_browser` flag.
#[cfg(any(feature = "cache_request", feature = "chrome_remote_cache"))]
pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
    use crate::utils::CacheOptions;

    if !self.cache {
        return None;
    }

    #[cfg(feature = "cache_mem")]
    let skip_browser = true;
    #[cfg(not(feature = "cache_mem"))]
    let skip_browser = self.cache_skip_browser;

    // A missing, empty or non-textual header all count as "no authorization".
    let authorization = self
        .headers
        .as_ref()
        .and_then(|headers| {
            headers
                .0
                .get("authorization")
                .or_else(|| headers.0.get("Authorization"))
        })
        .filter(|value| !value.is_empty())
        .and_then(|value| value.to_str().ok());

    Some(match (authorization, skip_browser) {
        (Some(token), true) => CacheOptions::SkipBrowserAuthorized(token.into()),
        (Some(token), false) => CacheOptions::Authorized(token.into()),
        (None, true) => CacheOptions::SkipBrowser,
        (None, false) => CacheOptions::Yes,
    })
}
2540
/// Always `None`: built with `chrome` but without any cache feature.
#[cfg(all(
    feature = "chrome",
    not(any(feature = "cache_request", feature = "chrome_remote_cache"))
))]
pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
    None
}
2549
2550 #[cfg(not(any(
2552 feature = "cache_request",
2553 feature = "chrome_remote_cache",
2554 feature = "chrome"
2555 )))]
2556 #[allow(dead_code)]
2557 pub(crate) fn get_cache_options(&self) -> Option<crate::utils::CacheOptions> {
2558 None
2559 }
2560
2561 pub fn build(&self) -> Self {
2563 self.to_owned()
2564 }
2565
/// Store the optional [`SearchConfig`], boxing it for storage.
#[cfg(feature = "search")]
pub fn with_search_config(&mut self, search_config: Option<SearchConfig>) -> &mut Self {
    self.search_config = search_config.map(Box::new);
    self
}
2572
2573 #[cfg(not(feature = "search"))]
2574 pub fn with_search_config(&mut self, _search_config: Option<()>) -> &mut Self {
2576 self
2577 }
2578
/// Enable Spider Cloud with a default [`SpiderCloudConfig`] built from `api_key`.
///
/// Keys that `is_placeholder_api_key` recognises (blank or template values)
/// are rejected: a warning is logged and the configuration is left unchanged.
#[cfg(feature = "spider_cloud")]
pub fn with_spider_cloud(&mut self, api_key: &str) -> &mut Self {
    if is_placeholder_api_key(api_key) {
        log::warn!("Spider Cloud API key looks like a placeholder — skipping. Get a real key at https://spider.cloud");
    } else {
        self.spider_cloud = Some(Box::new(SpiderCloudConfig::new(api_key)));
    }
    self
}
2589
2590 #[cfg(not(feature = "spider_cloud"))]
2592 pub fn with_spider_cloud(&mut self, _api_key: &str) -> &mut Self {
2593 self
2594 }
2595
/// Store a fully specified [`SpiderCloudConfig`].
///
/// Unlike [`Self::with_spider_cloud`], no placeholder check is applied to the key.
#[cfg(feature = "spider_cloud")]
pub fn with_spider_cloud_config(&mut self, config: SpiderCloudConfig) -> &mut Self {
    self.spider_cloud = Some(Box::new(config));
    self
}
2602
2603 #[cfg(not(feature = "spider_cloud"))]
2605 pub fn with_spider_cloud_config(&mut self, _config: ()) -> &mut Self {
2606 self
2607 }
2608
/// Enable Spider Browser using a `SpiderBrowserConfig` built from `api_key`.
///
/// The config's `connection_url()` becomes the Chrome connection URL and the
/// config itself is stored. Placeholder keys (per `is_placeholder_api_key`)
/// are rejected with a logged warning and no changes.
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
pub fn with_spider_browser(&mut self, api_key: &str) -> &mut Self {
    if is_placeholder_api_key(api_key) {
        log::warn!("Spider Browser Cloud API key looks like a placeholder — skipping. Get a real key at https://spider.cloud");
    } else {
        let browser_cfg = SpiderBrowserConfig::new(api_key);
        self.chrome_connection_url = Some(browser_cfg.connection_url());
        self.spider_browser = Some(Box::new(browser_cfg));
    }
    self
}
2624
2625 #[cfg(not(all(feature = "spider_cloud", feature = "chrome")))]
2627 pub fn with_spider_browser(&mut self, _api_key: &str) -> &mut Self {
2628 self
2629 }
2630
/// Store a fully specified `SpiderBrowserConfig`.
///
/// Its `connection_url()` also becomes the Chrome connection URL. No
/// placeholder check is applied to the key.
#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
pub fn with_spider_browser_config(&mut self, config: SpiderBrowserConfig) -> &mut Self {
    let endpoint = config.connection_url();
    self.chrome_connection_url = Some(endpoint);
    self.spider_browser = Some(Box::new(config));
    self
}
2639
2640 #[cfg(not(all(feature = "spider_cloud", feature = "chrome")))]
2642 pub fn with_spider_browser_config(&mut self, _config: ()) -> &mut Self {
2643 self
2644 }
2645
/// Store a `HedgeConfig` (wrapped in `Some`).
#[cfg(feature = "hedge")]
pub fn with_hedge(&mut self, config: crate::utils::hedge::HedgeConfig) -> &mut Self {
    self.hedge = Some(config);
    self
}
2652
2653 #[cfg(not(feature = "hedge"))]
2655 pub fn with_hedge(&mut self, _config: ()) -> &mut Self {
2656 self
2657 }
2658
/// Store an `AutoThrottleConfig` (wrapped in `Some`).
#[cfg(feature = "auto_throttle")]
pub fn with_auto_throttle(
    &mut self,
    config: crate::utils::auto_throttle::AutoThrottleConfig,
) -> &mut Self {
    self.auto_throttle = Some(config);
    self
}
2668
2669 #[cfg(not(feature = "auto_throttle"))]
2671 pub fn with_auto_throttle(&mut self, _config: ()) -> &mut Self {
2672 self
2673 }
2674
/// Store the `etag_cache` flag.
#[cfg(feature = "etag_cache")]
pub fn with_etag_cache(&mut self, enabled: bool) -> &mut Self {
    self.etag_cache = enabled;
    self
}
2681
2682 #[cfg(not(feature = "etag_cache"))]
2684 pub fn with_etag_cache(&mut self, _enabled: bool) -> &mut Self {
2685 self
2686 }
2687
/// Store a `WarcConfig` (wrapped in `Some`).
#[cfg(feature = "warc")]
pub fn with_warc(&mut self, config: crate::utils::warc::WarcConfig) -> &mut Self {
    self.warc = Some(config);
    self
}
2694
2695 #[cfg(not(feature = "warc"))]
2697 pub fn with_warc(&mut self, _config: ()) -> &mut Self {
2698 self
2699 }
2700}
2701
/// Settings for the search integration (only with the `search` feature).
#[cfg(feature = "search")]
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SearchConfig {
    /// Which search provider to use.
    pub provider: SearchProviderType,
    /// API key for the provider; may be empty when only `api_url` is set.
    pub api_key: String,
    /// Optional API URL; `None` unless set explicitly.
    pub api_url: Option<String>,
    /// Optional [`SearchOptions`] stored as defaults.
    pub default_options: Option<SearchOptions>,
}
2716
#[cfg(feature = "search")]
impl SearchConfig {
    /// Create a configuration for `provider` with the given API key.
    ///
    /// `api_url` and `default_options` start out as `None`.
    pub fn new(provider: SearchProviderType, api_key: impl Into<String>) -> Self {
        let api_key: String = api_key.into();
        Self {
            provider,
            api_key,
            api_url: None,
            default_options: None,
        }
    }

    /// Return the configuration with `api_url` set to `url`.
    pub fn with_api_url(self, url: impl Into<String>) -> Self {
        Self {
            api_url: Some(url.into()),
            ..self
        }
    }

    /// Return the configuration with `default_options` set to `options`.
    pub fn with_default_options(self, options: SearchOptions) -> Self {
        Self {
            default_options: Some(options),
            ..self
        }
    }

    /// Whether the configuration is usable: true when an API URL is set or
    /// the API key is non-empty.
    pub fn is_enabled(&self) -> bool {
        self.api_url.is_some() || !self.api_key.is_empty()
    }
}
2748
/// The search providers that a [`SearchConfig`] can select.
#[cfg(feature = "search")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SearchProviderType {
    /// Serper (the default provider).
    #[default]
    Serper,
    /// Brave.
    Brave,
    /// Bing.
    Bing,
    /// Tavily.
    Tavily,
}
2764
/// How the Spider Cloud integration is used.
///
/// The mode drives `SpiderCloudConfig::should_fallback`: `Proxy`, `Api` and
/// `Unblocker` never ask for a fallback, while `Fallback` and `Smart` do so
/// based on the response status (and, for `Smart`, also on the body).
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudMode {
    /// Proxy mode (the default).
    #[default]
    Proxy,
    /// API mode.
    Api,
    /// Unblocker mode.
    Unblocker,
    /// Fallback mode: fallback is decided from the response status code.
    Fallback,
    /// Smart mode: like `Fallback`, plus inspection of the response body.
    Smart,
}
2792
/// Content format requested from Spider Cloud.
///
/// With `serde`, each variant is (de)serialized under its lowercase name.
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum SpiderCloudReturnFormat {
    /// `raw` (the default).
    #[default]
    #[cfg_attr(feature = "serde", serde(rename = "raw"))]
    Raw,
    /// `markdown`.
    #[cfg_attr(feature = "serde", serde(rename = "markdown"))]
    Markdown,
    /// `commonmark`.
    #[cfg_attr(feature = "serde", serde(rename = "commonmark"))]
    CommonMark,
    /// `text`.
    #[cfg_attr(feature = "serde", serde(rename = "text"))]
    Text,
    /// `bytes`.
    #[cfg_attr(feature = "serde", serde(rename = "bytes"))]
    Bytes,
}
2815
#[cfg(feature = "spider_cloud")]
impl SpiderCloudReturnFormat {
    /// The lowercase name of the format, matching its serde rename.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Bytes => "bytes",
            Self::Text => "text",
            Self::CommonMark => "commonmark",
            Self::Markdown => "markdown",
            Self::Raw => "raw",
        }
    }
}
2829
#[cfg(feature = "spider_cloud")]
impl std::fmt::Display for SpiderCloudReturnFormat {
    /// Writes the same lowercase name that `as_str` returns.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = self.as_str();
        formatter.write_str(name)
    }
}
2836
#[cfg(feature = "spider_cloud")]
impl From<&str> for SpiderCloudReturnFormat {
    /// Parse a return-format name, ignoring ASCII case.
    ///
    /// `markdown`, `commonmark`, `text` and `bytes` are recognised in any
    /// ASCII casing (for example `"commonMark"`), which includes every
    /// spelling the previous exact-match table accepted. Any other input,
    /// including `"raw"` and unknown names, yields `Raw`.
    fn from(s: &str) -> Self {
        if s.eq_ignore_ascii_case("markdown") {
            Self::Markdown
        } else if s.eq_ignore_ascii_case("commonmark") {
            Self::CommonMark
        } else if s.eq_ignore_ascii_case("text") {
            Self::Text
        } else if s.eq_ignore_ascii_case("bytes") {
            Self::Bytes
        } else {
            Self::Raw
        }
    }
}
2849
/// Settings for the Spider Cloud integration (only with the `spider_cloud` feature).
#[cfg(feature = "spider_cloud")]
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderCloudConfig {
    /// API key for Spider Cloud.
    pub api_key: String,
    /// Integration mode; falls back to the enum default when absent in serde input.
    #[cfg_attr(feature = "serde", serde(default))]
    pub mode: SpiderCloudMode,
    /// API URL; serde fills it from `default_api_url` when absent.
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_api_url")
    )]
    pub api_url: String,
    /// Proxy URL; serde fills it from `default_proxy_url` when absent.
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderCloudConfig::default_proxy_url")
    )]
    pub proxy_url: String,
    /// Single return format; falls back to the enum default when absent.
    #[cfg_attr(feature = "serde", serde(default))]
    pub return_format: SpiderCloudReturnFormat,
    /// Optional list of return formats; omitted from serialized output when `None`.
    #[cfg_attr(
        feature = "serde",
        serde(default, skip_serializing_if = "Option::is_none")
    )]
    pub return_formats: Option<Vec<SpiderCloudReturnFormat>>,
    /// Optional extra JSON parameters; omitted from serialized output when `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<hashbrown::HashMap<String, serde_json::Value>>,
}
2893
#[cfg(feature = "spider_cloud")]
impl Default for SpiderCloudConfig {
    /// An empty API key, the default mode and return format, the default API
    /// and proxy URLs, and no format list or extra parameters.
    fn default() -> Self {
        let api_url = Self::default_api_url();
        let proxy_url = Self::default_proxy_url();
        Self {
            api_key: String::new(),
            mode: SpiderCloudMode::default(),
            api_url,
            proxy_url,
            return_format: SpiderCloudReturnFormat::default(),
            return_formats: None,
            extra_params: None,
        }
    }
}
2908
2909#[cfg(feature = "spider_cloud")]
2910impl SpiderCloudConfig {
2911 pub fn new(api_key: impl Into<String>) -> Self {
2913 Self {
2914 api_key: api_key.into(),
2915 ..Default::default()
2916 }
2917 }
2918
2919 pub fn with_mode(mut self, mode: SpiderCloudMode) -> Self {
2921 self.mode = mode;
2922 self
2923 }
2924
2925 pub fn with_api_url(mut self, url: impl Into<String>) -> Self {
2927 self.api_url = url.into();
2928 self
2929 }
2930
2931 pub fn with_proxy_url(mut self, url: impl Into<String>) -> Self {
2933 self.proxy_url = url.into();
2934 self
2935 }
2936
2937 pub fn with_return_format(mut self, fmt: impl Into<SpiderCloudReturnFormat>) -> Self {
2945 self.return_format = fmt.into();
2946 self
2947 }
2948
2949 pub fn with_return_formats(mut self, formats: Vec<SpiderCloudReturnFormat>) -> Self {
2962 let mut seen = Vec::with_capacity(formats.len());
2964 for f in formats {
2965 if !seen.contains(&f) {
2966 seen.push(f);
2967 }
2968 }
2969 if let Some(first) = seen.first() {
2970 self.return_format = first.clone();
2971 }
2972 self.return_formats = Some(seen);
2973 self
2974 }
2975
2976 pub fn has_multiple_formats(&self) -> bool {
2978 self.return_formats.as_ref().is_some_and(|f| f.len() > 1)
2979 }
2980
2981 pub fn with_extra_params(
2983 mut self,
2984 params: hashbrown::HashMap<String, serde_json::Value>,
2985 ) -> Self {
2986 self.extra_params = Some(params);
2987 self
2988 }
2989
2990 pub fn should_fallback(&self, status_code: u16, body: Option<&[u8]>) -> bool {
3004 match self.mode {
3005 SpiderCloudMode::Api | SpiderCloudMode::Unblocker => false, SpiderCloudMode::Proxy => false, SpiderCloudMode::Fallback | SpiderCloudMode::Smart => {
3008 if matches!(status_code, 403 | 429 | 503 | 520..=530) {
3010 return true;
3011 }
3012 if status_code >= 500 {
3013 return true;
3014 }
3015
3016 if self.mode == SpiderCloudMode::Smart {
3018 if let Some(body) = body {
3019 if body.is_empty() {
3021 return true;
3022 }
3023
3024 let check_len = body.len().min(4096);
3027 let snippet = String::from_utf8_lossy(&body[..check_len]);
3028 let lower = snippet.to_lowercase();
3029
3030 if lower.contains("cf-browser-verification")
3032 || lower.contains("cloudflare") && lower.contains("challenge-platform")
3033 {
3034 return true;
3035 }
3036
3037 if lower.contains("captcha") && lower.contains("challenge")
3039 || lower.contains("please verify you are a human")
3040 || lower.contains("access denied") && lower.contains("automated")
3041 || lower.contains("bot detection")
3042 {
3043 return true;
3044 }
3045
3046 if lower.contains("distil_r_captcha")
3048 || lower.contains("_imperva")
3049 || lower.contains("akamai") && lower.contains("bot manager")
3050 {
3051 return true;
3052 }
3053 }
3054 }
3055
3056 false
3057 }
3058 }
3059 }
3060
3061 pub fn fallback_route(&self) -> &'static str {
3067 match self.mode {
3068 SpiderCloudMode::Smart | SpiderCloudMode::Unblocker => "unblocker",
3069 _ => "crawl",
3070 }
3071 }
3072
3073 pub fn uses_proxy(&self) -> bool {
3075 matches!(
3076 self.mode,
3077 SpiderCloudMode::Proxy | SpiderCloudMode::Fallback | SpiderCloudMode::Smart
3078 )
3079 }
3080
3081 fn default_api_url() -> String {
3082 "https://api.spider.cloud".to_string()
3083 }
3084
3085 fn default_proxy_url() -> String {
3086 "https://proxy.spider.cloud".to_string()
3087 }
3088}
3089
3090#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
/// Configuration for connecting to a remote Spider browser over WebSocket.
///
/// Build it with [`SpiderBrowserConfig::new`] and the `with_*` builder methods, then call
/// [`SpiderBrowserConfig::connection_url`] to obtain the URL with all options appended
/// as query parameters.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpiderBrowserConfig {
    /// API key; `connection_url` sends it as the `token` query parameter.
    pub api_key: String,
    /// Base WebSocket URL. Defaults to `wss://browser.spider.cloud/v1/browser`, both in
    /// `Default` and when the field is missing during deserialization.
    #[cfg_attr(
        feature = "serde",
        serde(default = "SpiderBrowserConfig::default_wss_url")
    )]
    pub wss_url: String,
    /// When `true`, `connection_url` adds `stealth=true`. Defaults to `false`.
    #[cfg_attr(feature = "serde", serde(default))]
    pub stealth: bool,
    /// Optional value for the `browser` query parameter. Skipped on serialization when
    /// `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub browser: Option<String>,
    /// Optional value for the `country` query parameter. Skipped on serialization when
    /// `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub country: Option<String>,
    /// Optional extra `(name, value)` query parameters, appended by `connection_url` in
    /// the order given. Skipped on serialization when `None`.
    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
    pub extra_params: Option<Vec<(String, String)>>,
}
3124
3125#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
3126impl Default for SpiderBrowserConfig {
3127 fn default() -> Self {
3128 Self {
3129 api_key: String::new(),
3130 wss_url: Self::default_wss_url(),
3131 stealth: false,
3132 browser: None,
3133 country: None,
3134 extra_params: None,
3135 }
3136 }
3137}
3138
3139#[cfg(all(feature = "spider_cloud", feature = "chrome"))]
3140impl SpiderBrowserConfig {
3141 pub fn new(api_key: impl Into<String>) -> Self {
3143 Self {
3144 api_key: api_key.into(),
3145 ..Default::default()
3146 }
3147 }
3148
3149 pub fn with_wss_url(mut self, url: impl Into<String>) -> Self {
3151 self.wss_url = url.into();
3152 self
3153 }
3154
3155 pub fn with_stealth(mut self, stealth: bool) -> Self {
3157 self.stealth = stealth;
3158 self
3159 }
3160
3161 pub fn with_browser(mut self, browser: impl Into<String>) -> Self {
3163 self.browser = Some(browser.into());
3164 self
3165 }
3166
3167 pub fn with_country(mut self, country: impl Into<String>) -> Self {
3169 self.country = Some(country.into());
3170 self
3171 }
3172
3173 pub fn with_extra_params(mut self, params: Vec<(String, String)>) -> Self {
3175 self.extra_params = Some(params);
3176 self
3177 }
3178
3179 pub fn connection_url(&self) -> String {
3184 let mut url = self.wss_url.clone();
3185
3186 if url.contains('?') {
3188 url.push('&');
3189 } else {
3190 url.push('?');
3191 }
3192 url.push_str("token=");
3193 url.push_str(&self.api_key);
3194
3195 if self.stealth {
3196 url.push_str("&stealth=true");
3197 }
3198 if let Some(ref browser) = self.browser {
3199 url.push_str("&browser=");
3200 url.push_str(browser);
3201 }
3202 if let Some(ref country) = self.country {
3203 url.push_str("&country=");
3204 url.push_str(country);
3205 }
3206 if let Some(ref extra) = self.extra_params {
3207 for (k, v) in extra {
3208 url.push('&');
3209 url.push_str(k);
3210 url.push('=');
3211 url.push_str(v);
3212 }
3213 }
3214
3215 url
3216 }
3217
3218 fn default_wss_url() -> String {
3219 "wss://browser.spider.cloud/v1/browser".to_string()
3220 }
3221}
3222
3223#[cfg(test)]
mod tests {
    //! Unit tests for the configuration types declared in this module.
    use super::*;

    /// A fresh `Configuration` leaves the checked switches off and the checked optional
    /// settings unset.
    #[test]
    fn test_configuration_defaults() {
        let defaults = Configuration::default();

        // Boolean switches start disabled.
        assert!(!defaults.respect_robots_txt);
        assert!(!defaults.subdomains);
        assert!(!defaults.tld);
        assert!(!defaults.http2_prior_knowledge);

        // No delay and no optional settings.
        assert_eq!(defaults.delay, 0);
        assert!(defaults.user_agent.is_none());
        assert!(defaults.blacklist_url.is_none());
        assert!(defaults.whitelist_url.is_none());
        assert!(defaults.proxies.is_none());
    }

    /// `Loose` is the default redirect policy and all three variants are distinct.
    #[test]
    fn test_redirect_policy_variants() {
        assert_eq!(RedirectPolicy::default(), RedirectPolicy::Loose);
        assert_ne!(RedirectPolicy::Strict, RedirectPolicy::Loose);
        assert_ne!(RedirectPolicy::None, RedirectPolicy::Loose);
        assert_ne!(RedirectPolicy::Strict, RedirectPolicy::None);
    }

    /// `redirect_limit_set` only becomes true after `with_redirect_limit` is called.
    #[test]
    fn test_redirect_limit_is_opt_in_for_chrome_path() {
        let untouched = Configuration::default();
        assert!(
            !untouched.redirect_limit_set,
            "Configuration::default() must not claim the redirect_limit was set"
        );

        let mut configured = Configuration::default();
        configured.with_redirect_limit(3);
        assert!(configured.redirect_limit_set);
        assert_eq!(configured.redirect_limit, 3);
    }

    /// `No` is the default proxy-ignore setting and all three variants are distinct.
    #[test]
    fn test_proxy_ignore_variants() {
        assert_eq!(ProxyIgnore::default(), ProxyIgnore::No);
        assert_ne!(ProxyIgnore::Chrome, ProxyIgnore::No);
        assert_ne!(ProxyIgnore::Http, ProxyIgnore::No);
        assert_ne!(ProxyIgnore::Chrome, ProxyIgnore::Http);
    }

    /// A `RequestProxy` built from a literal keeps the values it was given.
    #[test]
    fn test_request_proxy_construction() {
        let address = "http://proxy.example.com:8080";
        let request_proxy = RequestProxy {
            addr: address.to_string(),
            ignore: ProxyIgnore::No,
        };

        assert_eq!(request_proxy.addr, address);
        assert_eq!(request_proxy.ignore, ProxyIgnore::No);
    }

    /// The default `RequestProxy` has an empty address and ignores nothing.
    #[test]
    fn test_request_proxy_default() {
        let request_proxy = RequestProxy::default();

        assert!(request_proxy.addr.is_empty());
        assert_eq!(request_proxy.ignore, ProxyIgnore::No);
    }

    /// Blacklist entries assigned to the configuration are all retained.
    #[test]
    fn test_configuration_blacklist_setup() {
        let mut crawl_config = Configuration::default();
        crawl_config.blacklist_url = Some(vec![
            "https://example.com/private".into(),
            "https://example.com/admin".into(),
        ]);

        let stored = crawl_config.blacklist_url.as_ref().unwrap();
        assert_eq!(stored.len(), 2);
    }

    /// Whitelist entries assigned to the configuration are all retained.
    #[test]
    fn test_configuration_whitelist_setup() {
        let mut crawl_config = Configuration::default();
        crawl_config.whitelist_url = Some(vec!["https://example.com/public".into()]);

        let stored = crawl_config.whitelist_url.as_ref().unwrap();
        assert_eq!(stored.len(), 1);
    }

    /// External domains are stored and looked up without regard to case.
    #[test]
    fn test_configuration_external_domains() {
        use case_insensitive_string::CaseInsensitiveString;

        let mut crawl_config = Configuration::default();
        crawl_config.external_domains_caseless = Arc::new(
            ["Example.Com", "OTHER.org"]
                .into_iter()
                .map(CaseInsensitiveString::from)
                .collect(),
        );

        assert_eq!(crawl_config.external_domains_caseless.len(), 2);

        let lowercase_probe = CaseInsensitiveString::from("example.com");
        assert!(crawl_config
            .external_domains_caseless
            .contains(&lowercase_probe));
    }

    /// A budget map assigned to the configuration can be read back by path.
    #[test]
    fn test_configuration_budget() {
        use case_insensitive_string::CaseInsensitiveString;

        let mut limits = hashbrown::HashMap::new();
        limits.insert(CaseInsensitiveString::from("/path"), 100u32);

        let mut crawl_config = Configuration::default();
        crawl_config.budget = Some(limits);
        assert!(crawl_config.budget.is_some());

        let lookup_key = CaseInsensitiveString::from("/path");
        let stored_limit = crawl_config.budget.as_ref().unwrap().get(&lookup_key);
        assert_eq!(stored_limit, Some(&100u32));
    }

    /// Without the `regex` feature the default allow list is empty.
    #[cfg(not(feature = "regex"))]
    #[test]
    fn test_allow_list_set_default() {
        let default_list = AllowListSet::default();
        assert!(default_list.0.is_empty());
    }

    /// Building the remote multimodal engine keeps both the vision and the text model,
    /// along with the configured vision route mode.
    #[cfg(feature = "agent")]
    #[test]
    fn test_build_remote_multimodal_engine_preserves_dual_models() {
        use crate::features::automation::{
            ModelEndpoint, RemoteMultimodalConfigs, VisionRouteMode,
        };

        let vision_endpoint = ModelEndpoint::new("vision-model").with_api_key("vision-key");
        let text_endpoint = ModelEndpoint::new("text-model")
            .with_api_url("https://text.example.com/v1/chat/completions")
            .with_api_key("text-key");
        let multimodal = RemoteMultimodalConfigs::new(
            "https://api.example.com/v1/chat/completions",
            "primary-model",
        )
        .with_vision_model(vision_endpoint)
        .with_text_model(text_endpoint)
        .with_vision_route_mode(VisionRouteMode::TextFirst);

        let mut crawl_config = Configuration::default();
        crawl_config.remote_multimodal = Some(Box::new(multimodal));

        let engine = crawl_config
            .build_remote_multimodal_engine()
            .expect("engine should be built");

        let vision_name = engine
            .vision_model
            .as_ref()
            .map(|endpoint| endpoint.model_name.as_str());
        assert_eq!(vision_name, Some("vision-model"));

        let text_name = engine
            .text_model
            .as_ref()
            .map(|endpoint| endpoint.model_name.as_str());
        assert_eq!(text_name, Some("text-model"));

        assert_eq!(engine.vision_route_mode, VisionRouteMode::TextFirst);
    }

    /// `SpiderBrowserConfig::new` stores the key and leaves everything else at defaults.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_config_defaults() {
        let browser_cfg = SpiderBrowserConfig::new("test-key");

        assert_eq!(browser_cfg.api_key, "test-key");
        assert_eq!(browser_cfg.wss_url, "wss://browser.spider.cloud/v1/browser");
        assert!(!browser_cfg.stealth);
        assert!(browser_cfg.browser.is_none());
        assert!(browser_cfg.country.is_none());
        assert!(browser_cfg.extra_params.is_none());
    }

    /// With only an API key, the connection URL carries just the `token` parameter.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_connection_url_basic() {
        let connection = SpiderBrowserConfig::new("sk-abc123").connection_url();

        assert_eq!(
            connection,
            "wss://browser.spider.cloud/v1/browser?token=sk-abc123"
        );
    }

    /// Every optional setting shows up in the connection URL, in a fixed order.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_connection_url_full() {
        let extra = vec![("timeout".into(), "30000".into())];
        let browser_cfg = SpiderBrowserConfig::new("sk-abc123")
            .with_stealth(true)
            .with_browser("chrome")
            .with_country("us")
            .with_extra_params(extra);

        assert_eq!(
            browser_cfg.connection_url(),
            "wss://browser.spider.cloud/v1/browser?token=sk-abc123&stealth=true&browser=chrome&country=us&timeout=30000"
        );
    }

    /// A custom WebSocket endpoint replaces the default base URL.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_spider_browser_connection_url_custom_wss() {
        let browser_cfg = SpiderBrowserConfig::new("key")
            .with_wss_url("wss://custom.browser.example.com/v1/browser");

        assert_eq!(
            browser_cfg.connection_url(),
            "wss://custom.browser.example.com/v1/browser?token=key"
        );
    }

    /// `with_spider_browser` stores the browser config and sets the Chrome connection URL.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_with_spider_browser_sets_chrome_connection() {
        let mut crawl_config = Configuration::default();
        crawl_config.with_spider_browser("my-api-key");

        assert_eq!(
            crawl_config.chrome_connection_url.as_deref(),
            Some("wss://browser.spider.cloud/v1/browser?token=my-api-key")
        );
        assert!(crawl_config.spider_browser.is_some());
    }

    /// `with_spider_browser_config` carries stealth and country into the connection URL.
    #[cfg(all(feature = "spider_cloud", feature = "chrome"))]
    #[test]
    fn test_with_spider_browser_config_stealth() {
        let stealth_cfg = SpiderBrowserConfig::new("key")
            .with_stealth(true)
            .with_country("gb");

        let mut crawl_config = Configuration::default();
        crawl_config.with_spider_browser_config(stealth_cfg);

        assert_eq!(
            crawl_config.chrome_connection_url.as_deref(),
            Some("wss://browser.spider.cloud/v1/browser?token=key&stealth=true&country=gb")
        );
    }
}