1use serde::Deserialize;
2
/// Root application configuration, produced by [`AppConfig::load`].
///
/// Every section carries `#[serde(default)]`, so a section that is missing
/// from the merged configuration falls back to that section type's `Default`.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct AppConfig {
    /// `server` section; see [`ServerConfig`].
    #[serde(default)]
    pub server: ServerConfig,
    /// `renderer` section; see [`RendererConfig`].
    #[serde(default)]
    pub renderer: RendererConfig,
    /// `crawler` section; see [`CrawlerConfig`].
    #[serde(default)]
    pub crawler: CrawlerConfig,
    /// `extraction` section; see [`ExtractionConfig`].
    #[serde(default)]
    pub extraction: ExtractionConfig,
    /// `auth` section; see [`AuthConfig`].
    #[serde(default)]
    pub auth: AuthConfig,
    /// `request` section; see [`RequestConfig`].
    #[serde(default)]
    pub request: RequestConfig,
    /// `search` section; see [`SearchConfig`].
    #[serde(default)]
    pub search: SearchConfig,
    /// `map` section; see [`MapConfig`].
    #[serde(default)]
    pub map: MapConfig,
    /// `client` section; see [`ClientConfig`].
    #[serde(default)]
    pub client: ClientConfig,
}
26
/// `client` section: an optional API URL and API key.
///
/// Both values default to `None` when absent.
/// NOTE(review): presumably read by a client/CLI that talks to a remote
/// instance — confirm against the callers.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct ClientConfig {
    /// Optional API base URL; `None` when not configured.
    #[serde(default)]
    pub api_url: Option<String>,
    /// Optional API key; `None` when not configured.
    #[serde(default)]
    pub api_key: Option<String>,
}
39
/// `map` section. Currently holds only the URL filter settings.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct MapConfig {
    /// URL filter settings; defaults to [`MapUrlFilterConfig::default`].
    #[serde(default)]
    pub url_filter: MapUrlFilterConfig,
}
46
/// `map.url_filter` section.
///
/// The two primary switches default to `true`; the remaining switch defaults
/// to `false` and the three parameter lists default to empty.
/// NOTE(review): the filtering itself lives outside this file; the field
/// descriptions below are read off the field names — confirm against the
/// filter implementation.
#[derive(Debug, Clone, Deserialize)]
pub struct MapUrlFilterConfig {
    /// Default `true`. Presumably strips tracking query parameters.
    #[serde(default = "default_true_filter")]
    pub strip_tracking_params: bool,
    /// Default `true`. Presumably drops action-style URLs.
    #[serde(default = "default_true_filter")]
    pub drop_action_urls: bool,
    /// Default `false`.
    #[serde(default)]
    pub gov_tld_drop_actions: bool,
    /// Default empty. Presumably extends the built-in tracking-parameter list.
    #[serde(default)]
    pub extra_tracking_params: Vec<String>,
    /// Default empty. Presumably extends the built-in action-parameter list.
    #[serde(default)]
    pub extra_action_params: Vec<String>,
    /// Default empty. Presumably lists parameters that must be kept.
    #[serde(default)]
    pub extra_preserve_params: Vec<String>,
}
72
73impl Default for MapUrlFilterConfig {
74 fn default() -> Self {
75 Self {
76 strip_tracking_params: true,
77 drop_action_urls: true,
78 gov_tld_drop_actions: false,
79 extra_tracking_params: Vec::new(),
80 extra_action_params: Vec::new(),
81 extra_preserve_params: Vec::new(),
82 }
83 }
84}
85
/// Serde default (`true`) for the `strip_tracking_params` and
/// `drop_action_urls` fields of [`MapUrlFilterConfig`].
fn default_true_filter() -> bool {
    true
}
89
/// Fixed allowance in milliseconds that
/// `RendererConfig::min_deadline_for_full_ladder_ms` adds once per active CDP
/// tier, on top of that tier's own timeout.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28_000;
100
/// Upper bound in milliseconds applied to a request's `wait_for_ms` inside
/// `AppConfig::effective_deadline_ms`; also used as the worst-case wait by
/// `AppConfig::effective_request_timeout_secs`.
pub const MAX_WAIT_FOR_MS: u64 = 60_000;
108
/// `search` section.
///
/// `timeout_ms` and `max_limit` also feed
/// `AppConfig::effective_request_timeout_secs`.
#[derive(Debug, Clone, Deserialize)]
pub struct SearchConfig {
    /// Default `true`.
    #[serde(default = "default_true_search")]
    pub enabled: bool,
    /// Optional SearXNG URL; `None` when not configured.
    #[serde(default)]
    pub searxng_url: Option<String>,
    /// Search timeout in milliseconds. Default 15 000.
    #[serde(default = "default_search_timeout_ms")]
    pub timeout_ms: u64,
    /// Default 5. Presumably the result count used when a request names none.
    #[serde(default = "default_search_limit")]
    pub default_limit: u32,
    /// Default 20. Treated as the maximum number of results to enrich when
    /// the outer request timeout is computed.
    #[serde(default = "default_search_max_limit")]
    pub max_limit: u32,
    /// Engine names; defaults to arxiv, crossref, google scholar and
    /// semantic scholar.
    #[serde(default = "default_research_engines")]
    pub research_engines: Vec<String>,
    /// Engine names; defaults to `["github"]`.
    #[serde(default = "default_github_engines")]
    pub github_engines: Vec<String>,
}
141
142impl Default for SearchConfig {
143 fn default() -> Self {
144 Self {
145 enabled: true,
146 searxng_url: None,
147 timeout_ms: default_search_timeout_ms(),
148 default_limit: default_search_limit(),
149 max_limit: default_search_max_limit(),
150 research_engines: default_research_engines(),
151 github_engines: default_github_engines(),
152 }
153 }
154}
155
/// Serde default (`true`) for `SearchConfig::enabled`.
fn default_true_search() -> bool {
    true
}
/// Serde default for `SearchConfig::timeout_ms`: 15 000 ms.
fn default_search_timeout_ms() -> u64 {
    15_000
}
/// Serde default for `SearchConfig::default_limit`: 5.
fn default_search_limit() -> u32 {
    5
}
/// Serde default for `SearchConfig::max_limit`: 20.
fn default_search_max_limit() -> u32 {
    20
}
/// Serde default for `SearchConfig::research_engines`.
fn default_research_engines() -> Vec<String> {
    ["arxiv", "crossref", "google scholar", "semantic scholar"]
        .iter()
        .map(|engine| String::from(*engine))
        .collect()
}
/// Serde default for `SearchConfig::github_engines`: the single engine `github`.
fn default_github_engines() -> Vec<String> {
    let mut engines = Vec::with_capacity(1);
    engines.push(String::from("github"));
    engines
}
179
/// `request` section: inputs to `AppConfig::effective_deadline_ms`.
#[derive(Debug, Clone, Deserialize)]
pub struct RequestConfig {
    /// Deadline in milliseconds used when a request does not supply one.
    /// Default 8000.
    #[serde(default = "default_deadline_ms")]
    pub deadline_ms_default: u64,
    /// Default `true`. When set, and at least one CDP tier is active, the
    /// default deadline is raised to cover the renderer ladder. Also gates
    /// the extension in `AppConfig::effective_request_timeout_secs`.
    #[serde(default = "default_true_request")]
    pub auto_extend_deadline_for_ladder: bool,
}
203
204impl Default for RequestConfig {
205 fn default() -> Self {
206 Self {
207 deadline_ms_default: default_deadline_ms(),
208 auto_extend_deadline_for_ladder: true,
209 }
210 }
211}
212
/// Serde default (`true`) for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    true
}
216
/// Serde default for `RequestConfig::deadline_ms_default`: 8000 ms.
fn default_deadline_ms() -> u64 {
    8000
}
220
/// `server` section.
#[derive(Debug, Clone, Deserialize)]
pub struct ServerConfig {
    /// Host string. Default `"0.0.0.0"`.
    #[serde(default = "default_host")]
    pub host: String,
    /// Port. Default 3000.
    #[serde(default = "default_port")]
    pub port: u16,
    /// Baseline request timeout in seconds. Default 60.
    /// `AppConfig::effective_request_timeout_secs` never returns less than this.
    #[serde(default = "default_request_timeout")]
    pub request_timeout_secs: u64,
    /// Rate limit in requests per second. Default 10.
    #[serde(default = "default_rate_limit_rps")]
    pub rate_limit_rps: u64,
}
233
234impl Default for ServerConfig {
235 fn default() -> Self {
236 Self {
237 host: default_host(),
238 port: default_port(),
239 request_timeout_secs: default_request_timeout(),
240 rate_limit_rps: default_rate_limit_rps(),
241 }
242 }
243}
244
/// Serde default for `ServerConfig::rate_limit_rps`: 10.
fn default_rate_limit_rps() -> u64 {
    10
}
248
/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Serde default for `ServerConfig::port`: 3000.
fn default_port() -> u16 {
    3000
}
/// Serde default for `ServerConfig::request_timeout_secs`: 60 seconds.
fn default_request_timeout() -> u64 {
    60
}
258
/// Renderer selection, spelled in lowercase in configuration
/// (`auto`, `none`, `lightpanda`, `chrome`, `playwright`).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RendererMode {
    /// Default. `RendererConfig::cdp_tier_count` treats this as selecting
    /// every CDP tier that has an endpoint configured.
    #[default]
    Auto,
    /// `RendererConfig::min_deadline_for_full_ladder_ms` counts neither the
    /// HTTP timeout nor any CDP tier in this mode.
    None,
    /// Selects only the Lightpanda tier.
    Lightpanda,
    /// Selects only the Chrome tier.
    Chrome,
    /// Selects only the Playwright tier.
    Playwright,
}
278
/// `renderer` section.
///
/// Only the timeout, mode and endpoint fields are interpreted in this file
/// (see the `impl RendererConfig` methods); the remaining fields are plain
/// settings whose consumers live elsewhere.
#[derive(Debug, Clone, Deserialize)]
pub struct RendererConfig {
    /// Renderer selection. Default [`RendererMode::Auto`].
    #[serde(default)]
    pub mode: RendererMode,
    /// Timeout in milliseconds used by any tier without its own override.
    /// Default 30000.
    #[serde(default = "default_page_timeout")]
    pub page_timeout_ms: u64,
    /// Optional override read by `http_timeout()`; `None` means
    /// `page_timeout_ms`.
    #[serde(default)]
    pub http_timeout_ms: Option<u64>,
    /// Optional override read by `lightpanda_timeout()`; `None` means
    /// `page_timeout_ms`.
    #[serde(default)]
    pub lightpanda_timeout_ms: Option<u64>,
    /// Optional override read by `chrome_timeout()`; `None` means
    /// `page_timeout_ms`. The ladder estimate also applies it to the
    /// Playwright tier.
    #[serde(default)]
    pub chrome_timeout_ms: Option<u64>,
    /// Default 4.
    #[serde(default = "default_pool_size")]
    pub pool_size: usize,
    /// Optional flag, `None` when unset. Also accepted under the key
    /// `force_js`.
    #[serde(default, alias = "force_js")]
    pub render_js_default: Option<bool>,
    /// Lightpanda CDP endpoint; the tier only counts when this is `Some`.
    #[serde(default)]
    pub lightpanda: Option<CdpEndpoint>,
    /// Playwright CDP endpoint; the tier only counts when this is `Some`.
    #[serde(default)]
    pub playwright: Option<CdpEndpoint>,
    /// Chrome CDP endpoint; the tier only counts when this is `Some`.
    #[serde(default)]
    pub chrome: Option<CdpEndpoint>,
    /// Default `false`.
    #[serde(default)]
    pub chrome_intercept_resources: bool,
    /// Default `false`.
    #[serde(default)]
    pub chrome_intercept_stylesheets: bool,
    /// Default empty. Presumably hosts for which interception is disabled —
    /// confirm against the Chrome renderer.
    #[serde(default)]
    pub chrome_host_intercept_disable: Vec<String>,
    /// Default 12 000 ms.
    #[serde(default = "default_chrome_nav_budget_ms")]
    pub chrome_nav_budget_ms: u64,
    /// Default `false`.
    #[serde(default)]
    pub chrome_context_pool_enabled: bool,
    /// Pool settings; see [`ChromePoolConfig`].
    #[serde(default)]
    pub chrome_pool: ChromePoolConfig,
    /// Default [`ChromeBackend::Vanilla`].
    #[serde(default)]
    pub chrome_backend: ChromeBackend,
    /// Default `false`.
    #[serde(default)]
    pub use_predictor: bool,
    /// Escalation settings; see [`EscalationConfig`].
    #[serde(default)]
    pub escalation: EscalationConfig,
    /// Anti-bot settings; see [`AntibotConfig`].
    #[serde(default)]
    pub antibot: AntibotConfig,
}
369
/// `renderer.escalation` section. Disabled by default.
#[derive(Debug, Clone, Deserialize)]
pub struct EscalationConfig {
    /// Default `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Default 8 000 ms.
    #[serde(default = "default_waterfall_timeout_ms")]
    pub waterfall_timeout_ms: u64,
    /// Default 60 000 ms.
    #[serde(default = "default_escalation_global_timeout_ms")]
    pub global_timeout_ms: u64,
    /// Default `false`.
    #[serde(default)]
    pub residential_proxy: bool,
    /// Default `"us"`.
    #[serde(default = "default_proxy_country")]
    pub proxy_country: String,
}
393
394impl Default for EscalationConfig {
395 fn default() -> Self {
396 Self {
397 enabled: false,
398 waterfall_timeout_ms: default_waterfall_timeout_ms(),
399 global_timeout_ms: default_escalation_global_timeout_ms(),
400 residential_proxy: false,
401 proxy_country: default_proxy_country(),
402 }
403 }
404}
405
/// Serde default for `EscalationConfig::waterfall_timeout_ms`: 8 000 ms.
fn default_waterfall_timeout_ms() -> u64 {
    8_000
}
/// Serde default for `EscalationConfig::global_timeout_ms`: 60 000 ms.
fn default_escalation_global_timeout_ms() -> u64 {
    60_000
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
415
/// `renderer.antibot` section.
#[derive(Debug, Clone, Deserialize)]
pub struct AntibotConfig {
    /// Default `true`.
    #[serde(default = "default_true")]
    pub enabled: bool,
    /// Default `false`.
    #[serde(default)]
    pub escalate_on_signal: bool,
}
428
429impl Default for AntibotConfig {
430 fn default() -> Self {
431 Self {
432 enabled: true,
433 escalate_on_signal: false,
434 }
435 }
436}
437
/// Serde default for `RendererConfig::chrome_nav_budget_ms`: 12 000 ms.
fn default_chrome_nav_budget_ms() -> u64 {
    12_000
}
441
/// `renderer.chrome_pool` section.
#[derive(Debug, Clone, Deserialize)]
pub struct ChromePoolConfig {
    /// Optional pool size; `None` when unset.
    /// NOTE(review): the fallback used for `None` is decided by the pool
    /// implementation, not visible here.
    #[serde(default)]
    pub size: Option<usize>,
    /// Default 1.
    #[serde(default = "default_recycle_after_navs")]
    pub recycle_after_navs: u32,
    /// Default 300 seconds.
    #[serde(default = "default_idle_timeout_secs")]
    pub idle_timeout_secs: u64,
    /// Default 60 seconds.
    #[serde(default = "default_health_check_secs")]
    pub health_check_secs: u64,
    /// Default 30 seconds.
    #[serde(default = "default_shutdown_drain_secs")]
    pub shutdown_drain_secs: u64,
}
465
466impl Default for ChromePoolConfig {
467 fn default() -> Self {
468 Self {
469 size: None,
470 recycle_after_navs: default_recycle_after_navs(),
471 idle_timeout_secs: default_idle_timeout_secs(),
472 health_check_secs: default_health_check_secs(),
473 shutdown_drain_secs: default_shutdown_drain_secs(),
474 }
475 }
476}
477
/// Serde default for `ChromePoolConfig::recycle_after_navs`: 1.
fn default_recycle_after_navs() -> u32 {
    1
}
/// Serde default for `ChromePoolConfig::idle_timeout_secs`: 300 seconds.
fn default_idle_timeout_secs() -> u64 {
    300
}
/// Serde default for `ChromePoolConfig::health_check_secs`: 60 seconds.
fn default_health_check_secs() -> u64 {
    60
}
/// Serde default for `ChromePoolConfig::shutdown_drain_secs`: 30 seconds.
fn default_shutdown_drain_secs() -> u64 {
    30
}
490
/// Chrome backend selection, spelled in lowercase in configuration
/// (`vanilla`, `browserless`).
#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum ChromeBackend {
    /// Default backend.
    #[default]
    Vanilla,
    /// NOTE(review): presumably a Browserless-hosted Chrome — confirm
    /// against the renderer code.
    Browserless,
}
506
507impl Default for RendererConfig {
508 fn default() -> Self {
509 Self {
510 mode: RendererMode::default(),
511 page_timeout_ms: default_page_timeout(),
512 http_timeout_ms: None,
513 lightpanda_timeout_ms: None,
514 chrome_timeout_ms: None,
515 pool_size: default_pool_size(),
516 render_js_default: None,
517 lightpanda: None,
518 playwright: None,
519 chrome: None,
520 chrome_intercept_resources: false,
521 chrome_intercept_stylesheets: false,
522 chrome_host_intercept_disable: Vec::new(),
523 chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
524 chrome_context_pool_enabled: false,
525 chrome_pool: ChromePoolConfig::default(),
526 chrome_backend: ChromeBackend::default(),
527 use_predictor: false,
528 escalation: EscalationConfig::default(),
529 antibot: AntibotConfig::default(),
530 }
531 }
532}
/// Serde default for `RendererConfig::page_timeout_ms`: 30000 ms.
fn default_page_timeout() -> u64 {
    30000
}
536
537impl RendererConfig {
538 pub fn http_timeout(&self) -> u64 {
548 self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
549 }
550 pub fn lightpanda_timeout(&self) -> u64 {
551 self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
552 }
553 pub fn chrome_timeout(&self) -> u64 {
554 self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
555 }
556
557 pub fn cdp_tier_count(&self) -> usize {
566 if !cfg!(feature = "cdp") {
567 return 0;
568 }
569 let want =
570 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
571 let mut n = 0;
572 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
573 n += 1;
574 }
575 if want(RendererMode::Playwright) && self.playwright.is_some() {
576 n += 1;
577 }
578 if want(RendererMode::Chrome) && self.chrome.is_some() {
579 n += 1;
580 }
581 n
582 }
583
584 pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
590 let want =
591 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
592
593 let mut sum: u64 = 0;
594 if !matches!(self.mode, RendererMode::None) {
598 sum = sum.saturating_add(self.http_timeout());
599 }
600
601 if !cfg!(feature = "cdp") {
605 return sum;
606 }
607
608 let mut cdp_tier_count: u64 = 0;
609 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
610 sum = sum.saturating_add(self.lightpanda_timeout());
611 cdp_tier_count += 1;
612 }
613 if want(RendererMode::Playwright) && self.playwright.is_some() {
614 sum = sum.saturating_add(self.chrome_timeout());
615 cdp_tier_count += 1;
616 }
617 if want(RendererMode::Chrome) && self.chrome.is_some() {
618 sum = sum.saturating_add(self.chrome_timeout());
619 cdp_tier_count += 1;
620 }
621 sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
622 }
623}
/// Serde default for `RendererConfig::pool_size`: 4.
fn default_pool_size() -> usize {
    4
}
627
/// A CDP endpoint, identified by its WebSocket URL.
#[derive(Debug, Clone, Deserialize)]
pub struct CdpEndpoint {
    /// WebSocket URL. Required: there is no serde default.
    pub ws_url: String,
}
632
/// `crawler.stealth` section. Disabled by default.
#[derive(Debug, Clone, Deserialize)]
pub struct StealthConfig {
    /// Default `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Default empty.
    /// NOTE(review): presumably an empty list means `BUILTIN_UA_POOL` is
    /// used — confirm against the consumer.
    #[serde(default)]
    pub user_agents: Vec<String>,
    /// Default 0.2.
    #[serde(default = "default_jitter")]
    pub jitter_factor: f64,
    /// Default `true`.
    #[serde(default = "default_true")]
    pub inject_headers: bool,
}
649
650impl Default for StealthConfig {
651 fn default() -> Self {
652 Self {
653 enabled: false,
654 user_agents: vec![],
655 jitter_factor: default_jitter(),
656 inject_headers: true,
657 }
658 }
659}
660
/// Serde default for `StealthConfig::jitter_factor`: 0.2.
fn default_jitter() -> f64 {
    0.2
}
664
/// Built-in pool of five browser User-Agent strings (Chrome on Windows,
/// macOS and Linux; Firefox on Windows; Safari on macOS).
pub const BUILTIN_UA_POOL: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
673
/// `crawler` section.
#[derive(Debug, Clone, Deserialize)]
pub struct CrawlerConfig {
    /// Default 10. Also used by `AppConfig::effective_request_timeout_secs`
    /// as the batch width for search enrichment.
    #[serde(default = "default_concurrency")]
    pub max_concurrency: usize,
    /// Default 10.0.
    #[serde(default = "default_rps")]
    pub requests_per_second: f64,
    /// Default `true`.
    #[serde(default = "default_true")]
    pub respect_robots_txt: bool,
    /// Default is a desktop Chrome-on-macOS User-Agent string.
    #[serde(default = "default_ua")]
    pub user_agent: String,
    /// Default 2.
    #[serde(default = "default_depth")]
    pub default_max_depth: u32,
    /// Default 100.
    #[serde(default = "default_max_pages")]
    pub default_max_pages: u32,
    /// Optional proxy; `None` when not configured.
    #[serde(default)]
    pub proxy: Option<String>,
    /// Default 3600 seconds.
    #[serde(default = "default_job_ttl")]
    pub job_ttl_secs: u64,
    /// Stealth settings; see [`StealthConfig`].
    #[serde(default)]
    pub stealth: StealthConfig,
    /// Default 0.
    #[serde(default)]
    pub per_host_min_interval_ms: u64,
    /// Default 1.
    #[serde(default = "default_per_host_max_concurrent")]
    pub per_host_max_concurrent: u32,
}
709
/// Serde default for `CrawlerConfig::per_host_max_concurrent`: 1.
fn default_per_host_max_concurrent() -> u32 {
    1
}
713
714impl Default for CrawlerConfig {
715 fn default() -> Self {
716 Self {
717 max_concurrency: default_concurrency(),
718 requests_per_second: default_rps(),
719 respect_robots_txt: true,
720 user_agent: default_ua(),
721 default_max_depth: default_depth(),
722 default_max_pages: default_max_pages(),
723 proxy: None,
724 job_ttl_secs: default_job_ttl(),
725 stealth: StealthConfig::default(),
726 per_host_min_interval_ms: 0,
727 per_host_max_concurrent: default_per_host_max_concurrent(),
728 }
729 }
730}
731
/// Serde default for `CrawlerConfig::max_concurrency`: 10.
fn default_concurrency() -> usize {
    10
}
/// Serde default for `CrawlerConfig::requests_per_second`: 10.0.
fn default_rps() -> f64 {
    10.0
}
/// Shared serde default (`true`) for `AntibotConfig::enabled`,
/// `StealthConfig::inject_headers` and `CrawlerConfig::respect_robots_txt`.
fn default_true() -> bool {
    true
}
/// Serde default for `CrawlerConfig::user_agent`: a desktop Chrome-on-macOS
/// User-Agent string.
fn default_ua() -> String {
    // The trailing backslash joins the two source lines into one string,
    // dropping the line break and the next line's leading whitespace.
    let ua: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 \
                    (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
    String::from(ua)
}
/// Serde default for `CrawlerConfig::default_max_depth`: 2.
fn default_depth() -> u32 {
    2
}
/// Serde default for `CrawlerConfig::default_max_pages`: 100.
fn default_max_pages() -> u32 {
    100
}
/// Serde default for `CrawlerConfig::job_ttl_secs`: 3600 seconds.
fn default_job_ttl() -> u64 {
    3600
}
758
/// `extraction` section.
#[derive(Debug, Clone, Deserialize)]
pub struct ExtractionConfig {
    /// Default `"markdown"`.
    #[serde(default = "default_format")]
    pub default_format: String,
    /// Default `true`.
    #[serde(default = "default_true_ext")]
    pub only_main_content: bool,
    /// Optional LLM settings; `None` when the `llm` table is absent.
    #[serde(default)]
    pub llm: Option<LlmConfig>,
    /// Default empty.
    /// NOTE(review): presumably maps a domain to a content selector —
    /// confirm against the extractor.
    #[serde(default)]
    pub domain_selectors: std::collections::HashMap<String, String>,
    /// LLM fallback settings; see [`LlmFallbackConfig`].
    #[serde(default)]
    pub llm_fallback: LlmFallbackConfig,
    /// Default 100 bytes.
    #[serde(default = "default_http_retry_threshold")]
    pub http_retry_threshold_bytes: usize,
    /// Default 2000 bytes.
    #[serde(default = "default_lightpanda_retry_threshold")]
    pub lightpanda_retry_threshold_bytes: usize,
}
783
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`: 100.
fn default_http_retry_threshold() -> usize {
    100
}
787
/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`: 2000.
fn default_lightpanda_retry_threshold() -> usize {
    2000
}
791
792impl Default for ExtractionConfig {
793 fn default() -> Self {
794 Self {
795 default_format: default_format(),
796 only_main_content: true,
797 llm: None,
798 domain_selectors: std::collections::HashMap::new(),
799 llm_fallback: LlmFallbackConfig::default(),
800 http_retry_threshold_bytes: default_http_retry_threshold(),
801 lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
802 }
803 }
804}
805
/// `extraction.llm_fallback` section. Disabled by default.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmFallbackConfig {
    /// Default `false`.
    #[serde(default)]
    pub enable: bool,
    /// Default 0.3.
    #[serde(default = "default_llm_quality_threshold")]
    pub quality_threshold: f32,
    /// Default 100 000 bytes.
    #[serde(default = "default_llm_max_html_bytes")]
    pub max_html_bytes: usize,
    /// Default `false`.
    #[serde(default)]
    pub always_run: bool,
}
821
822impl Default for LlmFallbackConfig {
823 fn default() -> Self {
824 Self {
825 enable: false,
826 quality_threshold: default_llm_quality_threshold(),
827 max_html_bytes: default_llm_max_html_bytes(),
828 always_run: false,
829 }
830 }
831}
832
/// Serde default for `LlmFallbackConfig::quality_threshold`: 0.3.
fn default_llm_quality_threshold() -> f32 {
    0.3
}
/// Serde default for `max_html_bytes` on both [`LlmFallbackConfig`] and
/// [`LlmConfig`]: 100 000 bytes.
fn default_llm_max_html_bytes() -> usize {
    100_000
}
839
/// `extraction.llm` section.
///
/// `api_key` has no serde default, so it must be supplied whenever this
/// table is present.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmConfig {
    /// Default `"anthropic"`.
    #[serde(default = "default_llm_provider")]
    pub provider: String,
    /// Required when deserializing. The `Default` impl uses an empty string.
    pub api_key: String,
    /// Default `"claude-sonnet-4-20250514"`.
    #[serde(default = "default_llm_model")]
    pub model: String,
    /// Optional base URL; `None` when not configured.
    #[serde(default)]
    pub base_url: Option<String>,
    /// Default 4096.
    #[serde(default = "default_llm_max_tokens")]
    pub max_tokens: u32,
    /// Optional; `None` when not configured.
    #[serde(default)]
    pub azure_api_version: Option<String>,
    /// Default 4.
    #[serde(default = "default_llm_max_concurrency")]
    pub max_concurrency: usize,
    /// Default 100 000 bytes.
    #[serde(default = "default_llm_max_html_bytes")]
    pub max_html_bytes: usize,
    /// Optional; `None` when not configured.
    /// NOTE(review): semantics are defined by the request handlers —
    /// confirm there.
    #[serde(default)]
    pub require_byok_header: Option<String>,
}
869
870impl Default for LlmConfig {
871 fn default() -> Self {
872 Self {
873 provider: default_llm_provider(),
874 api_key: String::new(),
875 model: default_llm_model(),
876 base_url: None,
877 max_tokens: default_llm_max_tokens(),
878 azure_api_version: None,
879 max_concurrency: default_llm_max_concurrency(),
880 max_html_bytes: default_llm_max_html_bytes(),
881 require_byok_header: None,
882 }
883 }
884}
885
/// Serde default for `LlmConfig::max_concurrency`: 4.
fn default_llm_max_concurrency() -> usize {
    4
}
889
/// Serde default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Serde default for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Serde default for `LlmConfig::max_tokens`: 4096.
fn default_llm_max_tokens() -> u32 {
    4096
}
899
/// Serde default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default (`true`) for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    true
}
906
/// `auth` section.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct AuthConfig {
    /// Default empty.
    /// NOTE(review): presumably an empty list disables key checking —
    /// confirm against the auth middleware.
    #[serde(default)]
    pub api_keys: Vec<String>,
}
912
/// Returns the path of the per-user configuration file, if one can be derived.
///
/// Resolution order:
/// 1. `$CRW_USER_CONFIG_DIR/config.toml` when `CRW_USER_CONFIG_DIR` is set.
/// 2. `$HOME/.config/crw/config.toml` when `HOME` is set.
///
/// Returns `None` when neither variable is set. The file is not checked for
/// existence; callers decide what to do with a missing file.
///
/// Both variables are read with `var_os`, so a directory whose name is not
/// valid Unicode is honoured instead of being silently skipped.
pub fn user_config_path() -> Option<std::path::PathBuf> {
    if let Some(dir) = std::env::var_os("CRW_USER_CONFIG_DIR") {
        return Some(std::path::PathBuf::from(dir).join("config.toml"));
    }
    let home = std::env::var_os("HOME")?;
    Some(
        std::path::PathBuf::from(home)
            .join(".config")
            .join("crw")
            .join("config.toml"),
    )
}
929
930impl AppConfig {
931 pub fn load() -> Result<Self, config::ConfigError> {
943 let mut builder = config::Config::builder()
944 .add_source(config::File::with_name("config.default").required(false));
945
946 if let Some(user_cfg) = user_config_path()
949 && user_cfg.exists()
950 {
951 builder = builder.add_source(config::File::from(user_cfg).required(false));
952 }
953
954 if let Ok(extra) = std::env::var("CRW_CONFIG") {
956 builder = builder.add_source(config::File::with_name(&extra).required(true));
957 } else {
958 builder = builder.add_source(config::File::with_name("config.local").required(false));
959 }
960
961 let cfg = builder
962 .add_source(
963 config::Environment::with_prefix("CRW")
964 .prefix_separator("_")
965 .separator("__")
966 .try_parsing(true),
967 )
968 .build()?;
969 cfg.try_deserialize()
970 }
971
972 pub fn effective_deadline_ms(
989 &self,
990 requested_deadline_ms: Option<u64>,
991 wait_for_ms: Option<u64>,
992 ) -> u64 {
993 if let Some(explicit) = requested_deadline_ms {
994 return explicit;
995 }
996 let default_ms = self.request.deadline_ms_default;
997 if !self.request.auto_extend_deadline_for_ladder {
998 return default_ms;
999 }
1000 if self.renderer.cdp_tier_count() == 0 {
1007 return default_ms;
1008 }
1009 let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
1010 const SPA_DEFAULT_MS: u64 = 8_000;
1015 let extra = if let Some(w) = wait_for_ms {
1021 let bounded = w.min(MAX_WAIT_FOR_MS);
1022 let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
1023 per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
1024 } else {
1025 0
1026 };
1027 default_ms.max(ladder_min.saturating_add(extra))
1028 }
1029
1030 pub fn effective_request_timeout_secs(&self) -> u64 {
1043 let baseline = self.server.request_timeout_secs;
1044 if !self.request.auto_extend_deadline_for_ladder {
1045 return baseline;
1046 }
1047 const OUTER_BUFFER_SECS: u64 = 5;
1048 const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
1052 let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
1058
1059 let conc = (self.crawler.max_concurrency.max(1)) as u64;
1063 let max_results = self.search.max_limit as u64;
1064 let enrich_batches = max_results.div_ceil(conc);
1065 let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
1066 let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
1067
1068 let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
1069 let needed_secs = max_handler_ms
1070 .div_ceil(1_000)
1071 .saturating_add(OUTER_BUFFER_SECS);
1072 baseline.max(needed_secs)
1073 }
1074}
1075
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes every test that reads or writes process environment
    /// variables; the environment is shared by all test threads.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Acquires [`ENV_LOCK`], recovering the guard if an earlier test
    /// panicked while holding it. The mutex protects no data, so a poisoned
    /// lock is still safe to reuse; without the recovery a single failing
    /// env test would make every later env test fail with `PoisonError`.
    fn lock_env() -> std::sync::MutexGuard<'static, ()> {
        ENV_LOCK
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Removes the `CRW_*` variables the env tests set, so each test starts
    /// from a known state. Call only while holding the env lock.
    fn clear_renderer_env() {
        for k in [
            "CRW_RENDERER__MODE",
            "CRW_RENDERER__FORCE_JS",
            "CRW_RENDERER__RENDER_JS_DEFAULT",
            "CRW_RENDERER__LIGHTPANDA__WS_URL",
            "CRW_SERVER__PORT",
        ] {
            // SAFETY: callers hold ENV_LOCK, so no other test in this module
            // touches the environment concurrently.
            unsafe { std::env::remove_var(k) };
        }
    }

    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            // Only deserialization success/failure matters; the value is never read.
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: the env lock is held for the whole test.
        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: the env lock is held for the whole test.
        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: the env lock is held for the whole test.
        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
        assert!(r.auto_extend_deadline_for_ladder);
    }

    #[test]
    fn default_app_config_enables_auto_extend() {
        let cfg = AppConfig::default();
        assert!(cfg.request.auto_extend_deadline_for_ladder);
        assert_eq!(cfg.request.deadline_ms_default, 8000);
    }

    /// Renderer in `Chrome` mode with a Chrome endpoint, using `chrome_ms`
    /// as both the page timeout and the Chrome timeout.
    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
        RendererConfig {
            mode: RendererMode::Chrome,
            page_timeout_ms: chrome_ms,
            chrome_timeout_ms: Some(chrome_ms),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        }
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_chrome_only() {
        let r = renderer_with_chrome_only(30_000);
        // HTTP (page timeout) + Chrome timeout + one tier overhead.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            30_000 + 30_000 + 28_000
        );
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_auto_three_tiers() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            lightpanda_timeout_ms: Some(2_500),
            chrome_timeout_ms: Some(30_000),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        };
        // HTTP + Lightpanda + Chrome timeouts, plus overhead for the two CDP tiers.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 2_500 + 30_000 + 2 * 28_000
        );
        assert_eq!(r.cdp_tier_count(), 2);
    }

    #[test]
    fn effective_deadline_explicit_bypasses_auto_extend() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_auto_extend_raises_to_ladder_min() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
        assert!(expected > 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
    }

    #[test]
    fn effective_deadline_default_wins_when_higher_than_ladder() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 1_000_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
    }

    #[test]
    fn effective_deadline_auto_extend_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_extends_for_long_wait_for() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
        let tier_count = cfg.renderer.cdp_tier_count() as u64;
        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
        // 20 000 ms wait minus the 8 000 ms already budgeted = 12 000 ms per tier.
        assert_eq!(with_wait, base + 12_000 * tier_count);
        // A wait below 8 000 ms adds nothing.
        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
    }

    #[test]
    fn effective_request_timeout_covers_map_ceiling() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.crawler.max_concurrency = 10;
        cfg.search.max_limit = 20;
        cfg.server.request_timeout_secs = 60;
        // 300 s map ceiling + 5 s outer buffer.
        assert!(cfg.effective_request_timeout_secs() >= 305);
    }

    #[test]
    fn effective_request_timeout_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.server.request_timeout_secs = 60;
        assert_eq!(cfg.effective_request_timeout_secs(), 60);
    }

    #[test]
    fn effective_request_timeout_respects_operator_override() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.server.request_timeout_secs = 600;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_request_timeout_secs(), 600);
    }

    #[test]
    fn effective_request_timeout_search_sequential_batching() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.search.max_limit = 20;
        cfg.crawler.max_concurrency = 1;
        cfg.server.request_timeout_secs = 60;
        let secs = cfg.effective_request_timeout_secs();
        // With concurrency 1, each of the 20 results is its own batch.
        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
        let expected_search_ms = 15_000 + 20 * scrape_ms;
        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
        assert_eq!(secs, 60u64.max(expected_secs));
    }

    #[test]
    #[cfg(not(feature = "cdp"))]
    fn cdp_tier_count_zero_without_cdp_feature() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            chrome_timeout_ms: Some(30_000),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 0);
        // Only the HTTP timeout (the page timeout here) is counted.
        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
    }

    #[test]
    fn effective_deadline_skipped_for_http_only_mode() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 30_000,
            lightpanda: None,
            playwright: None,
            chrome: None,
            ..Default::default()
        };
        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_playwright_only() {
        let r = RendererConfig {
            mode: RendererMode::Playwright,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            chrome_timeout_ms: Some(30_000),
            playwright: Some(CdpEndpoint {
                ws_url: "ws://playwright:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 1);
        // The Playwright tier is charged the Chrome timeout.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 30_000 + 28_000
        );
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: the env lock is held for the whole test.
        unsafe {
            std::env::set_var("CRW_SERVER__PORT", "4444");
            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        }
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();

        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }

    #[test]
    fn user_config_path_honors_override_env() {
        let _g = lock_env();
        let tmp = std::env::temp_dir().join(format!("crw-cfg-test-{}", std::process::id()));
        // SAFETY: the env lock is held for the whole test.
        unsafe {
            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
        }
        let p = user_config_path().unwrap();
        // SAFETY: as above.
        unsafe {
            std::env::remove_var("CRW_USER_CONFIG_DIR");
        }
        assert_eq!(p, tmp.join("config.toml"));
    }

    #[test]
    fn user_config_file_is_picked_up_by_load() {
        let _g = lock_env();
        clear_renderer_env();
        let tmp = std::env::temp_dir().join(format!("crw-load-test-{}", std::process::id()));
        std::fs::create_dir_all(&tmp).unwrap();
        let cfg_path = tmp.join("config.toml");
        std::fs::write(
            &cfg_path,
            r#"
[client]
api_url = "https://api.example.com"
api_key = "test-key-123"

[search]
searxng_url = "http://localhost:9999"

[extraction.llm]
provider = "deepseek"
api_key = "sk-test"
model = "deepseek-chat"
"#,
        )
        .unwrap();

        // SAFETY: the env lock is held for the whole test.
        unsafe {
            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
        }
        let cfg = AppConfig::load().unwrap();
        // SAFETY: as above.
        unsafe {
            std::env::remove_var("CRW_USER_CONFIG_DIR");
        }
        // Clean up before asserting so a failed assertion leaves no temp dir.
        std::fs::remove_dir_all(&tmp).ok();

        assert_eq!(
            cfg.client.api_url.as_deref(),
            Some("https://api.example.com")
        );
        assert_eq!(cfg.client.api_key.as_deref(), Some("test-key-123"));
        assert_eq!(
            cfg.search.searxng_url.as_deref(),
            Some("http://localhost:9999")
        );
        let llm = cfg.extraction.llm.expect("llm config present");
        assert_eq!(llm.provider, "deepseek");
        assert_eq!(llm.api_key, "sk-test");
    }

    #[test]
    fn env_var_beats_user_config() {
        let _g = lock_env();
        clear_renderer_env();
        let tmp = std::env::temp_dir().join(format!("crw-prec-test-{}", std::process::id()));
        std::fs::create_dir_all(&tmp).unwrap();
        std::fs::write(
            tmp.join("config.toml"),
            r#"
[search]
searxng_url = "http://from-file:8080"
"#,
        )
        .unwrap();

        // SAFETY: the env lock is held for the whole test.
        unsafe {
            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
            std::env::set_var("CRW_SEARCH__SEARXNG_URL", "http://from-env:8080");
        }
        let cfg = AppConfig::load().unwrap();
        // SAFETY: as above.
        unsafe {
            std::env::remove_var("CRW_USER_CONFIG_DIR");
            std::env::remove_var("CRW_SEARCH__SEARXNG_URL");
        }
        std::fs::remove_dir_all(&tmp).ok();

        assert_eq!(
            cfg.search.searxng_url.as_deref(),
            Some("http://from-env:8080"),
            "env var must win over user config file"
        );
    }
}