1use serde::Deserialize;
2
/// Root application configuration, produced by [`AppConfig::load`].
///
/// Every section carries `#[serde(default)]`, so a section that is absent from
/// the configuration sources falls back to that section's `Default` impl.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct AppConfig {
    /// Listener settings: host, port, request timeout and rate limit.
    #[serde(default)]
    pub server: ServerConfig,
    /// Rendering tiers, their timeouts and CDP endpoints.
    #[serde(default)]
    pub renderer: RendererConfig,
    /// Crawl limits, user agent, proxy and per-host throttling.
    #[serde(default)]
    pub crawler: CrawlerConfig,
    /// Content-extraction defaults and optional LLM settings.
    #[serde(default)]
    pub extraction: ExtractionConfig,
    /// API keys accepted by the service.
    #[serde(default)]
    pub auth: AuthConfig,
    /// Per-request deadline defaults (see [`AppConfig::effective_deadline_ms`]).
    #[serde(default)]
    pub request: RequestConfig,
    /// Search limits, timeout and engine lists.
    #[serde(default)]
    pub search: SearchConfig,
    /// Settings for the map feature (currently only its URL filter).
    #[serde(default)]
    pub map: MapConfig,
}
22
/// Configuration for the map feature; wraps the URL filter settings.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct MapConfig {
    /// URL filtering rules; falls back to `MapUrlFilterConfig::default()` when omitted.
    #[serde(default)]
    pub url_filter: MapUrlFilterConfig,
}
29
/// URL filter switches and extra parameter lists for the map feature.
///
/// NOTE(review): the filtering itself lives outside this file; the field
/// descriptions below are inferred from the names and should be confirmed
/// against the consumer.
#[derive(Debug, Clone, Deserialize)]
pub struct MapUrlFilterConfig {
    /// Defaults to `true`. Presumably removes tracking query parameters from URLs.
    #[serde(default = "default_true_filter")]
    pub strip_tracking_params: bool,
    /// Defaults to `true`. Presumably drops URLs that trigger actions rather than pages.
    #[serde(default = "default_true_filter")]
    pub drop_action_urls: bool,
    /// Defaults to `false`. Presumably extends action-URL dropping to government TLDs.
    #[serde(default)]
    pub gov_tld_drop_actions: bool,
    /// Defaults to empty. Presumably additional parameter names treated as tracking.
    #[serde(default)]
    pub extra_tracking_params: Vec<String>,
    /// Defaults to empty. Presumably additional parameter names treated as actions.
    #[serde(default)]
    pub extra_action_params: Vec<String>,
    /// Defaults to empty. Presumably parameter names that must never be stripped.
    #[serde(default)]
    pub extra_preserve_params: Vec<String>,
}
55
56impl Default for MapUrlFilterConfig {
57 fn default() -> Self {
58 Self {
59 strip_tracking_params: true,
60 drop_action_urls: true,
61 gov_tld_drop_actions: false,
62 extra_tracking_params: Vec::new(),
63 extra_action_params: Vec::new(),
64 extra_preserve_params: Vec::new(),
65 }
66 }
67}
68
/// Serde default for the `MapUrlFilterConfig` switches that are on unless
/// explicitly disabled (`strip_tracking_params`, `drop_action_urls`).
fn default_true_filter() -> bool {
    true
}
72
/// Fixed per-tier allowance in milliseconds, added once for every counted CDP
/// tier by `RendererConfig::min_deadline_for_full_ladder_ms`.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28_000;

/// Upper bound in milliseconds applied to a request's `wait_for_ms` in
/// `AppConfig::effective_deadline_ms`; also the worst-case wait assumed by
/// `AppConfig::effective_request_timeout_secs`.
pub const MAX_WAIT_FOR_MS: u64 = 60_000;
91
/// Search feature settings.
#[derive(Debug, Clone, Deserialize)]
pub struct SearchConfig {
    /// Whether search is enabled. Defaults to `true`.
    #[serde(default = "default_true_search")]
    pub enabled: bool,
    /// SearXNG base URL; `None` when not configured.
    #[serde(default)]
    pub searxng_url: Option<String>,
    /// Search timeout in milliseconds (default 15 000). Added to the enrichment
    /// estimate in `AppConfig::effective_request_timeout_secs`.
    #[serde(default = "default_search_timeout_ms")]
    pub timeout_ms: u64,
    /// Default result limit (default 5). Presumably used when a request omits one.
    #[serde(default = "default_search_limit")]
    pub default_limit: u32,
    /// Maximum result limit (default 20). Used as the worst-case result count
    /// in `AppConfig::effective_request_timeout_secs`.
    #[serde(default = "default_search_max_limit")]
    pub max_limit: u32,
    /// Engine names for research searches (see `default_research_engines`).
    #[serde(default = "default_research_engines")]
    pub research_engines: Vec<String>,
    /// Engine names for GitHub searches (default `["github"]`).
    #[serde(default = "default_github_engines")]
    pub github_engines: Vec<String>,
}
124
125impl Default for SearchConfig {
126 fn default() -> Self {
127 Self {
128 enabled: true,
129 searxng_url: None,
130 timeout_ms: default_search_timeout_ms(),
131 default_limit: default_search_limit(),
132 max_limit: default_search_max_limit(),
133 research_engines: default_research_engines(),
134 github_engines: default_github_engines(),
135 }
136 }
137}
138
/// Serde default for `SearchConfig::enabled`.
fn default_true_search() -> bool {
    true
}

/// Serde default for `SearchConfig::timeout_ms`: fifteen seconds, in milliseconds.
fn default_search_timeout_ms() -> u64 {
    15 * 1_000
}

/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    5
}

/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    20
}

/// Serde default for `SearchConfig::research_engines`.
fn default_research_engines() -> Vec<String> {
    ["arxiv", "crossref", "google scholar", "semantic scholar"]
        .iter()
        .map(|engine| engine.to_string())
        .collect()
}

/// Serde default for `SearchConfig::github_engines`.
fn default_github_engines() -> Vec<String> {
    vec![String::from("github")]
}
162
/// Per-request deadline settings consumed by `AppConfig::effective_deadline_ms`.
#[derive(Debug, Clone, Deserialize)]
pub struct RequestConfig {
    /// Deadline in milliseconds used when a request supplies none (default 8000).
    #[serde(default = "default_deadline_ms")]
    pub deadline_ms_default: u64,
    /// When `true` (the default), the default deadline may be raised so the
    /// configured renderer ladder can run; when `false`, the default deadline
    /// and the server request timeout are used unchanged.
    #[serde(default = "default_true_request")]
    pub auto_extend_deadline_for_ladder: bool,
}
186
187impl Default for RequestConfig {
188 fn default() -> Self {
189 Self {
190 deadline_ms_default: default_deadline_ms(),
191 auto_extend_deadline_for_ladder: true,
192 }
193 }
194}
195
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    true
}

/// Serde default for `RequestConfig::deadline_ms_default`: eight seconds, in milliseconds.
fn default_deadline_ms() -> u64 {
    8_000
}
203
/// HTTP server settings.
#[derive(Debug, Clone, Deserialize)]
pub struct ServerConfig {
    /// Host/interface (default `"0.0.0.0"`). Presumably the bind address.
    #[serde(default = "default_host")]
    pub host: String,
    /// TCP port (default 3000).
    #[serde(default = "default_port")]
    pub port: u16,
    /// Baseline request timeout in seconds (default 60). It is the floor
    /// returned by `AppConfig::effective_request_timeout_secs`.
    #[serde(default = "default_request_timeout")]
    pub request_timeout_secs: u64,
    /// Rate limit in requests per second (default 10).
    /// NOTE(review): scope (global vs per client) is not visible here.
    #[serde(default = "default_rate_limit_rps")]
    pub rate_limit_rps: u64,
}
216
217impl Default for ServerConfig {
218 fn default() -> Self {
219 Self {
220 host: default_host(),
221 port: default_port(),
222 request_timeout_secs: default_request_timeout(),
223 rate_limit_rps: default_rate_limit_rps(),
224 }
225 }
226}
227
/// Serde default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    10
}

/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}

/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    3_000
}

/// Serde default for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    60
}
241
/// Which rendering tier(s) are selected.
///
/// Deserialized from the lowercase variant name (`"auto"`, `"none"`,
/// `"lightpanda"`, `"chrome"`, `"playwright"`).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RendererMode {
    /// Default. Selects every CDP tier that has an endpoint configured.
    #[default]
    Auto,
    /// Selects no CDP tier; the HTTP timeout is also left out of the ladder budget.
    None,
    /// Selects only the Lightpanda tier.
    Lightpanda,
    /// Selects only the Chrome tier.
    Chrome,
    /// Selects only the Playwright tier.
    Playwright,
}
261
/// Renderer settings: tier selection, per-tier timeouts, CDP endpoints and
/// Chrome-specific toggles.
///
/// NOTE(review): most toggles are consumed outside this file; descriptions
/// marked "presumably" are inferred from names and should be confirmed.
#[derive(Debug, Clone, Deserialize)]
pub struct RendererConfig {
    /// Tier selection; defaults to `RendererMode::Auto`.
    #[serde(default)]
    pub mode: RendererMode,
    /// Base page timeout in milliseconds (default 30000); fallback for the
    /// per-tier timeouts below.
    #[serde(default = "default_page_timeout")]
    pub page_timeout_ms: u64,
    /// HTTP tier timeout override; `None` falls back to `page_timeout_ms`.
    #[serde(default)]
    pub http_timeout_ms: Option<u64>,
    /// Lightpanda tier timeout override; `None` falls back to `page_timeout_ms`.
    #[serde(default)]
    pub lightpanda_timeout_ms: Option<u64>,
    /// Chrome tier timeout override; `None` falls back to `page_timeout_ms`.
    /// Also used as the Playwright tier's budget in the ladder calculation.
    #[serde(default)]
    pub chrome_timeout_ms: Option<u64>,
    /// Pool size (default 4). Presumably the number of pooled renderer sessions.
    #[serde(default = "default_pool_size")]
    pub pool_size: usize,
    /// Optional default for JS rendering; also accepted under the key `force_js`.
    #[serde(default, alias = "force_js")]
    pub render_js_default: Option<bool>,
    /// Lightpanda CDP endpoint; the tier is counted only when this is set.
    #[serde(default)]
    pub lightpanda: Option<CdpEndpoint>,
    /// Playwright CDP endpoint; the tier is counted only when this is set.
    #[serde(default)]
    pub playwright: Option<CdpEndpoint>,
    /// Chrome CDP endpoint; the tier is counted only when this is set.
    #[serde(default)]
    pub chrome: Option<CdpEndpoint>,
    /// Defaults to `false`. Presumably enables resource interception in Chrome.
    #[serde(default)]
    pub chrome_intercept_resources: bool,
    /// Defaults to `false`. Presumably extends interception to stylesheets.
    #[serde(default)]
    pub chrome_intercept_stylesheets: bool,
    /// Defaults to empty. Presumably hosts for which interception is disabled.
    #[serde(default)]
    pub chrome_host_intercept_disable: Vec<String>,
    /// Chrome navigation budget in milliseconds (default 12 000).
    #[serde(default = "default_chrome_nav_budget_ms")]
    pub chrome_nav_budget_ms: u64,
    /// Defaults to `false`. Presumably enables the Chrome context pool.
    #[serde(default)]
    pub chrome_context_pool_enabled: bool,
    /// Chrome pool tuning; defaults to `ChromePoolConfig::default()`.
    #[serde(default)]
    pub chrome_pool: ChromePoolConfig,
    /// Chrome backend flavour; defaults to `ChromeBackend::Vanilla`.
    #[serde(default)]
    pub chrome_backend: ChromeBackend,
    /// Defaults to `false`. Presumably enables a tier predictor.
    #[serde(default)]
    pub use_predictor: bool,
    /// Escalation settings; defaults to `EscalationConfig::default()`.
    #[serde(default)]
    pub escalation: EscalationConfig,
    /// Anti-bot settings; defaults to `AntibotConfig::default()`.
    #[serde(default)]
    pub antibot: AntibotConfig,
}
352
/// Escalation settings nested under the renderer section.
///
/// NOTE(review): the escalation logic is outside this file; semantics marked
/// "presumably" are inferred from field names.
#[derive(Debug, Clone, Deserialize)]
pub struct EscalationConfig {
    /// Defaults to `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Waterfall timeout in milliseconds (default 8 000).
    #[serde(default = "default_waterfall_timeout_ms")]
    pub waterfall_timeout_ms: u64,
    /// Global escalation timeout in milliseconds (default 60 000).
    #[serde(default = "default_escalation_global_timeout_ms")]
    pub global_timeout_ms: u64,
    /// Defaults to `false`. Presumably routes escalated requests through a residential proxy.
    #[serde(default)]
    pub residential_proxy: bool,
    /// Proxy country code (default `"us"`).
    #[serde(default = "default_proxy_country")]
    pub proxy_country: String,
}
376
377impl Default for EscalationConfig {
378 fn default() -> Self {
379 Self {
380 enabled: false,
381 waterfall_timeout_ms: default_waterfall_timeout_ms(),
382 global_timeout_ms: default_escalation_global_timeout_ms(),
383 residential_proxy: false,
384 proxy_country: default_proxy_country(),
385 }
386 }
387}
388
/// Serde default for `EscalationConfig::waterfall_timeout_ms`: eight seconds, in milliseconds.
fn default_waterfall_timeout_ms() -> u64 {
    8 * 1_000
}

/// Serde default for `EscalationConfig::global_timeout_ms`: sixty seconds, in milliseconds.
fn default_escalation_global_timeout_ms() -> u64 {
    60 * 1_000
}

/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
398
/// Anti-bot settings nested under the renderer section.
#[derive(Debug, Clone, Deserialize)]
pub struct AntibotConfig {
    /// Defaults to `true`.
    #[serde(default = "default_true")]
    pub enabled: bool,
    /// Defaults to `false`. Presumably escalates when an anti-bot signal is
    /// detected — confirm against the consumer.
    #[serde(default)]
    pub escalate_on_signal: bool,
}
411
412impl Default for AntibotConfig {
413 fn default() -> Self {
414 Self {
415 enabled: true,
416 escalate_on_signal: false,
417 }
418 }
419}
420
/// Serde default for `RendererConfig::chrome_nav_budget_ms`: twelve seconds, in milliseconds.
fn default_chrome_nav_budget_ms() -> u64 {
    const TWELVE_SECONDS_MS: u64 = 12 * 1_000;
    TWELVE_SECONDS_MS
}
424
/// Chrome pool tuning.
///
/// NOTE(review): the pool implementation is outside this file; semantics
/// marked "presumably" are inferred from field names.
#[derive(Debug, Clone, Deserialize)]
pub struct ChromePoolConfig {
    /// Optional pool size; `None` by default.
    #[serde(default)]
    pub size: Option<usize>,
    /// Default 1. Presumably the number of navigations before a pooled entry is recycled.
    #[serde(default = "default_recycle_after_navs")]
    pub recycle_after_navs: u32,
    /// Idle timeout in seconds (default 300).
    #[serde(default = "default_idle_timeout_secs")]
    pub idle_timeout_secs: u64,
    /// Health-check interval in seconds (default 60).
    #[serde(default = "default_health_check_secs")]
    pub health_check_secs: u64,
    /// Shutdown drain period in seconds (default 30).
    #[serde(default = "default_shutdown_drain_secs")]
    pub shutdown_drain_secs: u64,
}
448
449impl Default for ChromePoolConfig {
450 fn default() -> Self {
451 Self {
452 size: None,
453 recycle_after_navs: default_recycle_after_navs(),
454 idle_timeout_secs: default_idle_timeout_secs(),
455 health_check_secs: default_health_check_secs(),
456 shutdown_drain_secs: default_shutdown_drain_secs(),
457 }
458 }
459}
460
/// Serde default for `ChromePoolConfig::recycle_after_navs`.
fn default_recycle_after_navs() -> u32 {
    1
}

/// Serde default for `ChromePoolConfig::idle_timeout_secs`: five minutes, in seconds.
fn default_idle_timeout_secs() -> u64 {
    5 * 60
}

/// Serde default for `ChromePoolConfig::health_check_secs`.
fn default_health_check_secs() -> u64 {
    60
}

/// Serde default for `ChromePoolConfig::shutdown_drain_secs`.
fn default_shutdown_drain_secs() -> u64 {
    30
}
473
/// Chrome backend flavour, deserialized from the lowercase variant name
/// (`"vanilla"`, `"browserless"`).
#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum ChromeBackend {
    /// Default backend.
    #[default]
    Vanilla,
    /// Presumably a Browserless-hosted Chrome — confirm against the consumer.
    Browserless,
}
489
490impl Default for RendererConfig {
491 fn default() -> Self {
492 Self {
493 mode: RendererMode::default(),
494 page_timeout_ms: default_page_timeout(),
495 http_timeout_ms: None,
496 lightpanda_timeout_ms: None,
497 chrome_timeout_ms: None,
498 pool_size: default_pool_size(),
499 render_js_default: None,
500 lightpanda: None,
501 playwright: None,
502 chrome: None,
503 chrome_intercept_resources: false,
504 chrome_intercept_stylesheets: false,
505 chrome_host_intercept_disable: Vec::new(),
506 chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
507 chrome_context_pool_enabled: false,
508 chrome_pool: ChromePoolConfig::default(),
509 chrome_backend: ChromeBackend::default(),
510 use_predictor: false,
511 escalation: EscalationConfig::default(),
512 antibot: AntibotConfig::default(),
513 }
514 }
515}
/// Serde default for `RendererConfig::page_timeout_ms`: thirty seconds, in milliseconds.
fn default_page_timeout() -> u64 {
    30_000
}
519
520impl RendererConfig {
521 pub fn http_timeout(&self) -> u64 {
531 self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
532 }
533 pub fn lightpanda_timeout(&self) -> u64 {
534 self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
535 }
536 pub fn chrome_timeout(&self) -> u64 {
537 self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
538 }
539
540 pub fn cdp_tier_count(&self) -> usize {
549 if !cfg!(feature = "cdp") {
550 return 0;
551 }
552 let want =
553 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
554 let mut n = 0;
555 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
556 n += 1;
557 }
558 if want(RendererMode::Playwright) && self.playwright.is_some() {
559 n += 1;
560 }
561 if want(RendererMode::Chrome) && self.chrome.is_some() {
562 n += 1;
563 }
564 n
565 }
566
567 pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
573 let want =
574 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
575
576 let mut sum: u64 = 0;
577 if !matches!(self.mode, RendererMode::None) {
581 sum = sum.saturating_add(self.http_timeout());
582 }
583
584 if !cfg!(feature = "cdp") {
588 return sum;
589 }
590
591 let mut cdp_tier_count: u64 = 0;
592 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
593 sum = sum.saturating_add(self.lightpanda_timeout());
594 cdp_tier_count += 1;
595 }
596 if want(RendererMode::Playwright) && self.playwright.is_some() {
597 sum = sum.saturating_add(self.chrome_timeout());
598 cdp_tier_count += 1;
599 }
600 if want(RendererMode::Chrome) && self.chrome.is_some() {
601 sum = sum.saturating_add(self.chrome_timeout());
602 cdp_tier_count += 1;
603 }
604 sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
605 }
606}
/// Serde default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    4
}
610
/// Connection details for one CDP renderer tier.
#[derive(Debug, Clone, Deserialize)]
pub struct CdpEndpoint {
    /// WebSocket URL of the endpoint (required; e.g. `ws://host:9222`).
    pub ws_url: String,
}
615
/// Stealth settings nested under the crawler section.
///
/// NOTE(review): consumers are outside this file; semantics marked
/// "presumably" are inferred from field names.
#[derive(Debug, Clone, Deserialize)]
pub struct StealthConfig {
    /// Defaults to `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Defaults to empty. Presumably a custom user-agent pool; whether
    /// `BUILTIN_UA_POOL` is used when empty is not visible here.
    #[serde(default)]
    pub user_agents: Vec<String>,
    /// Default 0.2. Presumably a fractional jitter applied to request pacing.
    #[serde(default = "default_jitter")]
    pub jitter_factor: f64,
    /// Defaults to `true`. Presumably injects browser-like headers.
    #[serde(default = "default_true")]
    pub inject_headers: bool,
}
632
633impl Default for StealthConfig {
634 fn default() -> Self {
635 Self {
636 enabled: false,
637 user_agents: vec![],
638 jitter_factor: default_jitter(),
639 inject_headers: true,
640 }
641 }
642}
643
/// Serde default for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    0.2
}
647
/// Built-in pool of desktop browser user-agent strings (Chrome on Windows,
/// macOS and Linux; Firefox on Windows; Safari on macOS).
///
/// NOTE(review): how the pool is selected from is decided by callers outside
/// this file.
pub const BUILTIN_UA_POOL: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
656
/// Crawler settings.
///
/// NOTE(review): most fields are consumed outside this file; semantics marked
/// "presumably" are inferred from field names.
#[derive(Debug, Clone, Deserialize)]
pub struct CrawlerConfig {
    /// Maximum concurrency (default 10). Used, clamped to at least 1, as the
    /// batch width in `AppConfig::effective_request_timeout_secs`.
    #[serde(default = "default_concurrency")]
    pub max_concurrency: usize,
    /// Requests per second (default 10.0).
    #[serde(default = "default_rps")]
    pub requests_per_second: f64,
    /// Defaults to `true`. Presumably honours `robots.txt`.
    #[serde(default = "default_true")]
    pub respect_robots_txt: bool,
    /// User-agent string (default: a desktop Chrome UA, see `default_ua`).
    #[serde(default = "default_ua")]
    pub user_agent: String,
    /// Default maximum crawl depth (default 2).
    #[serde(default = "default_depth")]
    pub default_max_depth: u32,
    /// Default maximum pages per crawl (default 100).
    #[serde(default = "default_max_pages")]
    pub default_max_pages: u32,
    /// Optional proxy; `None` by default. Presumably a proxy URL.
    #[serde(default)]
    pub proxy: Option<String>,
    /// Job time-to-live in seconds (default 3600).
    #[serde(default = "default_job_ttl")]
    pub job_ttl_secs: u64,
    /// Stealth settings; defaults to `StealthConfig::default()`.
    #[serde(default)]
    pub stealth: StealthConfig,
    /// Per-host minimum interval in milliseconds (default 0).
    #[serde(default)]
    pub per_host_min_interval_ms: u64,
    /// Per-host maximum concurrent requests (default 1).
    #[serde(default = "default_per_host_max_concurrent")]
    pub per_host_max_concurrent: u32,
}
692
/// Serde default for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    1
}
696
697impl Default for CrawlerConfig {
698 fn default() -> Self {
699 Self {
700 max_concurrency: default_concurrency(),
701 requests_per_second: default_rps(),
702 respect_robots_txt: true,
703 user_agent: default_ua(),
704 default_max_depth: default_depth(),
705 default_max_pages: default_max_pages(),
706 proxy: None,
707 job_ttl_secs: default_job_ttl(),
708 stealth: StealthConfig::default(),
709 per_host_min_interval_ms: 0,
710 per_host_max_concurrent: default_per_host_max_concurrent(),
711 }
712 }
713}
714
/// Serde default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    10
}

/// Serde default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    10.0
}

/// Shared serde default for boolean fields that are on unless disabled
/// (`AntibotConfig::enabled`, `StealthConfig::inject_headers`,
/// `CrawlerConfig::respect_robots_txt`).
fn default_true() -> bool {
    true
}

/// Serde default for `CrawlerConfig::user_agent`: a desktop Chrome user agent.
fn default_ua() -> String {
    String::from(concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ",
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ))
}

/// Serde default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    2
}

/// Serde default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    100
}

/// Serde default for `CrawlerConfig::job_ttl_secs`: one hour, in seconds.
fn default_job_ttl() -> u64 {
    60 * 60
}
741
/// Content-extraction settings.
///
/// NOTE(review): consumers are outside this file; semantics marked
/// "presumably" are inferred from field names.
#[derive(Debug, Clone, Deserialize)]
pub struct ExtractionConfig {
    /// Default output format (default `"markdown"`).
    #[serde(default = "default_format")]
    pub default_format: String,
    /// Defaults to `true`. Presumably restricts output to the main content.
    #[serde(default = "default_true_ext")]
    pub only_main_content: bool,
    /// Optional LLM settings; `None` by default.
    #[serde(default)]
    pub llm: Option<LlmConfig>,
    /// Defaults to empty. Presumably maps a domain to a content selector.
    #[serde(default)]
    pub domain_selectors: std::collections::HashMap<String, String>,
    /// LLM fallback settings; defaults to `LlmFallbackConfig::default()`.
    #[serde(default)]
    pub llm_fallback: LlmFallbackConfig,
    /// Default 100 bytes. Presumably the size below which an HTTP result is retried.
    #[serde(default = "default_http_retry_threshold")]
    pub http_retry_threshold_bytes: usize,
    /// Default 2000 bytes. Presumably the size below which a Lightpanda result is retried.
    #[serde(default = "default_lightpanda_retry_threshold")]
    pub lightpanda_retry_threshold_bytes: usize,
}
766
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    100
}

/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    2_000
}
774
775impl Default for ExtractionConfig {
776 fn default() -> Self {
777 Self {
778 default_format: default_format(),
779 only_main_content: true,
780 llm: None,
781 domain_selectors: std::collections::HashMap::new(),
782 llm_fallback: LlmFallbackConfig::default(),
783 http_retry_threshold_bytes: default_http_retry_threshold(),
784 lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
785 }
786 }
787}
788
/// LLM fallback settings nested under the extraction section.
///
/// NOTE(review): the fallback logic is outside this file; semantics marked
/// "presumably" are inferred from field names.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmFallbackConfig {
    /// Defaults to `false`.
    #[serde(default)]
    pub enable: bool,
    /// Default 0.3. Presumably the quality score below which the fallback runs.
    #[serde(default = "default_llm_quality_threshold")]
    pub quality_threshold: f32,
    /// Default 100 000. Presumably the largest HTML payload sent to the LLM.
    #[serde(default = "default_llm_max_html_bytes")]
    pub max_html_bytes: usize,
    /// Defaults to `false`. Presumably runs the fallback regardless of quality.
    #[serde(default)]
    pub always_run: bool,
}
804
805impl Default for LlmFallbackConfig {
806 fn default() -> Self {
807 Self {
808 enable: false,
809 quality_threshold: default_llm_quality_threshold(),
810 max_html_bytes: default_llm_max_html_bytes(),
811 always_run: false,
812 }
813 }
814}
815
/// Serde default for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    0.3_f32
}

/// Serde default for `max_html_bytes` on both `LlmFallbackConfig` and `LlmConfig`.
fn default_llm_max_html_bytes() -> usize {
    100 * 1_000
}
822
823#[derive(Debug, Clone, Deserialize)]
824pub struct LlmConfig {
825 #[serde(default = "default_llm_provider")]
826 pub provider: String,
827 pub api_key: String,
828 #[serde(default = "default_llm_model")]
829 pub model: String,
830 #[serde(default)]
831 pub base_url: Option<String>,
832 #[serde(default = "default_llm_max_tokens")]
833 pub max_tokens: u32,
834 #[serde(default)]
837 pub azure_api_version: Option<String>,
838 #[serde(default = "default_llm_max_concurrency")]
841 pub max_concurrency: usize,
842 #[serde(default = "default_llm_max_html_bytes")]
845 pub max_html_bytes: usize,
846 #[serde(default)]
850 pub require_byok_header: Option<String>,
851}
852
853impl Default for LlmConfig {
854 fn default() -> Self {
855 Self {
856 provider: default_llm_provider(),
857 api_key: String::new(),
858 model: default_llm_model(),
859 base_url: None,
860 max_tokens: default_llm_max_tokens(),
861 azure_api_version: None,
862 max_concurrency: default_llm_max_concurrency(),
863 max_html_bytes: default_llm_max_html_bytes(),
864 require_byok_header: None,
865 }
866 }
867}
868
/// Serde default for `LlmConfig::max_concurrency`.
fn default_llm_max_concurrency() -> usize {
    4
}

/// Serde default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}

/// Serde default for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}

/// Serde default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4_096
}

/// Serde default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}

/// Serde default for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    true
}
889
890#[derive(Debug, Clone, Default, Deserialize)]
891pub struct AuthConfig {
892 #[serde(default)]
893 pub api_keys: Vec<String>,
894}
895
896impl AppConfig {
897 pub fn load() -> Result<Self, config::ConfigError> {
900 let mut builder = config::Config::builder()
901 .add_source(config::File::with_name("config.default").required(false));
902
903 if let Ok(extra) = std::env::var("CRW_CONFIG") {
905 builder = builder.add_source(config::File::with_name(&extra).required(true));
906 } else {
907 builder = builder.add_source(config::File::with_name("config.local").required(false));
908 }
909
910 let cfg = builder
911 .add_source(
912 config::Environment::with_prefix("CRW")
913 .prefix_separator("_")
914 .separator("__")
915 .try_parsing(true),
916 )
917 .build()?;
918 cfg.try_deserialize()
919 }
920
921 pub fn effective_deadline_ms(
938 &self,
939 requested_deadline_ms: Option<u64>,
940 wait_for_ms: Option<u64>,
941 ) -> u64 {
942 if let Some(explicit) = requested_deadline_ms {
943 return explicit;
944 }
945 let default_ms = self.request.deadline_ms_default;
946 if !self.request.auto_extend_deadline_for_ladder {
947 return default_ms;
948 }
949 if self.renderer.cdp_tier_count() == 0 {
956 return default_ms;
957 }
958 let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
959 const SPA_DEFAULT_MS: u64 = 8_000;
964 let extra = if let Some(w) = wait_for_ms {
970 let bounded = w.min(MAX_WAIT_FOR_MS);
971 let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
972 per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
973 } else {
974 0
975 };
976 default_ms.max(ladder_min.saturating_add(extra))
977 }
978
979 pub fn effective_request_timeout_secs(&self) -> u64 {
992 let baseline = self.server.request_timeout_secs;
993 if !self.request.auto_extend_deadline_for_ladder {
994 return baseline;
995 }
996 const OUTER_BUFFER_SECS: u64 = 5;
997 const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
1001 let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
1007
1008 let conc = (self.crawler.max_concurrency.max(1)) as u64;
1012 let max_results = self.search.max_limit as u64;
1013 let enrich_batches = max_results.div_ceil(conc);
1014 let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
1015 let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
1016
1017 let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
1018 let needed_secs = max_handler_ms
1019 .div_ceil(1_000)
1020 .saturating_add(OUTER_BUFFER_SECS);
1021 baseline.max(needed_secs)
1022 }
1023}
1024
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes the tests in this module that mutate process-wide
    /// environment variables.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Removes every environment variable the tests below may set.
    fn clear_renderer_env() {
        for k in [
            "CRW_RENDERER__MODE",
            "CRW_RENDERER__FORCE_JS",
            "CRW_RENDERER__RENDER_JS_DEFAULT",
            "CRW_RENDERER__LIGHTPANDA__WS_URL",
            "CRW_SERVER__PORT",
        ] {
            // SAFETY: only called while `ENV_LOCK` is held (see `EnvGuard`),
            // so no other test in this module touches the environment
            // concurrently.
            unsafe { std::env::remove_var(k) };
        }
    }

    /// RAII guard for environment-mutating tests.
    ///
    /// Acquiring it takes `ENV_LOCK` and clears the managed variables;
    /// dropping it clears them again before the lock is released. The lock is
    /// recovered from a poisoned state, so one failing test does not make
    /// every later env test fail on `lock()`. The cleanup also runs when a
    /// test panics between setting a variable and asserting.
    struct EnvGuard {
        _lock: std::sync::MutexGuard<'static, ()>,
    }

    impl EnvGuard {
        fn acquire() -> Self {
            let lock = ENV_LOCK
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            clear_renderer_env();
            Self { _lock: lock }
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            // Runs before `_lock` is dropped, so the cleanup is still serialized.
            clear_renderer_env();
        }
    }

    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            // Only deserialization is exercised; the field is never read.
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let _env = EnvGuard::acquire();
        // SAFETY: `ENV_LOCK` is held through `_env`.
        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let _env = EnvGuard::acquire();
        // SAFETY: `ENV_LOCK` is held through `_env`.
        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let _env = EnvGuard::acquire();
        // SAFETY: `ENV_LOCK` is held through `_env`.
        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
        assert!(r.auto_extend_deadline_for_ladder);
    }

    #[test]
    fn default_app_config_enables_auto_extend() {
        let cfg = AppConfig::default();
        assert!(cfg.request.auto_extend_deadline_for_ladder);
        assert_eq!(cfg.request.deadline_ms_default, 8000);
    }

    /// Chrome-only renderer whose page and Chrome timeouts are both `chrome_ms`.
    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
        RendererConfig {
            mode: RendererMode::Chrome,
            page_timeout_ms: chrome_ms,
            chrome_timeout_ms: Some(chrome_ms),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        }
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_chrome_only() {
        let r = renderer_with_chrome_only(30_000);
        // HTTP (falls back to page timeout) + Chrome + one tier overhead.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            30_000 + 30_000 + 28_000
        );
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_auto_three_tiers() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            lightpanda_timeout_ms: Some(2_500),
            chrome_timeout_ms: Some(30_000),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        };
        // HTTP + Lightpanda + Chrome + two tier overheads (no Playwright endpoint).
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 2_500 + 30_000 + 2 * 28_000
        );
        assert_eq!(r.cdp_tier_count(), 2);
    }

    #[test]
    fn effective_deadline_explicit_bypasses_auto_extend() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_auto_extend_raises_to_ladder_min() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
        assert!(expected > 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
    }

    #[test]
    fn effective_deadline_default_wins_when_higher_than_ladder() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 1_000_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
    }

    #[test]
    fn effective_deadline_auto_extend_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_extends_for_long_wait_for() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
        let tier_count = cfg.renderer.cdp_tier_count() as u64;
        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
        // 20 000 ms wait minus the 8 000 ms allowance, per active tier.
        assert_eq!(with_wait, base + 12_000 * tier_count);
        // A wait below the allowance adds nothing.
        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
    }

    #[test]
    fn effective_request_timeout_covers_map_ceiling() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.crawler.max_concurrency = 10;
        cfg.search.max_limit = 20;
        cfg.server.request_timeout_secs = 60;
        // 300 s map ceiling + 5 s outer buffer.
        assert!(cfg.effective_request_timeout_secs() >= 305);
    }

    #[test]
    fn effective_request_timeout_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.server.request_timeout_secs = 60;
        assert_eq!(cfg.effective_request_timeout_secs(), 60);
    }

    #[test]
    fn effective_request_timeout_respects_operator_override() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.server.request_timeout_secs = 600;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_request_timeout_secs(), 600);
    }

    #[test]
    fn effective_request_timeout_search_sequential_batching() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.search.max_limit = 20;
        cfg.crawler.max_concurrency = 1;
        cfg.server.request_timeout_secs = 60;
        let secs = cfg.effective_request_timeout_secs();
        // Concurrency 1 means 20 results are enriched in 20 sequential batches.
        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
        let expected_search_ms = 15_000 + 20 * scrape_ms;
        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
        assert_eq!(secs, 60u64.max(expected_secs));
    }

    #[test]
    #[cfg(not(feature = "cdp"))]
    fn cdp_tier_count_zero_without_cdp_feature() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            chrome_timeout_ms: Some(30_000),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 0);
        // Only the HTTP timeout (page timeout fallback) is budgeted.
        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
    }

    #[test]
    fn effective_deadline_skipped_for_http_only_mode() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 30_000,
            lightpanda: None,
            playwright: None,
            chrome: None,
            ..Default::default()
        };
        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_playwright_only() {
        let r = RendererConfig {
            mode: RendererMode::Playwright,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            chrome_timeout_ms: Some(30_000),
            playwright: Some(CdpEndpoint {
                ws_url: "ws://playwright:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 1);
        // The Playwright tier is budgeted with the Chrome timeout.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 30_000 + 28_000
        );
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let _env = EnvGuard::acquire();
        // SAFETY: `ENV_LOCK` is held through `_env`.
        unsafe {
            std::env::set_var("CRW_SERVER__PORT", "4444");
            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        }
        let cfg = AppConfig::load().unwrap();

        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }
}