1use serde::Deserialize;
2
/// Root application configuration, with one field per configuration section.
///
/// Every section is marked `#[serde(default)]`, so a section missing from the
/// deserialized input falls back to that section type's `Default` value.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct AppConfig {
    /// `server` section; see [`ServerConfig`].
    #[serde(default)]
    pub server: ServerConfig,
    /// `renderer` section; see [`RendererConfig`].
    #[serde(default)]
    pub renderer: RendererConfig,
    /// `crawler` section; see [`CrawlerConfig`].
    #[serde(default)]
    pub crawler: CrawlerConfig,
    /// `extraction` section; see [`ExtractionConfig`].
    #[serde(default)]
    pub extraction: ExtractionConfig,
    /// `auth` section; see [`AuthConfig`].
    #[serde(default)]
    pub auth: AuthConfig,
    /// `request` section; see [`RequestConfig`].
    #[serde(default)]
    pub request: RequestConfig,
    /// `search` section; see [`SearchConfig`].
    #[serde(default)]
    pub search: SearchConfig,
    /// `map` section; see [`MapConfig`].
    #[serde(default)]
    pub map: MapConfig,
}
22
/// `map` configuration section; currently holds only the URL filter settings.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct MapConfig {
    /// URL filter settings; falls back to `MapUrlFilterConfig::default()` when absent.
    #[serde(default)]
    pub url_filter: MapUrlFilterConfig,
}
29
/// URL filter settings nested under [`MapConfig`].
///
/// NOTE(review): the code that consumes these flags is not in this file; the
/// per-field notes below only state the defaults that are visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct MapUrlFilterConfig {
    /// Defaults to `true` when absent (via `default_true_filter`).
    #[serde(default = "default_true_filter")]
    pub strip_tracking_params: bool,
    /// Defaults to `true` when absent (via `default_true_filter`).
    #[serde(default = "default_true_filter")]
    pub drop_action_urls: bool,
    /// Defaults to `false` when absent.
    #[serde(default)]
    pub gov_tld_drop_actions: bool,
    /// Additional tracking parameter names; empty when absent.
    #[serde(default)]
    pub extra_tracking_params: Vec<String>,
    /// Additional action parameter names; empty when absent.
    #[serde(default)]
    pub extra_action_params: Vec<String>,
    /// Additional parameter names to preserve; empty when absent.
    #[serde(default)]
    pub extra_preserve_params: Vec<String>,
}
55
56impl Default for MapUrlFilterConfig {
57 fn default() -> Self {
58 Self {
59 strip_tracking_params: true,
60 drop_action_urls: true,
61 gov_tld_drop_actions: false,
62 extra_tracking_params: Vec::new(),
63 extra_action_params: Vec::new(),
64 extra_preserve_params: Vec::new(),
65 }
66 }
67}
68
/// Serde default for the `MapUrlFilterConfig` switches that are on unless
/// configured otherwise (`strip_tracking_params`, `drop_action_urls`).
fn default_true_filter() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
72
/// Fixed allowance, in milliseconds, that
/// `RendererConfig::min_deadline_for_full_ladder_ms` adds once per counted CDP
/// tier on top of that tier's own timeout.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28_000;
83
/// Upper bound, in milliseconds, applied to a caller-supplied `wait_for_ms`
/// before `AppConfig::effective_deadline_ms` uses it to extend the deadline.
pub const MAX_WAIT_FOR_MS: u64 = 60_000;
91
/// `search` configuration section.
#[derive(Debug, Clone, Deserialize)]
pub struct SearchConfig {
    /// Defaults to `true` when absent.
    #[serde(default = "default_true_search")]
    pub enabled: bool,
    /// Optional SearXNG URL; `None` when absent.
    #[serde(default)]
    pub searxng_url: Option<String>,
    /// Search timeout in milliseconds; defaults to 15 000. Also feeds the
    /// search estimate in `AppConfig::effective_request_timeout_secs`.
    #[serde(default = "default_search_timeout_ms")]
    pub timeout_ms: u64,
    /// Defaults to 5 when absent.
    #[serde(default = "default_search_limit")]
    pub default_limit: u32,
    /// Defaults to 20 when absent. `AppConfig::effective_request_timeout_secs`
    /// uses it as the number of results when sizing enrichment batches.
    #[serde(default = "default_search_max_limit")]
    pub max_limit: u32,
    /// Defaults to `arxiv`, `crossref`, `google scholar`, `semantic scholar`.
    #[serde(default = "default_research_engines")]
    pub research_engines: Vec<String>,
    /// Defaults to the single entry `github`.
    #[serde(default = "default_github_engines")]
    pub github_engines: Vec<String>,
}
124
125impl Default for SearchConfig {
126 fn default() -> Self {
127 Self {
128 enabled: true,
129 searxng_url: None,
130 timeout_ms: default_search_timeout_ms(),
131 default_limit: default_search_limit(),
132 max_limit: default_search_max_limit(),
133 research_engines: default_research_engines(),
134 github_engines: default_github_engines(),
135 }
136 }
137}
138
/// Serde default for `SearchConfig::enabled`: search is on unless disabled.
fn default_true_search() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde default for `SearchConfig::timeout_ms`, in milliseconds.
fn default_search_timeout_ms() -> u64 {
    const TIMEOUT_MS: u64 = 15_000;
    TIMEOUT_MS
}
/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    const LIMIT: u32 = 5;
    LIMIT
}
/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    const MAX_LIMIT: u32 = 20;
    MAX_LIMIT
}
/// Serde default for `SearchConfig::research_engines`: the four engine names
/// below, in this order.
fn default_research_engines() -> Vec<String> {
    const ENGINES: [&str; 4] = ["arxiv", "crossref", "google scholar", "semantic scholar"];
    ENGINES.iter().map(|name| String::from(*name)).collect()
}
/// Serde default for `SearchConfig::github_engines`: a single `github` entry.
fn default_github_engines() -> Vec<String> {
    let mut engines = Vec::with_capacity(1);
    engines.push(String::from("github"));
    engines
}
162
/// `request` configuration section: the per-request deadline policy.
#[derive(Debug, Clone, Deserialize)]
pub struct RequestConfig {
    /// Baseline deadline in milliseconds used by
    /// `AppConfig::effective_deadline_ms` when the caller supplies no explicit
    /// deadline. Defaults to 8000.
    #[serde(default = "default_deadline_ms")]
    pub deadline_ms_default: u64,
    /// When `false`, `AppConfig::effective_deadline_ms` and
    /// `AppConfig::effective_request_timeout_secs` return their configured
    /// baselines without any extension. Defaults to `true`.
    #[serde(default = "default_true_request")]
    pub auto_extend_deadline_for_ladder: bool,
}
186
187impl Default for RequestConfig {
188 fn default() -> Self {
189 Self {
190 deadline_ms_default: default_deadline_ms(),
191 auto_extend_deadline_for_ladder: true,
192 }
193 }
194}
195
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
199
/// Serde default for `RequestConfig::deadline_ms_default`, in milliseconds.
fn default_deadline_ms() -> u64 {
    const DEADLINE_MS: u64 = 8_000;
    DEADLINE_MS
}
203
/// `server` configuration section.
#[derive(Debug, Clone, Deserialize)]
pub struct ServerConfig {
    /// Host string; defaults to `0.0.0.0`.
    #[serde(default = "default_host")]
    pub host: String,
    /// Port; defaults to 3000.
    #[serde(default = "default_port")]
    pub port: u16,
    /// Request timeout in seconds; defaults to 60. It is the baseline (and
    /// lower bound) of `AppConfig::effective_request_timeout_secs`.
    #[serde(default = "default_request_timeout")]
    pub request_timeout_secs: u64,
    /// Rate limit value; defaults to 10.
    // NOTE(review): presumably requests per second, judging by the name — confirm at the consumer.
    #[serde(default = "default_rate_limit_rps")]
    pub rate_limit_rps: u64,
}
216
217impl Default for ServerConfig {
218 fn default() -> Self {
219 Self {
220 host: default_host(),
221 port: default_port(),
222 request_timeout_secs: default_request_timeout(),
223 rate_limit_rps: default_rate_limit_rps(),
224 }
225 }
226}
227
/// Serde default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    const RATE_LIMIT_RPS: u64 = 10;
    RATE_LIMIT_RPS
}
231
/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    const PORT: u16 = 3_000;
    PORT
}
/// Serde default for `ServerConfig::request_timeout_secs`, in seconds.
fn default_request_timeout() -> u64 {
    const TIMEOUT_SECS: u64 = 60;
    TIMEOUT_SECS
}
241
/// Renderer selection mode.
///
/// Variants deserialize from their lowercase names (`auto`, `none`,
/// `lightpanda`, `chrome`, `playwright`); any other string is rejected.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RendererMode {
    /// Default mode. In `RendererConfig`'s tier calculations it selects every
    /// CDP tier that has an endpoint configured.
    #[default]
    Auto,
    /// In `RendererConfig::min_deadline_for_full_ladder_ms` this mode
    /// contributes no HTTP-tier time and matches no CDP tier.
    None,
    /// Selects only the Lightpanda tier in the tier calculations.
    Lightpanda,
    /// Selects only the Chrome tier in the tier calculations.
    Chrome,
    /// Selects only the Playwright tier in the tier calculations.
    Playwright,
}
261
/// `renderer` configuration section.
///
/// NOTE(review): most flags here are consumed outside this file; the field
/// notes state the visible defaults and, where this file shows it, how the
/// value is used by the methods of `impl RendererConfig`.
#[derive(Debug, Clone, Deserialize)]
pub struct RendererConfig {
    /// Renderer selection mode; defaults to `RendererMode::Auto`.
    #[serde(default)]
    pub mode: RendererMode,
    /// Page timeout in milliseconds; defaults to 30000. It is the fallback for
    /// each of the three optional per-tier timeouts below.
    #[serde(default = "default_page_timeout")]
    pub page_timeout_ms: u64,
    /// HTTP-tier timeout override; `http_timeout()` falls back to
    /// `page_timeout_ms` when this is `None`.
    #[serde(default)]
    pub http_timeout_ms: Option<u64>,
    /// Lightpanda-tier timeout override; `lightpanda_timeout()` falls back to
    /// `page_timeout_ms` when this is `None`.
    #[serde(default)]
    pub lightpanda_timeout_ms: Option<u64>,
    /// Chrome-tier timeout override; `chrome_timeout()` falls back to
    /// `page_timeout_ms` when this is `None`. The ladder calculation also
    /// applies this timeout to the Playwright tier.
    #[serde(default)]
    pub chrome_timeout_ms: Option<u64>,
    /// Pool size; defaults to 4.
    #[serde(default = "default_pool_size")]
    pub pool_size: usize,
    /// Optional JS-rendering default; also accepted under the key `force_js`.
    #[serde(default, alias = "force_js")]
    pub render_js_default: Option<bool>,
    /// Lightpanda CDP endpoint; the tier only counts when this is `Some`.
    #[serde(default)]
    pub lightpanda: Option<CdpEndpoint>,
    /// Playwright CDP endpoint; the tier only counts when this is `Some`.
    #[serde(default)]
    pub playwright: Option<CdpEndpoint>,
    /// Chrome CDP endpoint; the tier only counts when this is `Some`.
    #[serde(default)]
    pub chrome: Option<CdpEndpoint>,
    /// Defaults to `false`.
    #[serde(default)]
    pub chrome_intercept_resources: bool,
    /// Defaults to `false`.
    #[serde(default)]
    pub chrome_intercept_stylesheets: bool,
    /// Defaults to an empty list.
    #[serde(default)]
    pub chrome_host_intercept_disable: Vec<String>,
    /// Defaults to 12 000 (milliseconds, per the field name).
    #[serde(default = "default_chrome_nav_budget_ms")]
    pub chrome_nav_budget_ms: u64,
    /// Defaults to `false`.
    #[serde(default)]
    pub chrome_context_pool_enabled: bool,
    /// Chrome pool settings; defaults to `ChromePoolConfig::default()`.
    #[serde(default)]
    pub chrome_pool: ChromePoolConfig,
    /// Chrome backend; defaults to `ChromeBackend::Vanilla`.
    #[serde(default)]
    pub chrome_backend: ChromeBackend,
    /// Defaults to `false`.
    #[serde(default)]
    pub use_predictor: bool,
    /// Escalation settings; defaults to `EscalationConfig::default()`.
    #[serde(default)]
    pub escalation: EscalationConfig,
    /// Anti-bot settings; defaults to `AntibotConfig::default()`.
    #[serde(default)]
    pub antibot: AntibotConfig,
}
352
/// Escalation settings nested under `RendererConfig::escalation`.
///
/// NOTE(review): the consumer of these settings is not in this file; only the
/// defaults are documented.
#[derive(Debug, Clone, Deserialize)]
pub struct EscalationConfig {
    /// Defaults to `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Defaults to 8 000 (milliseconds, per the field name).
    #[serde(default = "default_waterfall_timeout_ms")]
    pub waterfall_timeout_ms: u64,
    /// Defaults to 60 000 (milliseconds, per the field name).
    #[serde(default = "default_escalation_global_timeout_ms")]
    pub global_timeout_ms: u64,
    /// Defaults to `false`.
    #[serde(default)]
    pub residential_proxy: bool,
    /// Defaults to `us`.
    #[serde(default = "default_proxy_country")]
    pub proxy_country: String,
}
376
377impl Default for EscalationConfig {
378 fn default() -> Self {
379 Self {
380 enabled: false,
381 waterfall_timeout_ms: default_waterfall_timeout_ms(),
382 global_timeout_ms: default_escalation_global_timeout_ms(),
383 residential_proxy: false,
384 proxy_country: default_proxy_country(),
385 }
386 }
387}
388
/// Serde default for `EscalationConfig::waterfall_timeout_ms`.
fn default_waterfall_timeout_ms() -> u64 {
    const WATERFALL_TIMEOUT_MS: u64 = 8_000;
    WATERFALL_TIMEOUT_MS
}
/// Serde default for `EscalationConfig::global_timeout_ms`.
fn default_escalation_global_timeout_ms() -> u64 {
    const GLOBAL_TIMEOUT_MS: u64 = 60_000;
    GLOBAL_TIMEOUT_MS
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
398
/// Anti-bot settings nested under `RendererConfig::antibot`.
#[derive(Debug, Clone, Deserialize)]
pub struct AntibotConfig {
    /// Defaults to `true` when absent (via `default_true`).
    #[serde(default = "default_true")]
    pub enabled: bool,
    /// Defaults to `false` when absent.
    #[serde(default)]
    pub escalate_on_signal: bool,
}
411
412impl Default for AntibotConfig {
413 fn default() -> Self {
414 Self {
415 enabled: true,
416 escalate_on_signal: false,
417 }
418 }
419}
420
/// Serde default for `RendererConfig::chrome_nav_budget_ms`.
fn default_chrome_nav_budget_ms() -> u64 {
    const NAV_BUDGET_MS: u64 = 12_000;
    NAV_BUDGET_MS
}
424
/// Chrome pool settings nested under `RendererConfig::chrome_pool`.
///
/// NOTE(review): the pool implementation is not in this file; only the
/// defaults are documented.
#[derive(Debug, Clone, Deserialize)]
pub struct ChromePoolConfig {
    /// Optional pool size; `None` when absent.
    #[serde(default)]
    pub size: Option<usize>,
    /// Defaults to 1.
    #[serde(default = "default_recycle_after_navs")]
    pub recycle_after_navs: u32,
    /// Defaults to 300 (seconds, per the field name).
    #[serde(default = "default_idle_timeout_secs")]
    pub idle_timeout_secs: u64,
    /// Defaults to 60 (seconds, per the field name).
    #[serde(default = "default_health_check_secs")]
    pub health_check_secs: u64,
    /// Defaults to 30 (seconds, per the field name).
    #[serde(default = "default_shutdown_drain_secs")]
    pub shutdown_drain_secs: u64,
}
448
449impl Default for ChromePoolConfig {
450 fn default() -> Self {
451 Self {
452 size: None,
453 recycle_after_navs: default_recycle_after_navs(),
454 idle_timeout_secs: default_idle_timeout_secs(),
455 health_check_secs: default_health_check_secs(),
456 shutdown_drain_secs: default_shutdown_drain_secs(),
457 }
458 }
459}
460
/// Serde default for `ChromePoolConfig::recycle_after_navs`.
fn default_recycle_after_navs() -> u32 {
    const RECYCLE_AFTER_NAVS: u32 = 1;
    RECYCLE_AFTER_NAVS
}
/// Serde default for `ChromePoolConfig::idle_timeout_secs`.
fn default_idle_timeout_secs() -> u64 {
    const IDLE_TIMEOUT_SECS: u64 = 300;
    IDLE_TIMEOUT_SECS
}
/// Serde default for `ChromePoolConfig::health_check_secs`.
fn default_health_check_secs() -> u64 {
    const HEALTH_CHECK_SECS: u64 = 60;
    HEALTH_CHECK_SECS
}
/// Serde default for `ChromePoolConfig::shutdown_drain_secs`.
fn default_shutdown_drain_secs() -> u64 {
    const SHUTDOWN_DRAIN_SECS: u64 = 30;
    SHUTDOWN_DRAIN_SECS
}
473
/// Chrome backend selector used by `RendererConfig::chrome_backend`.
///
/// Variants deserialize from their lowercase names (`vanilla`, `browserless`).
#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum ChromeBackend {
    /// Default backend.
    #[default]
    Vanilla,
    /// Alternative backend.
    // NOTE(review): presumably a Browserless-hosted Chrome — confirm at the consumer.
    Browserless,
}
489
490impl Default for RendererConfig {
491 fn default() -> Self {
492 Self {
493 mode: RendererMode::default(),
494 page_timeout_ms: default_page_timeout(),
495 http_timeout_ms: None,
496 lightpanda_timeout_ms: None,
497 chrome_timeout_ms: None,
498 pool_size: default_pool_size(),
499 render_js_default: None,
500 lightpanda: None,
501 playwright: None,
502 chrome: None,
503 chrome_intercept_resources: false,
504 chrome_intercept_stylesheets: false,
505 chrome_host_intercept_disable: Vec::new(),
506 chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
507 chrome_context_pool_enabled: false,
508 chrome_pool: ChromePoolConfig::default(),
509 chrome_backend: ChromeBackend::default(),
510 use_predictor: false,
511 escalation: EscalationConfig::default(),
512 antibot: AntibotConfig::default(),
513 }
514 }
515}
/// Serde default for `RendererConfig::page_timeout_ms`, in milliseconds.
fn default_page_timeout() -> u64 {
    const PAGE_TIMEOUT_MS: u64 = 30_000;
    PAGE_TIMEOUT_MS
}
519
520impl RendererConfig {
521 pub fn http_timeout(&self) -> u64 {
531 self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
532 }
533 pub fn lightpanda_timeout(&self) -> u64 {
534 self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
535 }
536 pub fn chrome_timeout(&self) -> u64 {
537 self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
538 }
539
540 pub fn cdp_tier_count(&self) -> usize {
549 if !cfg!(feature = "cdp") {
550 return 0;
551 }
552 let want =
553 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
554 let mut n = 0;
555 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
556 n += 1;
557 }
558 if want(RendererMode::Playwright) && self.playwright.is_some() {
559 n += 1;
560 }
561 if want(RendererMode::Chrome) && self.chrome.is_some() {
562 n += 1;
563 }
564 n
565 }
566
567 pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
573 let want =
574 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
575
576 let mut sum: u64 = 0;
577 if !matches!(self.mode, RendererMode::None) {
581 sum = sum.saturating_add(self.http_timeout());
582 }
583
584 if !cfg!(feature = "cdp") {
588 return sum;
589 }
590
591 let mut cdp_tier_count: u64 = 0;
592 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
593 sum = sum.saturating_add(self.lightpanda_timeout());
594 cdp_tier_count += 1;
595 }
596 if want(RendererMode::Playwright) && self.playwright.is_some() {
597 sum = sum.saturating_add(self.chrome_timeout());
598 cdp_tier_count += 1;
599 }
600 if want(RendererMode::Chrome) && self.chrome.is_some() {
601 sum = sum.saturating_add(self.chrome_timeout());
602 cdp_tier_count += 1;
603 }
604 sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
605 }
606}
/// Serde default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    const POOL_SIZE: usize = 4;
    POOL_SIZE
}
610
/// Address of a CDP endpoint, used for the `lightpanda`, `playwright` and
/// `chrome` entries of [`RendererConfig`].
#[derive(Debug, Clone, Deserialize)]
pub struct CdpEndpoint {
    /// Endpoint URL. Required: it has no serde default. The tests in this file
    /// use `ws://` URLs.
    pub ws_url: String,
}
615
/// Stealth settings nested under `CrawlerConfig::stealth`.
///
/// NOTE(review): the consumer of these settings is not in this file; only the
/// defaults are documented.
#[derive(Debug, Clone, Deserialize)]
pub struct StealthConfig {
    /// Defaults to `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Configured user-agent strings; empty when absent.
    #[serde(default)]
    pub user_agents: Vec<String>,
    /// Defaults to 0.2.
    #[serde(default = "default_jitter")]
    pub jitter_factor: f64,
    /// Defaults to `true` (via `default_true`).
    #[serde(default = "default_true")]
    pub inject_headers: bool,
}
632
633impl Default for StealthConfig {
634 fn default() -> Self {
635 Self {
636 enabled: false,
637 user_agents: vec![],
638 jitter_factor: default_jitter(),
639 inject_headers: true,
640 }
641 }
642}
643
/// Serde default for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    const JITTER_FACTOR: f64 = 0.2;
    JITTER_FACTOR
}
647
/// Built-in list of browser user-agent strings (Chrome on Windows, macOS and
/// Linux, Firefox on Windows, Safari on macOS).
// NOTE(review): nothing in this file reads this pool; presumably it is the
// fallback when `StealthConfig::user_agents` is empty — confirm at the caller.
pub const BUILTIN_UA_POOL: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
656
/// `crawler` configuration section.
#[derive(Debug, Clone, Deserialize)]
pub struct CrawlerConfig {
    /// Defaults to 10. `AppConfig::effective_request_timeout_secs` uses it
    /// (clamped to at least 1) as the enrichment batch width.
    #[serde(default = "default_concurrency")]
    pub max_concurrency: usize,
    /// Defaults to 10.0.
    #[serde(default = "default_rps")]
    pub requests_per_second: f64,
    /// Defaults to `true` (via `default_true`).
    #[serde(default = "default_true")]
    pub respect_robots_txt: bool,
    /// Defaults to the Chrome-on-macOS string returned by `default_ua`.
    #[serde(default = "default_ua")]
    pub user_agent: String,
    /// Defaults to 2.
    #[serde(default = "default_depth")]
    pub default_max_depth: u32,
    /// Defaults to 100.
    #[serde(default = "default_max_pages")]
    pub default_max_pages: u32,
    /// Optional proxy setting; `None` when absent.
    #[serde(default)]
    pub proxy: Option<String>,
    /// Defaults to 3600 (seconds, per the field name).
    #[serde(default = "default_job_ttl")]
    pub job_ttl_secs: u64,
    /// Stealth settings; defaults to `StealthConfig::default()`.
    #[serde(default)]
    pub stealth: StealthConfig,
    /// Defaults to 0 (milliseconds, per the field name).
    #[serde(default)]
    pub per_host_min_interval_ms: u64,
    /// Defaults to 1.
    #[serde(default = "default_per_host_max_concurrent")]
    pub per_host_max_concurrent: u32,
}
692
/// Serde default for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    const PER_HOST_MAX_CONCURRENT: u32 = 1;
    PER_HOST_MAX_CONCURRENT
}
696
697impl Default for CrawlerConfig {
698 fn default() -> Self {
699 Self {
700 max_concurrency: default_concurrency(),
701 requests_per_second: default_rps(),
702 respect_robots_txt: true,
703 user_agent: default_ua(),
704 default_max_depth: default_depth(),
705 default_max_pages: default_max_pages(),
706 proxy: None,
707 job_ttl_secs: default_job_ttl(),
708 stealth: StealthConfig::default(),
709 per_host_min_interval_ms: 0,
710 per_host_max_concurrent: default_per_host_max_concurrent(),
711 }
712 }
713}
714
/// Serde default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    const MAX_CONCURRENCY: usize = 10;
    MAX_CONCURRENCY
}
/// Serde default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    const REQUESTS_PER_SECOND: f64 = 10.0;
    REQUESTS_PER_SECOND
}
/// Shared serde default for boolean fields that are on unless configured
/// otherwise (`AntibotConfig::enabled`, `StealthConfig::inject_headers`,
/// `CrawlerConfig::respect_robots_txt`).
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde default for `CrawlerConfig::user_agent`: a Chrome-on-macOS
/// user-agent string.
fn default_ua() -> String {
    // The trailing backslash continues the literal on the next line and drops
    // that line's leading whitespace, so the value is one line of text.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 \
     (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        .into()
}
/// Serde default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    const MAX_DEPTH: u32 = 2;
    MAX_DEPTH
}
/// Serde default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    const MAX_PAGES: u32 = 100;
    MAX_PAGES
}
/// Serde default for `CrawlerConfig::job_ttl_secs`.
fn default_job_ttl() -> u64 {
    const JOB_TTL_SECS: u64 = 3_600;
    JOB_TTL_SECS
}
741
/// `extraction` configuration section.
///
/// NOTE(review): the extraction pipeline is not in this file; only the
/// defaults are documented.
#[derive(Debug, Clone, Deserialize)]
pub struct ExtractionConfig {
    /// Defaults to `markdown`.
    #[serde(default = "default_format")]
    pub default_format: String,
    /// Defaults to `true` (via `default_true_ext`).
    #[serde(default = "default_true_ext")]
    pub only_main_content: bool,
    /// Optional LLM settings; `None` when absent.
    #[serde(default)]
    pub llm: Option<LlmConfig>,
    /// String-to-string map; empty when absent.
    // NOTE(review): presumably domain -> selector, judging by the name — confirm at the consumer.
    #[serde(default)]
    pub domain_selectors: std::collections::HashMap<String, String>,
    /// LLM fallback settings; defaults to `LlmFallbackConfig::default()`.
    #[serde(default)]
    pub llm_fallback: LlmFallbackConfig,
    /// Defaults to 100 (bytes, per the field name).
    #[serde(default = "default_http_retry_threshold")]
    pub http_retry_threshold_bytes: usize,
    /// Defaults to 2000 (bytes, per the field name).
    #[serde(default = "default_lightpanda_retry_threshold")]
    pub lightpanda_retry_threshold_bytes: usize,
}
766
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    const THRESHOLD_BYTES: usize = 100;
    THRESHOLD_BYTES
}
770
/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    const THRESHOLD_BYTES: usize = 2_000;
    THRESHOLD_BYTES
}
774
775impl Default for ExtractionConfig {
776 fn default() -> Self {
777 Self {
778 default_format: default_format(),
779 only_main_content: true,
780 llm: None,
781 domain_selectors: std::collections::HashMap::new(),
782 llm_fallback: LlmFallbackConfig::default(),
783 http_retry_threshold_bytes: default_http_retry_threshold(),
784 lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
785 }
786 }
787}
788
/// LLM fallback settings nested under `ExtractionConfig::llm_fallback`.
///
/// NOTE(review): the fallback logic is not in this file; only the defaults
/// are documented.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmFallbackConfig {
    /// Defaults to `false`.
    #[serde(default)]
    pub enable: bool,
    /// Defaults to 0.3.
    #[serde(default = "default_llm_quality_threshold")]
    pub quality_threshold: f32,
    /// Defaults to 100 000 (bytes, per the field name).
    #[serde(default = "default_llm_max_html_bytes")]
    pub max_html_bytes: usize,
    /// Defaults to `false`.
    #[serde(default)]
    pub always_run: bool,
}
804
805impl Default for LlmFallbackConfig {
806 fn default() -> Self {
807 Self {
808 enable: false,
809 quality_threshold: default_llm_quality_threshold(),
810 max_html_bytes: default_llm_max_html_bytes(),
811 always_run: false,
812 }
813 }
814}
815
/// Serde default for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    const QUALITY_THRESHOLD: f32 = 0.3;
    QUALITY_THRESHOLD
}
/// Serde default for `LlmFallbackConfig::max_html_bytes`.
fn default_llm_max_html_bytes() -> usize {
    const MAX_HTML_BYTES: usize = 100_000;
    MAX_HTML_BYTES
}
822
/// LLM settings used by `ExtractionConfig::llm`.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmConfig {
    /// Provider name; defaults to `anthropic`.
    #[serde(default = "default_llm_provider")]
    pub provider: String,
    /// API key. Required: it has no serde default, so deserializing an `llm`
    /// table without it fails.
    pub api_key: String,
    /// Model name; defaults to `claude-sonnet-4-20250514`.
    #[serde(default = "default_llm_model")]
    pub model: String,
    /// Optional base URL; `None` when absent.
    #[serde(default)]
    pub base_url: Option<String>,
    /// Defaults to 4096.
    #[serde(default = "default_llm_max_tokens")]
    pub max_tokens: u32,
    /// Optional Azure API version; `None` when absent.
    #[serde(default)]
    pub azure_api_version: Option<String>,
}
839
/// Serde default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Serde default for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Serde default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    const MAX_TOKENS: u32 = 4_096;
    MAX_TOKENS
}
849
/// Serde default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
856
/// `auth` configuration section.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct AuthConfig {
    /// Configured API keys; empty when absent.
    // NOTE(review): how an empty list is treated (open access vs. deny all) is decided by the consumer — confirm there.
    #[serde(default)]
    pub api_keys: Vec<String>,
}
862
863impl AppConfig {
864 pub fn load() -> Result<Self, config::ConfigError> {
867 let mut builder = config::Config::builder()
868 .add_source(config::File::with_name("config.default").required(false));
869
870 if let Ok(extra) = std::env::var("CRW_CONFIG") {
872 builder = builder.add_source(config::File::with_name(&extra).required(true));
873 } else {
874 builder = builder.add_source(config::File::with_name("config.local").required(false));
875 }
876
877 let cfg = builder
878 .add_source(
879 config::Environment::with_prefix("CRW")
880 .prefix_separator("_")
881 .separator("__")
882 .try_parsing(true),
883 )
884 .build()?;
885 cfg.try_deserialize()
886 }
887
888 pub fn effective_deadline_ms(
905 &self,
906 requested_deadline_ms: Option<u64>,
907 wait_for_ms: Option<u64>,
908 ) -> u64 {
909 if let Some(explicit) = requested_deadline_ms {
910 return explicit;
911 }
912 let default_ms = self.request.deadline_ms_default;
913 if !self.request.auto_extend_deadline_for_ladder {
914 return default_ms;
915 }
916 if self.renderer.cdp_tier_count() == 0 {
923 return default_ms;
924 }
925 let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
926 const SPA_DEFAULT_MS: u64 = 8_000;
931 let extra = if let Some(w) = wait_for_ms {
937 let bounded = w.min(MAX_WAIT_FOR_MS);
938 let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
939 per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
940 } else {
941 0
942 };
943 default_ms.max(ladder_min.saturating_add(extra))
944 }
945
946 pub fn effective_request_timeout_secs(&self) -> u64 {
959 let baseline = self.server.request_timeout_secs;
960 if !self.request.auto_extend_deadline_for_ladder {
961 return baseline;
962 }
963 const OUTER_BUFFER_SECS: u64 = 5;
964 const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
968 let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
974
975 let conc = (self.crawler.max_concurrency.max(1)) as u64;
979 let max_results = self.search.max_limit as u64;
980 let enrich_batches = max_results.div_ceil(conc);
981 let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
982 let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
983
984 let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
985 let needed_secs = max_handler_ms
986 .div_ceil(1_000)
987 .saturating_add(OUTER_BUFFER_SECS);
988 baseline.max(needed_secs)
989 }
990}
991
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes every test that mutates process environment variables.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// RAII guard for tests that mutate the environment.
    ///
    /// It holds `ENV_LOCK` for its whole lifetime and clears the variables
    /// listed in `clear_renderer_env` both on acquisition and on drop. Cleanup
    /// therefore also runs when a test panics, for example on a failed
    /// `AppConfig::load().unwrap()`.
    struct EnvGuard {
        _lock: std::sync::MutexGuard<'static, ()>,
    }

    impl EnvGuard {
        /// Takes the lock and starts from a clean environment.
        ///
        /// A poisoned lock is recovered instead of unwrapped: the mutex guards
        /// no data, and recovering stops one failing test from cascading into
        /// every later environment test.
        fn acquire() -> Self {
            let lock = ENV_LOCK
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            clear_renderer_env();
            Self { _lock: lock }
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            // Runs before `_lock` is released, so the cleanup is still serialized.
            clear_renderer_env();
        }
    }

    /// Removes every environment variable these tests may set.
    ///
    /// Must only be called while `ENV_LOCK` is held (see `EnvGuard`).
    fn clear_renderer_env() {
        for k in [
            "CRW_RENDERER__MODE",
            "CRW_RENDERER__FORCE_JS",
            "CRW_RENDERER__RENDER_JS_DEFAULT",
            "CRW_RENDERER__LIGHTPANDA__WS_URL",
            "CRW_SERVER__PORT",
        ] {
            // SAFETY: callers hold `ENV_LOCK`, which serializes every
            // environment mutation made by this module.
            // NOTE(review): code outside this module that touches the
            // environment concurrently is not covered by this lock.
            unsafe { std::env::remove_var(k) };
        }
    }

    /// Every lowercase mode name parses to its variant.
    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    /// An unknown mode name is a parse error.
    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            // Only deserialized, never read.
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    /// The `force_js` key is accepted as an alias of `render_js_default`.
    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let _env = EnvGuard::acquire();
        // SAFETY: `_env` holds `ENV_LOCK` for the rest of this test.
        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let _env = EnvGuard::acquire();
        // SAFETY: `_env` holds `ENV_LOCK` for the rest of this test.
        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let _env = EnvGuard::acquire();
        // SAFETY: `_env` holds `ENV_LOCK` for the rest of this test.
        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
        let cfg = AppConfig::load().unwrap();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
        assert!(r.auto_extend_deadline_for_ladder);
    }

    #[test]
    fn default_app_config_enables_auto_extend() {
        let cfg = AppConfig::default();
        assert!(cfg.request.auto_extend_deadline_for_ladder);
        assert_eq!(cfg.request.deadline_ms_default, 8000);
    }

    /// Renderer in `Chrome` mode with only the Chrome endpoint configured and
    /// both the page and Chrome timeouts set to `chrome_ms`.
    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
        RendererConfig {
            mode: RendererMode::Chrome,
            page_timeout_ms: chrome_ms,
            chrome_timeout_ms: Some(chrome_ms),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        }
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_chrome_only() {
        let r = renderer_with_chrome_only(30_000);
        // HTTP tier (page-timeout fallback) + Chrome tier + one tier overhead.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            30_000 + 30_000 + 28_000
        );
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_auto_three_tiers() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            lightpanda_timeout_ms: Some(2_500),
            chrome_timeout_ms: Some(30_000),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        };
        // HTTP + Lightpanda + Chrome, plus overhead for the two CDP tiers.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 2_500 + 30_000 + 2 * 28_000
        );
        assert_eq!(r.cdp_tier_count(), 2);
    }

    #[test]
    fn effective_deadline_explicit_bypasses_auto_extend() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_auto_extend_raises_to_ladder_min() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
        assert!(expected > 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
    }

    #[test]
    fn effective_deadline_default_wins_when_higher_than_ladder() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 1_000_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
    }

    #[test]
    fn effective_deadline_auto_extend_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_extends_for_long_wait_for() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
        let tier_count = cfg.renderer.cdp_tier_count() as u64;
        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
        // 20 000 ms wait minus the 8 000 ms allowance, once per CDP tier.
        assert_eq!(with_wait, base + 12_000 * tier_count);
        // A wait below the allowance adds nothing.
        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
    }

    #[test]
    fn effective_request_timeout_covers_map_ceiling() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.crawler.max_concurrency = 10;
        cfg.search.max_limit = 20;
        cfg.server.request_timeout_secs = 60;
        // 300 s ceiling + 5 s buffer.
        assert!(cfg.effective_request_timeout_secs() >= 305);
    }

    #[test]
    fn effective_request_timeout_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.server.request_timeout_secs = 60;
        assert_eq!(cfg.effective_request_timeout_secs(), 60);
    }

    #[test]
    fn effective_request_timeout_respects_operator_override() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.server.request_timeout_secs = 600;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_request_timeout_secs(), 600);
    }

    #[test]
    fn effective_request_timeout_search_sequential_batching() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.search.max_limit = 20;
        cfg.crawler.max_concurrency = 1;
        cfg.server.request_timeout_secs = 60;
        let secs = cfg.effective_request_timeout_secs();
        // With concurrency 1 every one of the 20 results is its own batch.
        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
        let expected_search_ms = 15_000 + 20 * scrape_ms;
        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
        assert_eq!(secs, 60u64.max(expected_secs));
    }

    #[test]
    #[cfg(not(feature = "cdp"))]
    fn cdp_tier_count_zero_without_cdp_feature() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            chrome_timeout_ms: Some(30_000),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 0);
        // Only the HTTP tier (page-timeout fallback) is counted.
        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
    }

    #[test]
    fn effective_deadline_skipped_for_http_only_mode() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 30_000,
            lightpanda: None,
            playwright: None,
            chrome: None,
            ..Default::default()
        };
        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_playwright_only() {
        let r = RendererConfig {
            mode: RendererMode::Playwright,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            chrome_timeout_ms: Some(30_000),
            playwright: Some(CdpEndpoint {
                ws_url: "ws://playwright:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 1);
        // The Playwright tier is budgeted with the Chrome timeout.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 30_000 + 28_000
        );
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let _env = EnvGuard::acquire();
        // SAFETY: `_env` holds `ENV_LOCK` for the rest of this test.
        unsafe {
            std::env::set_var("CRW_SERVER__PORT", "4444");
            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        }
        let cfg = AppConfig::load().unwrap();

        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }
}