1use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize, Default)]
4pub struct AppConfig {
5 #[serde(default)]
6 pub server: ServerConfig,
7 #[serde(default)]
8 pub renderer: RendererConfig,
9 #[serde(default)]
10 pub crawler: CrawlerConfig,
11 #[serde(default)]
12 pub extraction: ExtractionConfig,
13 #[serde(default)]
14 pub auth: AuthConfig,
15 #[serde(default)]
16 pub request: RequestConfig,
17 #[serde(default)]
18 pub search: SearchConfig,
19}
20
/// Overhead, in milliseconds, that `RendererConfig::min_deadline_for_full_ladder_ms`
/// adds once for every enabled CDP tier.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28 * 1_000;

/// Largest `wait_for_ms`, in milliseconds, that `AppConfig::effective_deadline_ms`
/// takes into account; larger values are clamped to this bound.
pub const MAX_WAIT_FOR_MS: u64 = 60 * 1_000;
39
40#[derive(Debug, Clone, Deserialize)]
46pub struct SearchConfig {
47 #[serde(default = "default_true_search")]
50 pub enabled: bool,
51 #[serde(default)]
54 pub searxng_url: Option<String>,
55 #[serde(default = "default_search_timeout_ms")]
57 pub timeout_ms: u64,
58 #[serde(default = "default_search_limit")]
60 pub default_limit: u32,
61 #[serde(default = "default_search_max_limit")]
63 pub max_limit: u32,
64 #[serde(default = "default_research_engines")]
67 pub research_engines: Vec<String>,
68 #[serde(default = "default_github_engines")]
70 pub github_engines: Vec<String>,
71}
72
73impl Default for SearchConfig {
74 fn default() -> Self {
75 Self {
76 enabled: true,
77 searxng_url: None,
78 timeout_ms: default_search_timeout_ms(),
79 default_limit: default_search_limit(),
80 max_limit: default_search_max_limit(),
81 research_engines: default_research_engines(),
82 github_engines: default_github_engines(),
83 }
84 }
85}
86
/// Serde default for `SearchConfig::enabled`: always `true`.
fn default_true_search() -> bool {
    const ENABLED_BY_DEFAULT: bool = true;
    ENABLED_BY_DEFAULT
}
/// Serde default for `SearchConfig::timeout_ms`: 15 seconds, in milliseconds.
fn default_search_timeout_ms() -> u64 {
    15 * 1_000
}
/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    const LIMIT: u32 = 5;
    LIMIT
}
/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    const MAX_LIMIT: u32 = 20;
    MAX_LIMIT
}
/// Serde default for `SearchConfig::research_engines`, in this fixed order.
fn default_research_engines() -> Vec<String> {
    const ENGINES: [&str; 4] = ["arxiv", "crossref", "google scholar", "semantic scholar"];
    ENGINES.iter().copied().map(String::from).collect()
}
/// Serde default for `SearchConfig::github_engines`: the single engine `github`.
fn default_github_engines() -> Vec<String> {
    vec![String::from("github")]
}
110
111#[derive(Debug, Clone, Deserialize)]
115pub struct RequestConfig {
116 #[serde(default = "default_deadline_ms")]
121 pub deadline_ms_default: u64,
122 #[serde(default = "default_true_request")]
132 pub auto_extend_deadline_for_ladder: bool,
133}
134
135impl Default for RequestConfig {
136 fn default() -> Self {
137 Self {
138 deadline_ms_default: default_deadline_ms(),
139 auto_extend_deadline_for_ladder: true,
140 }
141 }
142}
143
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`: always `true`.
fn default_true_request() -> bool {
    const AUTO_EXTEND_BY_DEFAULT: bool = true;
    AUTO_EXTEND_BY_DEFAULT
}
147
/// Serde default for `RequestConfig::deadline_ms_default`: 8 seconds, in milliseconds.
fn default_deadline_ms() -> u64 {
    8 * 1_000
}
151
152#[derive(Debug, Clone, Deserialize)]
153pub struct ServerConfig {
154 #[serde(default = "default_host")]
155 pub host: String,
156 #[serde(default = "default_port")]
157 pub port: u16,
158 #[serde(default = "default_request_timeout")]
159 pub request_timeout_secs: u64,
160 #[serde(default = "default_rate_limit_rps")]
162 pub rate_limit_rps: u64,
163}
164
165impl Default for ServerConfig {
166 fn default() -> Self {
167 Self {
168 host: default_host(),
169 port: default_port(),
170 request_timeout_secs: default_request_timeout(),
171 rate_limit_rps: default_rate_limit_rps(),
172 }
173 }
174}
175
/// Serde default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    const RPS: u64 = 10;
    RPS
}
179
/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    3_000
}
/// Serde default for `ServerConfig::request_timeout_secs`, in seconds.
fn default_request_timeout() -> u64 {
    const SECS: u64 = 60;
    SECS
}
189
190#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
200#[serde(rename_all = "lowercase")]
201pub enum RendererMode {
202 #[default]
203 Auto,
204 None,
205 Lightpanda,
206 Chrome,
207 Playwright,
208}
209
210#[derive(Debug, Clone, Deserialize)]
211pub struct RendererConfig {
212 #[serde(default)]
213 pub mode: RendererMode,
214 #[serde(default = "default_page_timeout")]
218 pub page_timeout_ms: u64,
219 #[serde(default)]
224 pub http_timeout_ms: Option<u64>,
225 #[serde(default)]
230 pub lightpanda_timeout_ms: Option<u64>,
231 #[serde(default)]
235 pub chrome_timeout_ms: Option<u64>,
236 #[serde(default = "default_pool_size")]
237 pub pool_size: usize,
238 #[serde(default, alias = "force_js")]
243 pub render_js_default: Option<bool>,
244 #[serde(default)]
245 pub lightpanda: Option<CdpEndpoint>,
246 #[serde(default)]
247 pub playwright: Option<CdpEndpoint>,
248 #[serde(default)]
249 pub chrome: Option<CdpEndpoint>,
250 #[serde(default)]
254 pub chrome_intercept_resources: bool,
255 #[serde(default)]
259 pub chrome_intercept_stylesheets: bool,
260 #[serde(default)]
263 pub chrome_host_intercept_disable: Vec<String>,
264 #[serde(default = "default_chrome_nav_budget_ms")]
269 pub chrome_nav_budget_ms: u64,
270 #[serde(default)]
274 pub chrome_context_pool_enabled: bool,
275 #[serde(default)]
279 pub use_predictor: bool,
280 #[serde(default)]
283 pub escalation: EscalationConfig,
284 #[serde(default)]
286 pub antibot: AntibotConfig,
287}
288
289#[derive(Debug, Clone, Deserialize)]
292pub struct EscalationConfig {
293 #[serde(default)]
295 pub enabled: bool,
296 #[serde(default = "default_waterfall_timeout_ms")]
300 pub waterfall_timeout_ms: u64,
301 #[serde(default = "default_escalation_global_timeout_ms")]
303 pub global_timeout_ms: u64,
304 #[serde(default)]
307 pub residential_proxy: bool,
308 #[serde(default = "default_proxy_country")]
310 pub proxy_country: String,
311}
312
313impl Default for EscalationConfig {
314 fn default() -> Self {
315 Self {
316 enabled: false,
317 waterfall_timeout_ms: default_waterfall_timeout_ms(),
318 global_timeout_ms: default_escalation_global_timeout_ms(),
319 residential_proxy: false,
320 proxy_country: default_proxy_country(),
321 }
322 }
323}
324
/// Serde default for `EscalationConfig::waterfall_timeout_ms`: 8 seconds, in milliseconds.
fn default_waterfall_timeout_ms() -> u64 {
    8 * 1_000
}
/// Serde default for `EscalationConfig::global_timeout_ms`: 60 seconds, in milliseconds.
fn default_escalation_global_timeout_ms() -> u64 {
    60 * 1_000
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
334
335#[derive(Debug, Clone, Deserialize)]
338pub struct AntibotConfig {
339 #[serde(default = "default_true")]
341 pub enabled: bool,
342 #[serde(default)]
345 pub escalate_on_signal: bool,
346}
347
348impl Default for AntibotConfig {
349 fn default() -> Self {
350 Self {
351 enabled: true,
352 escalate_on_signal: false,
353 }
354 }
355}
356
/// Serde default for `RendererConfig::chrome_nav_budget_ms`: 12 seconds, in milliseconds.
fn default_chrome_nav_budget_ms() -> u64 {
    12 * 1_000
}
360
361impl Default for RendererConfig {
362 fn default() -> Self {
363 Self {
364 mode: RendererMode::default(),
365 page_timeout_ms: default_page_timeout(),
366 http_timeout_ms: None,
367 lightpanda_timeout_ms: None,
368 chrome_timeout_ms: None,
369 pool_size: default_pool_size(),
370 render_js_default: None,
371 lightpanda: None,
372 playwright: None,
373 chrome: None,
374 chrome_intercept_resources: false,
375 chrome_intercept_stylesheets: false,
376 chrome_host_intercept_disable: Vec::new(),
377 chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
378 chrome_context_pool_enabled: false,
379 use_predictor: false,
380 escalation: EscalationConfig::default(),
381 antibot: AntibotConfig::default(),
382 }
383 }
384}
/// Serde default for `RendererConfig::page_timeout_ms`: 30 seconds, in milliseconds.
fn default_page_timeout() -> u64 {
    30 * 1_000
}
388
389impl RendererConfig {
390 pub fn http_timeout(&self) -> u64 {
400 self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
401 }
402 pub fn lightpanda_timeout(&self) -> u64 {
403 self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
404 }
405 pub fn chrome_timeout(&self) -> u64 {
406 self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
407 }
408
409 pub fn cdp_tier_count(&self) -> usize {
418 if !cfg!(feature = "cdp") {
419 return 0;
420 }
421 let want =
422 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
423 let mut n = 0;
424 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
425 n += 1;
426 }
427 if want(RendererMode::Playwright) && self.playwright.is_some() {
428 n += 1;
429 }
430 if want(RendererMode::Chrome) && self.chrome.is_some() {
431 n += 1;
432 }
433 n
434 }
435
436 pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
442 let want =
443 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
444
445 let mut sum: u64 = 0;
446 if !matches!(self.mode, RendererMode::None) {
450 sum = sum.saturating_add(self.http_timeout());
451 }
452
453 if !cfg!(feature = "cdp") {
457 return sum;
458 }
459
460 let mut cdp_tier_count: u64 = 0;
461 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
462 sum = sum.saturating_add(self.lightpanda_timeout());
463 cdp_tier_count += 1;
464 }
465 if want(RendererMode::Playwright) && self.playwright.is_some() {
466 sum = sum.saturating_add(self.chrome_timeout());
467 cdp_tier_count += 1;
468 }
469 if want(RendererMode::Chrome) && self.chrome.is_some() {
470 sum = sum.saturating_add(self.chrome_timeout());
471 cdp_tier_count += 1;
472 }
473 sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
474 }
475}
/// Serde default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    const POOL_SIZE: usize = 4;
    POOL_SIZE
}
479
480#[derive(Debug, Clone, Deserialize)]
481pub struct CdpEndpoint {
482 pub ws_url: String,
483}
484
485#[derive(Debug, Clone, Deserialize)]
487pub struct StealthConfig {
488 #[serde(default)]
490 pub enabled: bool,
491 #[serde(default)]
493 pub user_agents: Vec<String>,
494 #[serde(default = "default_jitter")]
496 pub jitter_factor: f64,
497 #[serde(default = "default_true")]
499 pub inject_headers: bool,
500}
501
502impl Default for StealthConfig {
503 fn default() -> Self {
504 Self {
505 enabled: false,
506 user_agents: vec![],
507 jitter_factor: default_jitter(),
508 inject_headers: true,
509 }
510 }
511}
512
/// Serde default for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    const JITTER: f64 = 0.2;
    JITTER
}
516
/// Built-in pool of five browser user-agent strings: Chrome 131 on Windows,
/// macOS and Linux, Firefox 133 on Windows, and Safari 18.2 on macOS.
pub const BUILTIN_UA_POOL: &[&str] = &[
    // Chrome 131, Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131, macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131, Linux
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133, Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.2, macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
525
526#[derive(Debug, Clone, Deserialize)]
527pub struct CrawlerConfig {
528 #[serde(default = "default_concurrency")]
529 pub max_concurrency: usize,
530 #[serde(default = "default_rps")]
531 pub requests_per_second: f64,
532 #[serde(default = "default_true")]
533 pub respect_robots_txt: bool,
534 #[serde(default = "default_ua")]
535 pub user_agent: String,
536 #[serde(default = "default_depth")]
537 pub default_max_depth: u32,
538 #[serde(default = "default_max_pages")]
539 pub default_max_pages: u32,
540 #[serde(default)]
543 pub proxy: Option<String>,
544 #[serde(default = "default_job_ttl")]
546 pub job_ttl_secs: u64,
547 #[serde(default)]
548 pub stealth: StealthConfig,
549 #[serde(default)]
554 pub per_host_min_interval_ms: u64,
555 #[serde(default = "default_per_host_max_concurrent")]
559 pub per_host_max_concurrent: u32,
560}
561
/// Serde default for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    const PER_HOST: u32 = 1;
    PER_HOST
}
565
566impl Default for CrawlerConfig {
567 fn default() -> Self {
568 Self {
569 max_concurrency: default_concurrency(),
570 requests_per_second: default_rps(),
571 respect_robots_txt: true,
572 user_agent: default_ua(),
573 default_max_depth: default_depth(),
574 default_max_pages: default_max_pages(),
575 proxy: None,
576 job_ttl_secs: default_job_ttl(),
577 stealth: StealthConfig::default(),
578 per_host_min_interval_ms: 0,
579 per_host_max_concurrent: default_per_host_max_concurrent(),
580 }
581 }
582}
583
/// Serde default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    const CONCURRENCY: usize = 10;
    CONCURRENCY
}
/// Serde default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    const RPS: f64 = 10.0;
    RPS
}
/// Shared serde default for boolean fields that are on unless configured
/// otherwise: always `true`.
fn default_true() -> bool {
    const ON: bool = true;
    ON
}
/// Serde default for `CrawlerConfig::user_agent`: a Chrome 131 on macOS
/// user-agent string.
fn default_ua() -> String {
    String::from(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    )
}
/// Serde default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    const DEPTH: u32 = 2;
    DEPTH
}
/// Serde default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    const PAGES: u32 = 100;
    PAGES
}
/// Serde default for `CrawlerConfig::job_ttl_secs`: one hour, in seconds.
fn default_job_ttl() -> u64 {
    60 * 60
}
610
611#[derive(Debug, Clone, Deserialize)]
612pub struct ExtractionConfig {
613 #[serde(default = "default_format")]
614 pub default_format: String,
615 #[serde(default = "default_true_ext")]
616 pub only_main_content: bool,
617 #[serde(default)]
618 pub llm: Option<LlmConfig>,
619 #[serde(default)]
622 pub domain_selectors: std::collections::HashMap<String, String>,
623 #[serde(default)]
624 pub llm_fallback: LlmFallbackConfig,
625 #[serde(default = "default_http_retry_threshold")]
628 pub http_retry_threshold_bytes: usize,
629 #[serde(default = "default_lightpanda_retry_threshold")]
633 pub lightpanda_retry_threshold_bytes: usize,
634}
635
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    const BYTES: usize = 100;
    BYTES
}
639
/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    2 * 1_000
}
643
644impl Default for ExtractionConfig {
645 fn default() -> Self {
646 Self {
647 default_format: default_format(),
648 only_main_content: true,
649 llm: None,
650 domain_selectors: std::collections::HashMap::new(),
651 llm_fallback: LlmFallbackConfig::default(),
652 http_retry_threshold_bytes: default_http_retry_threshold(),
653 lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
654 }
655 }
656}
657
658#[derive(Debug, Clone, Deserialize)]
659pub struct LlmFallbackConfig {
660 #[serde(default)]
661 pub enable: bool,
662 #[serde(default = "default_llm_quality_threshold")]
663 pub quality_threshold: f32,
664 #[serde(default = "default_llm_max_html_bytes")]
665 pub max_html_bytes: usize,
666 #[serde(default)]
671 pub always_run: bool,
672}
673
674impl Default for LlmFallbackConfig {
675 fn default() -> Self {
676 Self {
677 enable: false,
678 quality_threshold: default_llm_quality_threshold(),
679 max_html_bytes: default_llm_max_html_bytes(),
680 always_run: false,
681 }
682 }
683}
684
/// Serde default for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    const THRESHOLD: f32 = 0.3;
    THRESHOLD
}
/// Serde default for `LlmFallbackConfig::max_html_bytes`.
fn default_llm_max_html_bytes() -> usize {
    100 * 1_000
}
691
692#[derive(Debug, Clone, Deserialize)]
693pub struct LlmConfig {
694 #[serde(default = "default_llm_provider")]
695 pub provider: String,
696 pub api_key: String,
697 #[serde(default = "default_llm_model")]
698 pub model: String,
699 #[serde(default)]
700 pub base_url: Option<String>,
701 #[serde(default = "default_llm_max_tokens")]
702 pub max_tokens: u32,
703 #[serde(default)]
706 pub azure_api_version: Option<String>,
707}
708
/// Serde default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Serde default for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Serde default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4 * 1_024
}
718
/// Serde default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default for `ExtractionConfig::only_main_content`: always `true`.
fn default_true_ext() -> bool {
    const MAIN_CONTENT_ONLY: bool = true;
    MAIN_CONTENT_ONLY
}
725
726#[derive(Debug, Clone, Default, Deserialize)]
727pub struct AuthConfig {
728 #[serde(default)]
729 pub api_keys: Vec<String>,
730}
731
732impl AppConfig {
733 pub fn load() -> Result<Self, config::ConfigError> {
736 let mut builder = config::Config::builder()
737 .add_source(config::File::with_name("config.default").required(false));
738
739 if let Ok(extra) = std::env::var("CRW_CONFIG") {
741 builder = builder.add_source(config::File::with_name(&extra).required(true));
742 } else {
743 builder = builder.add_source(config::File::with_name("config.local").required(false));
744 }
745
746 let cfg = builder
747 .add_source(
748 config::Environment::with_prefix("CRW")
749 .prefix_separator("_")
750 .separator("__")
751 .try_parsing(true),
752 )
753 .build()?;
754 cfg.try_deserialize()
755 }
756
757 pub fn effective_deadline_ms(
774 &self,
775 requested_deadline_ms: Option<u64>,
776 wait_for_ms: Option<u64>,
777 ) -> u64 {
778 if let Some(explicit) = requested_deadline_ms {
779 return explicit;
780 }
781 let default_ms = self.request.deadline_ms_default;
782 if !self.request.auto_extend_deadline_for_ladder {
783 return default_ms;
784 }
785 if self.renderer.cdp_tier_count() == 0 {
792 return default_ms;
793 }
794 let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
795 const SPA_DEFAULT_MS: u64 = 8_000;
800 let extra = if let Some(w) = wait_for_ms {
806 let bounded = w.min(MAX_WAIT_FOR_MS);
807 let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
808 per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
809 } else {
810 0
811 };
812 default_ms.max(ladder_min.saturating_add(extra))
813 }
814
815 pub fn effective_request_timeout_secs(&self) -> u64 {
828 let baseline = self.server.request_timeout_secs;
829 if !self.request.auto_extend_deadline_for_ladder {
830 return baseline;
831 }
832 const OUTER_BUFFER_SECS: u64 = 5;
833 const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
837 let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
843
844 let conc = (self.crawler.max_concurrency.max(1)) as u64;
848 let max_results = self.search.max_limit as u64;
849 let enrich_batches = max_results.div_ceil(conc);
850 let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
851 let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
852
853 let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
854 let needed_secs = max_handler_ms
855 .div_ceil(1_000)
856 .saturating_add(OUTER_BUFFER_SECS);
857 baseline.max(needed_secs)
858 }
859}
860
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes the tests that mutate process-wide environment variables.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Acquires [`ENV_LOCK`], recovering the guard when an earlier test
    /// panicked while holding it.
    ///
    /// The mutex protects no data, so a poisoned lock carries no broken
    /// invariant. Recovering keeps one failing test from failing every other
    /// environment test with a `PoisonError`.
    fn lock_env() -> std::sync::MutexGuard<'static, ()> {
        ENV_LOCK
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Removes every environment variable these tests may set.
    fn clear_renderer_env() {
        for k in [
            "CRW_RENDERER__MODE",
            "CRW_RENDERER__FORCE_JS",
            "CRW_RENDERER__RENDER_JS_DEFAULT",
            "CRW_RENDERER__LIGHTPANDA__WS_URL",
            "CRW_SERVER__PORT",
        ] {
            // SAFETY: every caller in this module holds `ENV_LOCK`, so no
            // other test here touches the environment at the same time.
            unsafe { std::env::remove_var(k) };
        }
    }

    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            // Only deserialized, never read.
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held, see `clear_renderer_env`.
        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held, see `clear_renderer_env`.
        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held, see `clear_renderer_env`.
        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
        assert!(r.auto_extend_deadline_for_ladder);
    }

    #[test]
    fn default_app_config_enables_auto_extend() {
        let cfg = AppConfig::default();
        assert!(cfg.request.auto_extend_deadline_for_ladder);
        assert_eq!(cfg.request.deadline_ms_default, 8000);
    }

    /// Renderer in `Chrome` mode with only a Chrome endpoint configured and
    /// both the page and Chrome timeouts set to `chrome_ms`.
    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
        RendererConfig {
            mode: RendererMode::Chrome,
            page_timeout_ms: chrome_ms,
            chrome_timeout_ms: Some(chrome_ms),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        }
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_chrome_only() {
        let r = renderer_with_chrome_only(30_000);
        // HTTP timeout + Chrome timeout + one tier of overhead.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            30_000 + 30_000 + 28_000
        );
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_auto_three_tiers() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            lightpanda_timeout_ms: Some(2_500),
            chrome_timeout_ms: Some(30_000),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        };
        // HTTP + Lightpanda + Chrome timeouts + two tiers of overhead.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 2_500 + 30_000 + 2 * 28_000
        );
        assert_eq!(r.cdp_tier_count(), 2);
    }

    #[test]
    fn effective_deadline_explicit_bypasses_auto_extend() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_auto_extend_raises_to_ladder_min() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
        assert!(expected > 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
    }

    #[test]
    fn effective_deadline_default_wins_when_higher_than_ladder() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 1_000_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
    }

    #[test]
    fn effective_deadline_auto_extend_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_extends_for_long_wait_for() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
        let tier_count = cfg.renderer.cdp_tier_count() as u64;
        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
        // 20_000 ms wait minus the 8_000 ms allowance, once per tier.
        assert_eq!(with_wait, base + 12_000 * tier_count);
        // A wait below the allowance adds nothing.
        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
    }

    #[test]
    fn effective_request_timeout_covers_map_ceiling() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.crawler.max_concurrency = 10;
        cfg.search.max_limit = 20;
        cfg.server.request_timeout_secs = 60;
        // 300 s ceiling + 5 s buffer.
        assert!(cfg.effective_request_timeout_secs() >= 305);
    }

    #[test]
    fn effective_request_timeout_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.server.request_timeout_secs = 60;
        assert_eq!(cfg.effective_request_timeout_secs(), 60);
    }

    #[test]
    fn effective_request_timeout_respects_operator_override() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.server.request_timeout_secs = 600;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_request_timeout_secs(), 600);
    }

    #[test]
    fn effective_request_timeout_search_sequential_batching() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.search.max_limit = 20;
        cfg.crawler.max_concurrency = 1;
        cfg.server.request_timeout_secs = 60;
        let secs = cfg.effective_request_timeout_secs();
        // With a concurrency of 1, all 20 results are scraped one after another.
        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
        let expected_search_ms = 15_000 + 20 * scrape_ms;
        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
        assert_eq!(secs, 60u64.max(expected_secs));
    }

    #[test]
    #[cfg(not(feature = "cdp"))]
    fn cdp_tier_count_zero_without_cdp_feature() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            chrome_timeout_ms: Some(30_000),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 0);
        // Only the HTTP tier (falling back to the page timeout) is counted.
        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
    }

    #[test]
    fn effective_deadline_skipped_for_http_only_mode() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 30_000,
            lightpanda: None,
            playwright: None,
            chrome: None,
            ..Default::default()
        };
        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_playwright_only() {
        let r = RendererConfig {
            mode: RendererMode::Playwright,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            chrome_timeout_ms: Some(30_000),
            playwright: Some(CdpEndpoint {
                ws_url: "ws://playwright:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 1);
        // The Playwright tier is budgeted with the Chrome timeout.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 30_000 + 28_000
        );
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held, see `clear_renderer_env`.
        unsafe {
            std::env::set_var("CRW_SERVER__PORT", "4444");
            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        }
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();

        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }
}