1use serde::Deserialize;
2
/// Top-level application configuration, produced by [`AppConfig::load`].
///
/// Every section is optional in the input: a missing section falls back to
/// that section type's `Default` implementation (`#[serde(default)]`).
#[derive(Debug, Clone, Deserialize, Default)]
pub struct AppConfig {
    /// Server settings; `request_timeout_secs` is the baseline used by
    /// [`AppConfig::effective_request_timeout_secs`].
    #[serde(default)]
    pub server: ServerConfig,
    /// Renderer settings (mode, per-tier timeouts, CDP endpoints).
    #[serde(default)]
    pub renderer: RendererConfig,
    /// Crawler settings.
    #[serde(default)]
    pub crawler: CrawlerConfig,
    /// Extraction settings.
    #[serde(default)]
    pub extraction: ExtractionConfig,
    /// Authentication settings (API keys).
    #[serde(default)]
    pub auth: AuthConfig,
    /// Per-request deadline settings, read by [`AppConfig::effective_deadline_ms`].
    #[serde(default)]
    pub request: RequestConfig,
    /// Search settings.
    #[serde(default)]
    pub search: SearchConfig,
    /// Map settings.
    #[serde(default)]
    pub map: MapConfig,
}
22
/// Configuration section stored in [`AppConfig::map`].
#[derive(Debug, Clone, Deserialize, Default)]
pub struct MapConfig {
    /// URL filter settings; defaults to `MapUrlFilterConfig::default()` when absent.
    #[serde(default)]
    pub url_filter: MapUrlFilterConfig,
}
29
/// URL filter switches and extra parameter lists for the map section.
///
/// The code that consumes these fields is not in this file, so the notes
/// below record only the defaults visible here.
#[derive(Debug, Clone, Deserialize)]
pub struct MapUrlFilterConfig {
    /// Defaults to `true`.
    #[serde(default = "default_true_filter")]
    pub strip_tracking_params: bool,
    /// Defaults to `true`.
    #[serde(default = "default_true_filter")]
    pub drop_action_urls: bool,
    /// Defaults to `false`.
    #[serde(default)]
    pub gov_tld_drop_actions: bool,
    /// Defaults to an empty list.
    // NOTE(review): presumably appended to a built-in tracking-parameter list — confirm against the consumer.
    #[serde(default)]
    pub extra_tracking_params: Vec<String>,
    /// Defaults to an empty list.
    #[serde(default)]
    pub extra_action_params: Vec<String>,
    /// Defaults to an empty list.
    #[serde(default)]
    pub extra_preserve_params: Vec<String>,
}
55
56impl Default for MapUrlFilterConfig {
57 fn default() -> Self {
58 Self {
59 strip_tracking_params: true,
60 drop_action_urls: true,
61 gov_tld_drop_actions: false,
62 extra_tracking_params: Vec::new(),
63 extra_action_params: Vec::new(),
64 extra_preserve_params: Vec::new(),
65 }
66 }
67}
68
/// Serde default for the `MapUrlFilterConfig` switches that are on by default.
fn default_true_filter() -> bool {
    const ON: bool = true;
    ON
}
72
/// Overhead in milliseconds that `RendererConfig::min_deadline_for_full_ladder_ms`
/// adds once per active CDP tier.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28 * 1_000;
83
/// Upper bound in milliseconds applied to a request's `wait_for_ms` when
/// `AppConfig::effective_deadline_ms` computes the deadline extension.
pub const MAX_WAIT_FOR_MS: u64 = 60 * 1_000;
91
/// Search section of the configuration.
///
/// In this file only `timeout_ms` and `max_limit` are read, by
/// `AppConfig::effective_request_timeout_secs`.
#[derive(Debug, Clone, Deserialize)]
pub struct SearchConfig {
    /// Defaults to `true`.
    #[serde(default = "default_true_search")]
    pub enabled: bool,
    /// Defaults to `None`.
    // NOTE(review): presumably the base URL of a SearXNG instance — confirm against the consumer.
    #[serde(default)]
    pub searxng_url: Option<String>,
    /// Search timeout in milliseconds; defaults to 15 000.
    #[serde(default = "default_search_timeout_ms")]
    pub timeout_ms: u64,
    /// Defaults to 5.
    #[serde(default = "default_search_limit")]
    pub default_limit: u32,
    /// Defaults to 20. Used as the worst-case result count when sizing the
    /// outer request timeout.
    #[serde(default = "default_search_max_limit")]
    pub max_limit: u32,
    /// Defaults to `arxiv`, `crossref`, `google scholar`, `semantic scholar`.
    #[serde(default = "default_research_engines")]
    pub research_engines: Vec<String>,
    /// Defaults to `github`.
    #[serde(default = "default_github_engines")]
    pub github_engines: Vec<String>,
}
124
125impl Default for SearchConfig {
126 fn default() -> Self {
127 Self {
128 enabled: true,
129 searxng_url: None,
130 timeout_ms: default_search_timeout_ms(),
131 default_limit: default_search_limit(),
132 max_limit: default_search_max_limit(),
133 research_engines: default_research_engines(),
134 github_engines: default_github_engines(),
135 }
136 }
137}
138
/// Serde default for `SearchConfig::enabled`.
fn default_true_search() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Serde default for `SearchConfig::timeout_ms` (15 seconds, in milliseconds).
fn default_search_timeout_ms() -> u64 {
    const SECONDS: u64 = 15;
    SECONDS * 1_000
}
/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    const LIMIT: u32 = 5;
    LIMIT
}
/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    const MAX_LIMIT: u32 = 20;
    MAX_LIMIT
}
/// Serde default for `SearchConfig::research_engines`.
fn default_research_engines() -> Vec<String> {
    ["arxiv", "crossref", "google scholar", "semantic scholar"]
        .iter()
        .map(|engine| engine.to_string())
        .collect()
}
/// Serde default for `SearchConfig::github_engines`.
fn default_github_engines() -> Vec<String> {
    let mut engines = Vec::with_capacity(1);
    engines.push(String::from("github"));
    engines
}
162
/// Per-request deadline settings, read by `AppConfig::effective_deadline_ms`
/// and `AppConfig::effective_request_timeout_secs`.
#[derive(Debug, Clone, Deserialize)]
pub struct RequestConfig {
    /// Deadline in milliseconds used when a request supplies none; defaults to 8000.
    #[serde(default = "default_deadline_ms")]
    pub deadline_ms_default: u64,
    /// When `true` (the default), the default deadline and the outer request
    /// timeout may be raised to cover the renderer ladder. When `false`, both
    /// functions return their configured baseline unchanged.
    #[serde(default = "default_true_request")]
    pub auto_extend_deadline_for_ladder: bool,
}
186
187impl Default for RequestConfig {
188 fn default() -> Self {
189 Self {
190 deadline_ms_default: default_deadline_ms(),
191 auto_extend_deadline_for_ladder: true,
192 }
193 }
194}
195
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    const AUTO_EXTEND: bool = true;
    AUTO_EXTEND
}
199
/// Serde default for `RequestConfig::deadline_ms_default` (8 seconds, in milliseconds).
fn default_deadline_ms() -> u64 {
    const SECONDS: u64 = 8;
    SECONDS * 1_000
}
203
/// Server section of the configuration.
#[derive(Debug, Clone, Deserialize)]
pub struct ServerConfig {
    /// Defaults to `"0.0.0.0"`.
    #[serde(default = "default_host")]
    pub host: String,
    /// Defaults to 3000.
    #[serde(default = "default_port")]
    pub port: u16,
    /// Baseline request timeout in seconds; defaults to 60.
    /// `AppConfig::effective_request_timeout_secs` never returns less than this.
    #[serde(default = "default_request_timeout")]
    pub request_timeout_secs: u64,
    /// Defaults to 10.
    #[serde(default = "default_rate_limit_rps")]
    pub rate_limit_rps: u64,
}
216
217impl Default for ServerConfig {
218 fn default() -> Self {
219 Self {
220 host: default_host(),
221 port: default_port(),
222 request_timeout_secs: default_request_timeout(),
223 rate_limit_rps: default_rate_limit_rps(),
224 }
225 }
226}
227
/// Serde default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    const RPS: u64 = 10;
    RPS
}
231
/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    const PORT: u16 = 3000;
    PORT
}
/// Serde default for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    const TIMEOUT_SECS: u64 = 60;
    TIMEOUT_SECS
}
241
/// Selects which renderer tiers are eligible.
///
/// Deserialized from the lowercase variant name (`"auto"`, `"none"`,
/// `"lightpanda"`, `"chrome"`, `"playwright"`); any other value is a parse error.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RendererMode {
    /// Default. Every CDP tier with a configured endpoint counts as active.
    #[default]
    Auto,
    /// No CDP tier is active and the HTTP timeout is left out of the ladder budget.
    None,
    /// Only the Lightpanda tier is eligible.
    Lightpanda,
    /// Only the Chrome tier is eligible.
    Chrome,
    /// Only the Playwright tier is eligible.
    Playwright,
}
261
/// Renderer section of the configuration.
///
/// Only the mode, timeouts and endpoint fields are read in this file (by the
/// `RendererConfig` methods). The remaining fields are consumed elsewhere, so
/// their notes record only the defaults.
#[derive(Debug, Clone, Deserialize)]
pub struct RendererConfig {
    /// Tier selection; defaults to `RendererMode::Auto`.
    #[serde(default)]
    pub mode: RendererMode,
    /// Timeout in milliseconds used by any tier without its own override;
    /// defaults to 30 000.
    #[serde(default = "default_page_timeout")]
    pub page_timeout_ms: u64,
    /// HTTP tier timeout override; `None` falls back to `page_timeout_ms`.
    #[serde(default)]
    pub http_timeout_ms: Option<u64>,
    /// Lightpanda tier timeout override; `None` falls back to `page_timeout_ms`.
    #[serde(default)]
    pub lightpanda_timeout_ms: Option<u64>,
    /// Chrome tier timeout override; `None` falls back to `page_timeout_ms`.
    /// The ladder budget also applies this value to the Playwright tier.
    #[serde(default)]
    pub chrome_timeout_ms: Option<u64>,
    /// Defaults to 4.
    #[serde(default = "default_pool_size")]
    pub pool_size: usize,
    /// Defaults to `None`. Also accepted under the key `force_js`.
    #[serde(default, alias = "force_js")]
    pub render_js_default: Option<bool>,
    /// Lightpanda endpoint; the tier counts as configured only when `Some`.
    #[serde(default)]
    pub lightpanda: Option<CdpEndpoint>,
    /// Playwright endpoint; the tier counts as configured only when `Some`.
    #[serde(default)]
    pub playwright: Option<CdpEndpoint>,
    /// Chrome endpoint; the tier counts as configured only when `Some`.
    #[serde(default)]
    pub chrome: Option<CdpEndpoint>,
    /// Defaults to `false`.
    #[serde(default)]
    pub chrome_intercept_resources: bool,
    /// Defaults to `false`.
    #[serde(default)]
    pub chrome_intercept_stylesheets: bool,
    /// Defaults to an empty list.
    // NOTE(review): presumably hosts for which interception is skipped — confirm against the consumer.
    #[serde(default)]
    pub chrome_host_intercept_disable: Vec<String>,
    /// Defaults to 12 000.
    #[serde(default = "default_chrome_nav_budget_ms")]
    pub chrome_nav_budget_ms: u64,
    /// Defaults to `false`.
    #[serde(default)]
    pub chrome_context_pool_enabled: bool,
    /// Defaults to `false`.
    #[serde(default)]
    pub use_predictor: bool,
    /// Escalation settings; defaults to `EscalationConfig::default()`.
    #[serde(default)]
    pub escalation: EscalationConfig,
    /// Anti-bot settings; defaults to `AntibotConfig::default()`.
    #[serde(default)]
    pub antibot: AntibotConfig,
}
340
/// Escalation sub-section of the renderer configuration.
///
/// The consuming code is not in this file, so the notes below record only
/// the defaults.
#[derive(Debug, Clone, Deserialize)]
pub struct EscalationConfig {
    /// Defaults to `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Milliseconds; defaults to 8000.
    #[serde(default = "default_waterfall_timeout_ms")]
    pub waterfall_timeout_ms: u64,
    /// Milliseconds; defaults to 60 000.
    #[serde(default = "default_escalation_global_timeout_ms")]
    pub global_timeout_ms: u64,
    /// Defaults to `false`.
    #[serde(default)]
    pub residential_proxy: bool,
    /// Defaults to `"us"`.
    #[serde(default = "default_proxy_country")]
    pub proxy_country: String,
}
364
365impl Default for EscalationConfig {
366 fn default() -> Self {
367 Self {
368 enabled: false,
369 waterfall_timeout_ms: default_waterfall_timeout_ms(),
370 global_timeout_ms: default_escalation_global_timeout_ms(),
371 residential_proxy: false,
372 proxy_country: default_proxy_country(),
373 }
374 }
375}
376
/// Serde default for `EscalationConfig::waterfall_timeout_ms` (8 seconds, in milliseconds).
fn default_waterfall_timeout_ms() -> u64 {
    const SECONDS: u64 = 8;
    SECONDS * 1_000
}
/// Serde default for `EscalationConfig::global_timeout_ms` (60 seconds, in milliseconds).
fn default_escalation_global_timeout_ms() -> u64 {
    const SECONDS: u64 = 60;
    SECONDS * 1_000
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
386
/// Anti-bot sub-section of the renderer configuration.
///
/// The consuming code is not in this file, so the notes below record only
/// the defaults.
#[derive(Debug, Clone, Deserialize)]
pub struct AntibotConfig {
    /// Defaults to `true`.
    #[serde(default = "default_true")]
    pub enabled: bool,
    /// Defaults to `false`.
    #[serde(default)]
    pub escalate_on_signal: bool,
}
399
400impl Default for AntibotConfig {
401 fn default() -> Self {
402 Self {
403 enabled: true,
404 escalate_on_signal: false,
405 }
406 }
407}
408
/// Serde default for `RendererConfig::chrome_nav_budget_ms` (12 seconds, in milliseconds).
fn default_chrome_nav_budget_ms() -> u64 {
    const SECONDS: u64 = 12;
    SECONDS * 1_000
}
412
413impl Default for RendererConfig {
414 fn default() -> Self {
415 Self {
416 mode: RendererMode::default(),
417 page_timeout_ms: default_page_timeout(),
418 http_timeout_ms: None,
419 lightpanda_timeout_ms: None,
420 chrome_timeout_ms: None,
421 pool_size: default_pool_size(),
422 render_js_default: None,
423 lightpanda: None,
424 playwright: None,
425 chrome: None,
426 chrome_intercept_resources: false,
427 chrome_intercept_stylesheets: false,
428 chrome_host_intercept_disable: Vec::new(),
429 chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
430 chrome_context_pool_enabled: false,
431 use_predictor: false,
432 escalation: EscalationConfig::default(),
433 antibot: AntibotConfig::default(),
434 }
435 }
436}
/// Serde default for `RendererConfig::page_timeout_ms` (30 seconds, in milliseconds).
fn default_page_timeout() -> u64 {
    const SECONDS: u64 = 30;
    SECONDS * 1_000
}
440
441impl RendererConfig {
442 pub fn http_timeout(&self) -> u64 {
452 self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
453 }
454 pub fn lightpanda_timeout(&self) -> u64 {
455 self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
456 }
457 pub fn chrome_timeout(&self) -> u64 {
458 self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
459 }
460
461 pub fn cdp_tier_count(&self) -> usize {
470 if !cfg!(feature = "cdp") {
471 return 0;
472 }
473 let want =
474 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
475 let mut n = 0;
476 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
477 n += 1;
478 }
479 if want(RendererMode::Playwright) && self.playwright.is_some() {
480 n += 1;
481 }
482 if want(RendererMode::Chrome) && self.chrome.is_some() {
483 n += 1;
484 }
485 n
486 }
487
488 pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
494 let want =
495 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
496
497 let mut sum: u64 = 0;
498 if !matches!(self.mode, RendererMode::None) {
502 sum = sum.saturating_add(self.http_timeout());
503 }
504
505 if !cfg!(feature = "cdp") {
509 return sum;
510 }
511
512 let mut cdp_tier_count: u64 = 0;
513 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
514 sum = sum.saturating_add(self.lightpanda_timeout());
515 cdp_tier_count += 1;
516 }
517 if want(RendererMode::Playwright) && self.playwright.is_some() {
518 sum = sum.saturating_add(self.chrome_timeout());
519 cdp_tier_count += 1;
520 }
521 if want(RendererMode::Chrome) && self.chrome.is_some() {
522 sum = sum.saturating_add(self.chrome_timeout());
523 cdp_tier_count += 1;
524 }
525 sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
526 }
527}
/// Serde default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    const POOL_SIZE: usize = 4;
    POOL_SIZE
}
531
/// Endpoint for one renderer tier (`lightpanda`, `playwright` or `chrome` in
/// `RendererConfig`).
#[derive(Debug, Clone, Deserialize)]
pub struct CdpEndpoint {
    /// Endpoint URL. Required: the field has no serde default. The tests in
    /// this file use values such as `ws://chrome:9222`.
    pub ws_url: String,
}
536
/// Stealth sub-section of the crawler configuration.
///
/// The consuming code is not in this file, so the notes below record only
/// the defaults.
#[derive(Debug, Clone, Deserialize)]
pub struct StealthConfig {
    /// Defaults to `false`.
    #[serde(default)]
    pub enabled: bool,
    /// Defaults to an empty list.
    // NOTE(review): presumably falls back to `BUILTIN_UA_POOL` when empty — confirm against the consumer.
    #[serde(default)]
    pub user_agents: Vec<String>,
    /// Defaults to 0.2.
    #[serde(default = "default_jitter")]
    pub jitter_factor: f64,
    /// Defaults to `true`.
    #[serde(default = "default_true")]
    pub inject_headers: bool,
}
553
554impl Default for StealthConfig {
555 fn default() -> Self {
556 Self {
557 enabled: false,
558 user_agents: vec![],
559 jitter_factor: default_jitter(),
560 inject_headers: true,
561 }
562 }
563}
564
/// Serde default for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    const JITTER: f64 = 0.2;
    JITTER
}
568
/// Built-in pool of browser User-Agent strings: three Chrome 131, one Firefox
/// 133 and one Safari 18.2.
// NOTE(review): not referenced elsewhere in this file; presumably the fallback
// pool when `StealthConfig::user_agents` is empty — confirm against the consumer.
pub const BUILTIN_UA_POOL: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
577
/// Crawler section of the configuration.
///
/// In this file only `max_concurrency` is read, by
/// `AppConfig::effective_request_timeout_secs`. The other notes record only
/// the defaults.
#[derive(Debug, Clone, Deserialize)]
pub struct CrawlerConfig {
    /// Defaults to 10. Treated as at least 1 when sizing the outer request timeout.
    #[serde(default = "default_concurrency")]
    pub max_concurrency: usize,
    /// Defaults to 10.0.
    #[serde(default = "default_rps")]
    pub requests_per_second: f64,
    /// Defaults to `true`.
    #[serde(default = "default_true")]
    pub respect_robots_txt: bool,
    /// Defaults to a desktop Chrome 131 (macOS) User-Agent string.
    #[serde(default = "default_ua")]
    pub user_agent: String,
    /// Defaults to 2.
    #[serde(default = "default_depth")]
    pub default_max_depth: u32,
    /// Defaults to 100.
    #[serde(default = "default_max_pages")]
    pub default_max_pages: u32,
    /// Defaults to `None`.
    // NOTE(review): if proxy URLs can embed credentials, the derived `Debug` would print them — confirm.
    #[serde(default)]
    pub proxy: Option<String>,
    /// Seconds; defaults to 3600.
    #[serde(default = "default_job_ttl")]
    pub job_ttl_secs: u64,
    /// Stealth settings; defaults to `StealthConfig::default()`.
    #[serde(default)]
    pub stealth: StealthConfig,
    /// Milliseconds; defaults to 0.
    #[serde(default)]
    pub per_host_min_interval_ms: u64,
    /// Defaults to 1.
    #[serde(default = "default_per_host_max_concurrent")]
    pub per_host_max_concurrent: u32,
}
613
/// Serde default for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    const MAX_CONCURRENT: u32 = 1;
    MAX_CONCURRENT
}
617
618impl Default for CrawlerConfig {
619 fn default() -> Self {
620 Self {
621 max_concurrency: default_concurrency(),
622 requests_per_second: default_rps(),
623 respect_robots_txt: true,
624 user_agent: default_ua(),
625 default_max_depth: default_depth(),
626 default_max_pages: default_max_pages(),
627 proxy: None,
628 job_ttl_secs: default_job_ttl(),
629 stealth: StealthConfig::default(),
630 per_host_min_interval_ms: 0,
631 per_host_max_concurrent: default_per_host_max_concurrent(),
632 }
633 }
634}
635
/// Serde default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    const CONCURRENCY: usize = 10;
    CONCURRENCY
}
/// Serde default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    const RPS: f64 = 10.0;
    RPS
}
/// Shared serde default for switches that are on by default:
/// `AntibotConfig::enabled`, `StealthConfig::inject_headers` and
/// `CrawlerConfig::respect_robots_txt`.
fn default_true() -> bool {
    const ON: bool = true;
    ON
}
/// Serde default for `CrawlerConfig::user_agent`: a desktop Chrome 131 on
/// macOS User-Agent string.
fn default_ua() -> String {
    String::from(concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ",
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ))
}
/// Serde default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    const MAX_DEPTH: u32 = 2;
    MAX_DEPTH
}
/// Serde default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    const MAX_PAGES: u32 = 100;
    MAX_PAGES
}
/// Serde default for `CrawlerConfig::job_ttl_secs` (one hour, in seconds).
fn default_job_ttl() -> u64 {
    const MINUTES: u64 = 60;
    MINUTES * 60
}
662
/// Extraction section of the configuration.
///
/// The consuming code is not in this file, so the notes below record only
/// the defaults.
#[derive(Debug, Clone, Deserialize)]
pub struct ExtractionConfig {
    /// Defaults to `"markdown"`.
    #[serde(default = "default_format")]
    pub default_format: String,
    /// Defaults to `true`.
    #[serde(default = "default_true_ext")]
    pub only_main_content: bool,
    /// LLM settings; defaults to `None`.
    #[serde(default)]
    pub llm: Option<LlmConfig>,
    /// Defaults to an empty map.
    // NOTE(review): presumably maps a domain to a selector string — confirm against the consumer.
    #[serde(default)]
    pub domain_selectors: std::collections::HashMap<String, String>,
    /// LLM fallback settings; defaults to `LlmFallbackConfig::default()`.
    #[serde(default)]
    pub llm_fallback: LlmFallbackConfig,
    /// Bytes; defaults to 100.
    #[serde(default = "default_http_retry_threshold")]
    pub http_retry_threshold_bytes: usize,
    /// Bytes; defaults to 2000.
    #[serde(default = "default_lightpanda_retry_threshold")]
    pub lightpanda_retry_threshold_bytes: usize,
}
687
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    const THRESHOLD_BYTES: usize = 100;
    THRESHOLD_BYTES
}
691
/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    const THRESHOLD_BYTES: usize = 2_000;
    THRESHOLD_BYTES
}
695
696impl Default for ExtractionConfig {
697 fn default() -> Self {
698 Self {
699 default_format: default_format(),
700 only_main_content: true,
701 llm: None,
702 domain_selectors: std::collections::HashMap::new(),
703 llm_fallback: LlmFallbackConfig::default(),
704 http_retry_threshold_bytes: default_http_retry_threshold(),
705 lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
706 }
707 }
708}
709
/// LLM fallback sub-section of the extraction configuration.
///
/// The consuming code is not in this file, so the notes below record only
/// the defaults.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmFallbackConfig {
    /// Defaults to `false`.
    #[serde(default)]
    pub enable: bool,
    /// Defaults to 0.3.
    #[serde(default = "default_llm_quality_threshold")]
    pub quality_threshold: f32,
    /// Bytes; defaults to 100 000.
    #[serde(default = "default_llm_max_html_bytes")]
    pub max_html_bytes: usize,
    /// Defaults to `false`.
    #[serde(default)]
    pub always_run: bool,
}
725
726impl Default for LlmFallbackConfig {
727 fn default() -> Self {
728 Self {
729 enable: false,
730 quality_threshold: default_llm_quality_threshold(),
731 max_html_bytes: default_llm_max_html_bytes(),
732 always_run: false,
733 }
734 }
735}
736
/// Serde default for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    const THRESHOLD: f32 = 0.3;
    THRESHOLD
}
/// Serde default for `LlmFallbackConfig::max_html_bytes`.
fn default_llm_max_html_bytes() -> usize {
    const MAX_BYTES: usize = 100 * 1_000;
    MAX_BYTES
}
743
744#[derive(Debug, Clone, Deserialize)]
745pub struct LlmConfig {
746 #[serde(default = "default_llm_provider")]
747 pub provider: String,
748 pub api_key: String,
749 #[serde(default = "default_llm_model")]
750 pub model: String,
751 #[serde(default)]
752 pub base_url: Option<String>,
753 #[serde(default = "default_llm_max_tokens")]
754 pub max_tokens: u32,
755 #[serde(default)]
758 pub azure_api_version: Option<String>,
759}
760
/// Serde default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Serde default for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Serde default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    const MAX_TOKENS: u32 = 4_096;
    MAX_TOKENS
}
770
/// Serde default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    const ON: bool = true;
    ON
}
777
778#[derive(Debug, Clone, Default, Deserialize)]
779pub struct AuthConfig {
780 #[serde(default)]
781 pub api_keys: Vec<String>,
782}
783
784impl AppConfig {
785 pub fn load() -> Result<Self, config::ConfigError> {
788 let mut builder = config::Config::builder()
789 .add_source(config::File::with_name("config.default").required(false));
790
791 if let Ok(extra) = std::env::var("CRW_CONFIG") {
793 builder = builder.add_source(config::File::with_name(&extra).required(true));
794 } else {
795 builder = builder.add_source(config::File::with_name("config.local").required(false));
796 }
797
798 let cfg = builder
799 .add_source(
800 config::Environment::with_prefix("CRW")
801 .prefix_separator("_")
802 .separator("__")
803 .try_parsing(true),
804 )
805 .build()?;
806 cfg.try_deserialize()
807 }
808
809 pub fn effective_deadline_ms(
826 &self,
827 requested_deadline_ms: Option<u64>,
828 wait_for_ms: Option<u64>,
829 ) -> u64 {
830 if let Some(explicit) = requested_deadline_ms {
831 return explicit;
832 }
833 let default_ms = self.request.deadline_ms_default;
834 if !self.request.auto_extend_deadline_for_ladder {
835 return default_ms;
836 }
837 if self.renderer.cdp_tier_count() == 0 {
844 return default_ms;
845 }
846 let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
847 const SPA_DEFAULT_MS: u64 = 8_000;
852 let extra = if let Some(w) = wait_for_ms {
858 let bounded = w.min(MAX_WAIT_FOR_MS);
859 let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
860 per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
861 } else {
862 0
863 };
864 default_ms.max(ladder_min.saturating_add(extra))
865 }
866
867 pub fn effective_request_timeout_secs(&self) -> u64 {
880 let baseline = self.server.request_timeout_secs;
881 if !self.request.auto_extend_deadline_for_ladder {
882 return baseline;
883 }
884 const OUTER_BUFFER_SECS: u64 = 5;
885 const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
889 let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
895
896 let conc = (self.crawler.max_concurrency.max(1)) as u64;
900 let max_results = self.search.max_limit as u64;
901 let enrich_batches = max_results.div_ceil(conc);
902 let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
903 let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
904
905 let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
906 let needed_secs = max_handler_ms
907 .div_ceil(1_000)
908 .saturating_add(OUTER_BUFFER_SECS);
909 baseline.max(needed_secs)
910 }
911}
912
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes every test that mutates process-wide environment variables.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Acquires [`ENV_LOCK`], recovering the guard if an earlier test panicked
    /// while holding it.
    ///
    /// The mutex guards no data, so a poisoned lock is safe to reuse. Without
    /// this recovery one failing env test would make every later env test fail
    /// with an unrelated `PoisonError`.
    fn lock_env() -> std::sync::MutexGuard<'static, ()> {
        ENV_LOCK
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Removes every `CRW_*` variable these tests set, so state cannot leak
    /// from one test into the next.
    fn clear_renderer_env() {
        for k in [
            "CRW_RENDERER__MODE",
            "CRW_RENDERER__FORCE_JS",
            "CRW_RENDERER__RENDER_JS_DEFAULT",
            "CRW_RENDERER__LIGHTPANDA__WS_URL",
            "CRW_SERVER__PORT",
        ] {
            // SAFETY: every caller in this module holds `ENV_LOCK`, and the
            // tests that do not take the lock never touch the environment.
            unsafe { std::env::remove_var(k) };
        }
    }

    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            // Only deserialization is exercised; the field is never read.
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held, so no other test in this module mutates
        // the environment concurrently.
        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held, so no other test in this module mutates
        // the environment concurrently.
        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held, so no other test in this module mutates
        // the environment concurrently.
        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
        assert!(r.auto_extend_deadline_for_ladder);
    }

    #[test]
    fn default_app_config_enables_auto_extend() {
        let cfg = AppConfig::default();
        assert!(cfg.request.auto_extend_deadline_for_ladder);
        assert_eq!(cfg.request.deadline_ms_default, 8000);
    }

    /// Renderer pinned to Chrome with one configured endpoint and `chrome_ms`
    /// as both the page timeout and the Chrome timeout.
    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
        RendererConfig {
            mode: RendererMode::Chrome,
            page_timeout_ms: chrome_ms,
            chrome_timeout_ms: Some(chrome_ms),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        }
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_chrome_only() {
        let r = renderer_with_chrome_only(30_000);
        // HTTP timeout + Chrome timeout + one tier overhead.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            30_000 + 30_000 + 28_000
        );
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_auto_three_tiers() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            lightpanda_timeout_ms: Some(2_500),
            chrome_timeout_ms: Some(30_000),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        };
        // HTTP + Lightpanda + Chrome, plus overhead for the two CDP tiers.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 2_500 + 30_000 + 2 * 28_000
        );
        assert_eq!(r.cdp_tier_count(), 2);
    }

    #[test]
    fn effective_deadline_explicit_bypasses_auto_extend() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_auto_extend_raises_to_ladder_min() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
        assert!(expected > 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
    }

    #[test]
    fn effective_deadline_default_wins_when_higher_than_ladder() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 1_000_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
    }

    #[test]
    fn effective_deadline_auto_extend_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_extends_for_long_wait_for() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
        let tier_count = cfg.renderer.cdp_tier_count() as u64;
        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
        // Only the 12 000 ms above the 8 000 ms allowance extends the deadline.
        assert_eq!(with_wait, base + 12_000 * tier_count);
        // A wait below the allowance adds nothing.
        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
    }

    #[test]
    fn effective_request_timeout_covers_map_ceiling() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.crawler.max_concurrency = 10;
        cfg.search.max_limit = 20;
        cfg.server.request_timeout_secs = 60;
        // 300 s map ceiling + 5 s outer buffer.
        assert!(cfg.effective_request_timeout_secs() >= 305);
    }

    #[test]
    fn effective_request_timeout_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.server.request_timeout_secs = 60;
        assert_eq!(cfg.effective_request_timeout_secs(), 60);
    }

    #[test]
    fn effective_request_timeout_respects_operator_override() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.server.request_timeout_secs = 600;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_request_timeout_secs(), 600);
    }

    #[test]
    fn effective_request_timeout_search_sequential_batching() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.search.max_limit = 20;
        cfg.crawler.max_concurrency = 1;
        cfg.server.request_timeout_secs = 60;
        let secs = cfg.effective_request_timeout_secs();
        // With a concurrency of 1, each of the 20 results is its own batch.
        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
        let expected_search_ms = 15_000 + 20 * scrape_ms;
        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
        assert_eq!(secs, 60u64.max(expected_secs));
    }

    #[test]
    #[cfg(not(feature = "cdp"))]
    fn cdp_tier_count_zero_without_cdp_feature() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            chrome_timeout_ms: Some(30_000),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 0);
        // Only the HTTP tier contributes without the feature.
        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
    }

    #[test]
    fn effective_deadline_skipped_for_http_only_mode() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 30_000,
            lightpanda: None,
            playwright: None,
            chrome: None,
            ..Default::default()
        };
        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_playwright_only() {
        let r = RendererConfig {
            mode: RendererMode::Playwright,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            chrome_timeout_ms: Some(30_000),
            playwright: Some(CdpEndpoint {
                ws_url: "ws://playwright:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 1);
        // The Playwright tier is budgeted with the Chrome timeout.
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 30_000 + 28_000
        );
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held, so no other test in this module mutates
        // the environment concurrently.
        unsafe {
            std::env::set_var("CRW_SERVER__PORT", "4444");
            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        }
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();

        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }
}