1use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize, Default)]
4pub struct AppConfig {
5 #[serde(default)]
6 pub server: ServerConfig,
7 #[serde(default)]
8 pub renderer: RendererConfig,
9 #[serde(default)]
10 pub crawler: CrawlerConfig,
11 #[serde(default)]
12 pub extraction: ExtractionConfig,
13 #[serde(default)]
14 pub auth: AuthConfig,
15 #[serde(default)]
16 pub request: RequestConfig,
17 #[serde(default)]
18 pub search: SearchConfig,
19 #[serde(default)]
20 pub map: MapConfig,
21 #[serde(default)]
24 pub client: ClientConfig,
25}
26
27#[derive(Debug, Clone, Default, Deserialize)]
31pub struct ClientConfig {
32 #[serde(default)]
34 pub api_url: Option<String>,
35 #[serde(default)]
37 pub api_key: Option<String>,
38}
39
40#[derive(Debug, Clone, Deserialize, Default)]
42pub struct MapConfig {
43 #[serde(default)]
44 pub url_filter: MapUrlFilterConfig,
45}
46
47#[derive(Debug, Clone, Deserialize)]
52pub struct MapUrlFilterConfig {
53 #[serde(default = "default_true_filter")]
55 pub strip_tracking_params: bool,
56 #[serde(default = "default_true_filter")]
58 pub drop_action_urls: bool,
59 #[serde(default)]
61 pub gov_tld_drop_actions: bool,
62 #[serde(default)]
64 pub extra_tracking_params: Vec<String>,
65 #[serde(default)]
67 pub extra_action_params: Vec<String>,
68 #[serde(default)]
70 pub extra_preserve_params: Vec<String>,
71}
72
73impl Default for MapUrlFilterConfig {
74 fn default() -> Self {
75 Self {
76 strip_tracking_params: true,
77 drop_action_urls: true,
78 gov_tld_drop_actions: false,
79 extra_tracking_params: Vec::new(),
80 extra_action_params: Vec::new(),
81 extra_preserve_params: Vec::new(),
82 }
83 }
84}
85
/// Serde default provider for the `MapUrlFilterConfig` switches that start enabled.
fn default_true_filter() -> bool {
    true
}
89
/// Fixed per-tier overhead, in milliseconds, that
/// `RendererConfig::min_deadline_for_full_ladder_ms` adds once for every
/// active CDP tier on top of that tier's own timeout.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28_000;

/// Upper bound, in milliseconds, applied to a request's `wait_for_ms` before
/// `AppConfig::effective_deadline_ms` uses it to extend the deadline.
pub const MAX_WAIT_FOR_MS: u64 = 60_000;
108
109#[derive(Debug, Clone, Deserialize)]
115pub struct SearchConfig {
116 #[serde(default = "default_true_search")]
119 pub enabled: bool,
120 #[serde(default)]
123 pub searxng_url: Option<String>,
124 #[serde(default = "default_search_timeout_ms")]
126 pub timeout_ms: u64,
127 #[serde(default = "default_search_limit")]
129 pub default_limit: u32,
130 #[serde(default = "default_search_max_limit")]
132 pub max_limit: u32,
133 #[serde(default = "default_research_engines")]
136 pub research_engines: Vec<String>,
137 #[serde(default = "default_github_engines")]
139 pub github_engines: Vec<String>,
140 #[serde(default = "default_true_search")]
145 pub rerank_enabled: bool,
146 #[serde(default)]
155 pub query_expand: bool,
156 #[serde(default = "default_query_expand_variants")]
164 pub query_expand_variants: usize,
165 #[serde(default)]
176 pub multi_round: bool,
177 #[serde(default)]
184 pub passage_select: bool,
185 #[serde(default)]
194 pub page2_fallback: bool,
195 #[serde(default)]
205 pub answer_calibrated: bool,
206 #[serde(default)]
216 pub answer_guarded: bool,
217 #[serde(default)]
226 pub use_structured_sources: bool,
227 #[serde(default)]
235 pub wikidata_lookup: bool,
236 #[serde(default)]
244 pub snippet_fallback: bool,
245}
246
247impl Default for SearchConfig {
248 fn default() -> Self {
249 Self {
250 enabled: true,
251 searxng_url: None,
252 timeout_ms: default_search_timeout_ms(),
253 default_limit: default_search_limit(),
254 max_limit: default_search_max_limit(),
255 research_engines: default_research_engines(),
256 github_engines: default_github_engines(),
257 rerank_enabled: true,
258 query_expand: false,
259 query_expand_variants: default_query_expand_variants(),
260 multi_round: false,
261 passage_select: false,
262 page2_fallback: false,
263 answer_calibrated: false,
264 answer_guarded: false,
265 use_structured_sources: false,
266 wikidata_lookup: false,
267 snippet_fallback: false,
268 }
269 }
270}
271
/// Default for `SearchConfig::query_expand_variants`.
fn default_query_expand_variants() -> usize {
    1
}

/// Default for the `SearchConfig` switches that start enabled.
fn default_true_search() -> bool {
    true
}

/// Default for `SearchConfig::timeout_ms` (milliseconds).
fn default_search_timeout_ms() -> u64 {
    15_000
}

/// Default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    5
}

/// Default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    20
}

/// Default for `SearchConfig::research_engines`.
fn default_research_engines() -> Vec<String> {
    vec![
        "arxiv".into(),
        "crossref".into(),
        "google scholar".into(),
        "semantic scholar".into(),
    ]
}

/// Default for `SearchConfig::github_engines`.
fn default_github_engines() -> Vec<String> {
    vec!["github".into()]
}
298
299#[derive(Debug, Clone, Deserialize)]
303pub struct RequestConfig {
304 #[serde(default = "default_deadline_ms")]
309 pub deadline_ms_default: u64,
310 #[serde(default = "default_true_request")]
320 pub auto_extend_deadline_for_ladder: bool,
321}
322
323impl Default for RequestConfig {
324 fn default() -> Self {
325 Self {
326 deadline_ms_default: default_deadline_ms(),
327 auto_extend_deadline_for_ladder: true,
328 }
329 }
330}
331
/// Default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    true
}

/// Default for `RequestConfig::deadline_ms_default` (milliseconds).
fn default_deadline_ms() -> u64 {
    8000
}
339
340#[derive(Debug, Clone, Deserialize)]
341pub struct ServerConfig {
342 #[serde(default = "default_host")]
343 pub host: String,
344 #[serde(default = "default_port")]
345 pub port: u16,
346 #[serde(default = "default_request_timeout")]
347 pub request_timeout_secs: u64,
348 #[serde(default = "default_rate_limit_rps")]
350 pub rate_limit_rps: u64,
351}
352
353impl Default for ServerConfig {
354 fn default() -> Self {
355 Self {
356 host: default_host(),
357 port: default_port(),
358 request_timeout_secs: default_request_timeout(),
359 rate_limit_rps: default_rate_limit_rps(),
360 }
361 }
362}
363
/// Default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    10
}

/// Default for `ServerConfig::host`.
fn default_host() -> String {
    "0.0.0.0".into()
}

/// Default for `ServerConfig::port`.
fn default_port() -> u16 {
    3000
}

/// Default for `ServerConfig::request_timeout_secs` (seconds).
fn default_request_timeout() -> u64 {
    60
}
377
378#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
388#[serde(rename_all = "lowercase")]
389pub enum RendererMode {
390 #[default]
391 Auto,
392 None,
393 Lightpanda,
394 Chrome,
395 Playwright,
396}
397
398#[derive(Debug, Clone, Deserialize)]
399pub struct RendererConfig {
400 #[serde(default)]
401 pub mode: RendererMode,
402 #[serde(default = "default_page_timeout")]
406 pub page_timeout_ms: u64,
407 #[serde(default)]
412 pub http_timeout_ms: Option<u64>,
413 #[serde(default)]
418 pub lightpanda_timeout_ms: Option<u64>,
419 #[serde(default)]
423 pub chrome_timeout_ms: Option<u64>,
424 #[serde(default = "default_pool_size")]
425 pub pool_size: usize,
426 #[serde(default, alias = "force_js")]
431 pub render_js_default: Option<bool>,
432 #[serde(default)]
433 pub lightpanda: Option<CdpEndpoint>,
434 #[serde(default)]
435 pub playwright: Option<CdpEndpoint>,
436 #[serde(default)]
437 pub chrome: Option<CdpEndpoint>,
438 #[serde(default)]
444 pub chrome_proxy: Option<CdpEndpoint>,
445 #[serde(default)]
449 pub chrome_proxy_timeout_ms: Option<u64>,
450 #[serde(default)]
454 pub chrome_intercept_resources: bool,
455 #[serde(default)]
459 pub chrome_intercept_stylesheets: bool,
460 #[serde(default)]
463 pub chrome_host_intercept_disable: Vec<String>,
464 #[serde(default = "default_chrome_nav_budget_ms")]
469 pub chrome_nav_budget_ms: u64,
470 #[serde(default)]
477 pub chrome_context_pool_enabled: bool,
478 #[serde(default)]
481 pub chrome_pool: ChromePoolConfig,
482 #[serde(default)]
486 pub chrome_backend: ChromeBackend,
487 #[serde(default)]
491 pub use_predictor: bool,
492 #[serde(default)]
495 pub escalation: EscalationConfig,
496 #[serde(default)]
498 pub antibot: AntibotConfig,
499 #[serde(default)]
507 pub proxy_base_user: Option<String>,
508 #[serde(default)]
510 pub proxy_base_pass: Option<String>,
511 #[serde(default)]
514 pub proxy_default_country: Option<String>,
515}
516
517#[derive(Debug, Clone, Deserialize)]
520pub struct EscalationConfig {
521 #[serde(default)]
523 pub enabled: bool,
524 #[serde(default = "default_waterfall_timeout_ms")]
528 pub waterfall_timeout_ms: u64,
529 #[serde(default = "default_escalation_global_timeout_ms")]
531 pub global_timeout_ms: u64,
532 #[serde(default)]
535 pub residential_proxy: bool,
536 #[serde(default = "default_proxy_country")]
538 pub proxy_country: String,
539}
540
541impl Default for EscalationConfig {
542 fn default() -> Self {
543 Self {
544 enabled: false,
545 waterfall_timeout_ms: default_waterfall_timeout_ms(),
546 global_timeout_ms: default_escalation_global_timeout_ms(),
547 residential_proxy: false,
548 proxy_country: default_proxy_country(),
549 }
550 }
551}
552
/// Default for `EscalationConfig::waterfall_timeout_ms` (milliseconds).
fn default_waterfall_timeout_ms() -> u64 {
    8_000
}

/// Default for `EscalationConfig::global_timeout_ms` (milliseconds).
fn default_escalation_global_timeout_ms() -> u64 {
    60_000
}

/// Default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    "us".to_string()
}
562
563#[derive(Debug, Clone, Deserialize)]
566pub struct AntibotConfig {
567 #[serde(default = "default_true")]
569 pub enabled: bool,
570 #[serde(default)]
573 pub escalate_on_signal: bool,
574 #[serde(default = "default_true")]
580 pub escalate_in_failover: bool,
581}
582
583impl Default for AntibotConfig {
584 fn default() -> Self {
585 Self {
586 enabled: true,
587 escalate_on_signal: false,
588 escalate_in_failover: true,
589 }
590 }
591}
592
/// Default for `RendererConfig::chrome_nav_budget_ms` (milliseconds).
fn default_chrome_nav_budget_ms() -> u64 {
    12_000
}
596
597#[derive(Debug, Clone, Deserialize)]
601pub struct ChromePoolConfig {
602 #[serde(default)]
605 pub size: Option<usize>,
606 #[serde(default = "default_recycle_after_navs")]
609 pub recycle_after_navs: u32,
610 #[serde(default = "default_idle_timeout_secs")]
612 pub idle_timeout_secs: u64,
613 #[serde(default = "default_health_check_secs")]
615 pub health_check_secs: u64,
616 #[serde(default = "default_shutdown_drain_secs")]
618 pub shutdown_drain_secs: u64,
619}
620
621impl Default for ChromePoolConfig {
622 fn default() -> Self {
623 Self {
624 size: None,
625 recycle_after_navs: default_recycle_after_navs(),
626 idle_timeout_secs: default_idle_timeout_secs(),
627 health_check_secs: default_health_check_secs(),
628 shutdown_drain_secs: default_shutdown_drain_secs(),
629 }
630 }
631}
632
/// Default for `ChromePoolConfig::recycle_after_navs`.
fn default_recycle_after_navs() -> u32 {
    1
}

/// Default for `ChromePoolConfig::idle_timeout_secs` (seconds).
fn default_idle_timeout_secs() -> u64 {
    300
}

/// Default for `ChromePoolConfig::health_check_secs` (seconds).
fn default_health_check_secs() -> u64 {
    60
}

/// Default for `ChromePoolConfig::shutdown_drain_secs` (seconds).
fn default_shutdown_drain_secs() -> u64 {
    30
}
645
646#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
651#[serde(rename_all = "lowercase")]
652pub enum ChromeBackend {
653 #[default]
656 Vanilla,
657 Browserless,
660}
661
662impl Default for RendererConfig {
663 fn default() -> Self {
664 Self {
665 mode: RendererMode::default(),
666 page_timeout_ms: default_page_timeout(),
667 http_timeout_ms: None,
668 lightpanda_timeout_ms: None,
669 chrome_timeout_ms: None,
670 pool_size: default_pool_size(),
671 render_js_default: None,
672 lightpanda: None,
673 playwright: None,
674 chrome: None,
675 chrome_proxy: None,
676 chrome_proxy_timeout_ms: None,
677 chrome_intercept_resources: false,
678 chrome_intercept_stylesheets: false,
679 chrome_host_intercept_disable: Vec::new(),
680 chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
681 chrome_context_pool_enabled: false,
682 chrome_pool: ChromePoolConfig::default(),
683 chrome_backend: ChromeBackend::default(),
684 use_predictor: false,
685 escalation: EscalationConfig::default(),
686 antibot: AntibotConfig::default(),
687 proxy_base_user: None,
688 proxy_base_pass: None,
689 proxy_default_country: None,
690 }
691 }
692}
/// Default for `RendererConfig::page_timeout_ms` (milliseconds).
fn default_page_timeout() -> u64 {
    30000
}
696
697impl RendererConfig {
698 pub fn http_timeout(&self) -> u64 {
708 self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
709 }
710 pub fn lightpanda_timeout(&self) -> u64 {
711 self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
712 }
713 pub fn chrome_timeout(&self) -> u64 {
714 self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
715 }
716 pub fn chrome_proxy_timeout(&self) -> u64 {
717 self.chrome_proxy_timeout_ms
718 .unwrap_or_else(|| self.chrome_timeout().saturating_add(15_000))
719 }
720
721 pub fn effective_proxy_credentials(&self, country: Option<&str>) -> Option<(String, String)> {
734 let user = self.proxy_base_user.as_ref()?;
735 let pass = self.proxy_base_pass.as_ref()?;
736 let cc = country
737 .or(self.proxy_default_country.as_deref())
738 .map(|s| s.trim().to_lowercase())
739 .filter(|s| s.len() == 2 && s.chars().all(|c| c.is_ascii_alphabetic()));
740 Some(match cc {
741 Some(cc) => (format!("{user}__cr.{cc}"), pass.clone()),
742 None => (user.clone(), pass.clone()),
743 })
744 }
745
746 pub fn cdp_tier_count(&self) -> usize {
755 if !cfg!(feature = "cdp") {
756 return 0;
757 }
758 let want =
759 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
760 let mut n = 0;
761 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
762 n += 1;
763 }
764 if want(RendererMode::Playwright) && self.playwright.is_some() {
765 n += 1;
766 }
767 if want(RendererMode::Chrome) && self.chrome.is_some() {
768 n += 1;
769 }
770 n
771 }
772
773 pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
779 let want =
780 |m: RendererMode| -> bool { matches!(self.mode, RendererMode::Auto) || self.mode == m };
781
782 let mut sum: u64 = 0;
783 if !matches!(self.mode, RendererMode::None) {
787 sum = sum.saturating_add(self.http_timeout());
788 }
789
790 if !cfg!(feature = "cdp") {
794 return sum;
795 }
796
797 let mut cdp_tier_count: u64 = 0;
798 if want(RendererMode::Lightpanda) && self.lightpanda.is_some() {
799 sum = sum.saturating_add(self.lightpanda_timeout());
800 cdp_tier_count += 1;
801 }
802 if want(RendererMode::Playwright) && self.playwright.is_some() {
803 sum = sum.saturating_add(self.chrome_timeout());
804 cdp_tier_count += 1;
805 }
806 if want(RendererMode::Chrome) && self.chrome.is_some() {
807 sum = sum.saturating_add(self.chrome_timeout());
808 cdp_tier_count += 1;
809 }
810 sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
811 }
812}
/// Default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    4
}
816
817#[derive(Debug, Clone, Deserialize)]
818pub struct CdpEndpoint {
819 pub ws_url: String,
820}
821
822#[derive(Debug, Clone, Deserialize)]
824pub struct StealthConfig {
825 #[serde(default)]
827 pub enabled: bool,
828 #[serde(default)]
830 pub user_agents: Vec<String>,
831 #[serde(default = "default_jitter")]
833 pub jitter_factor: f64,
834 #[serde(default = "default_true")]
836 pub inject_headers: bool,
837}
838
839impl Default for StealthConfig {
840 fn default() -> Self {
841 Self {
842 enabled: false,
843 user_agents: vec![],
844 jitter_factor: default_jitter(),
845 inject_headers: true,
846 }
847 }
848}
849
/// Default for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    0.2
}
853
/// Built-in pool of five desktop browser user-agent strings.
// NOTE(review): presumably the fallback when `StealthConfig::user_agents` is
// empty — confirm at the call site.
pub const BUILTIN_UA_POOL: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
862
863#[derive(Debug, Clone, Deserialize)]
864pub struct CrawlerConfig {
865 #[serde(default = "default_concurrency")]
866 pub max_concurrency: usize,
867 #[serde(default = "default_rps")]
868 pub requests_per_second: f64,
869 #[serde(default = "default_true")]
870 pub respect_robots_txt: bool,
871 #[serde(default = "default_ua")]
872 pub user_agent: String,
873 #[serde(default = "default_depth")]
874 pub default_max_depth: u32,
875 #[serde(default = "default_max_pages")]
876 pub default_max_pages: u32,
877 #[serde(default)]
880 pub proxy: Option<String>,
881 #[serde(default = "default_job_ttl")]
883 pub job_ttl_secs: u64,
884 #[serde(default)]
885 pub stealth: StealthConfig,
886 #[serde(default)]
891 pub per_host_min_interval_ms: u64,
892 #[serde(default = "default_per_host_max_concurrent")]
896 pub per_host_max_concurrent: u32,
897}
898
/// Default for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    1
}
902
903impl Default for CrawlerConfig {
904 fn default() -> Self {
905 Self {
906 max_concurrency: default_concurrency(),
907 requests_per_second: default_rps(),
908 respect_robots_txt: true,
909 user_agent: default_ua(),
910 default_max_depth: default_depth(),
911 default_max_pages: default_max_pages(),
912 proxy: None,
913 job_ttl_secs: default_job_ttl(),
914 stealth: StealthConfig::default(),
915 per_host_min_interval_ms: 0,
916 per_host_max_concurrent: default_per_host_max_concurrent(),
917 }
918 }
919}
920
/// Default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    10
}

/// Default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    10.0
}

/// Shared serde default for boolean fields that start enabled
/// (referenced by `AntibotConfig`, `StealthConfig` and `CrawlerConfig`).
fn default_true() -> bool {
    true
}

/// Default for `CrawlerConfig::user_agent`.
fn default_ua() -> String {
    // The trailing backslash joins the two source lines; the single space
    // before it is the only separator kept in the resulting string.
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 \
     (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
        .into()
}

/// Default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    2
}

/// Default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    100
}

/// Default for `CrawlerConfig::job_ttl_secs` (seconds).
fn default_job_ttl() -> u64 {
    3600
}
947
948#[derive(Debug, Clone, Deserialize)]
949pub struct ExtractionConfig {
950 #[serde(default = "default_format")]
951 pub default_format: String,
952 #[serde(default = "default_true_ext")]
953 pub only_main_content: bool,
954 #[serde(default)]
955 pub llm: Option<LlmConfig>,
956 #[serde(default)]
959 pub domain_selectors: std::collections::HashMap<String, String>,
960 #[serde(default)]
961 pub llm_fallback: LlmFallbackConfig,
962 #[serde(default = "default_http_retry_threshold")]
965 pub http_retry_threshold_bytes: usize,
966 #[serde(default = "default_lightpanda_retry_threshold")]
970 pub lightpanda_retry_threshold_bytes: usize,
971}
972
/// Default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    100
}

/// Default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    2000
}
980
981impl Default for ExtractionConfig {
982 fn default() -> Self {
983 Self {
984 default_format: default_format(),
985 only_main_content: true,
986 llm: None,
987 domain_selectors: std::collections::HashMap::new(),
988 llm_fallback: LlmFallbackConfig::default(),
989 http_retry_threshold_bytes: default_http_retry_threshold(),
990 lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
991 }
992 }
993}
994
995#[derive(Debug, Clone, Deserialize)]
996pub struct LlmFallbackConfig {
997 #[serde(default)]
998 pub enable: bool,
999 #[serde(default = "default_llm_quality_threshold")]
1000 pub quality_threshold: f32,
1001 #[serde(default = "default_llm_max_html_bytes")]
1002 pub max_html_bytes: usize,
1003 #[serde(default)]
1008 pub always_run: bool,
1009}
1010
1011impl Default for LlmFallbackConfig {
1012 fn default() -> Self {
1013 Self {
1014 enable: false,
1015 quality_threshold: default_llm_quality_threshold(),
1016 max_html_bytes: default_llm_max_html_bytes(),
1017 always_run: false,
1018 }
1019 }
1020}
1021
/// Default for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    0.3
}

/// Default for `max_html_bytes` on both `LlmFallbackConfig` and `LlmConfig`.
fn default_llm_max_html_bytes() -> usize {
    100_000
}
1028
1029#[derive(Debug, Clone, Deserialize)]
1030pub struct LlmConfig {
1031 #[serde(default = "default_llm_provider")]
1032 pub provider: String,
1033 pub api_key: String,
1034 #[serde(default = "default_llm_model")]
1035 pub model: String,
1036 #[serde(default)]
1037 pub base_url: Option<String>,
1038 #[serde(default = "default_llm_max_tokens")]
1039 pub max_tokens: u32,
1040 #[serde(default)]
1043 pub azure_api_version: Option<String>,
1044 #[serde(default = "default_llm_max_concurrency")]
1047 pub max_concurrency: usize,
1048 #[serde(default = "default_llm_max_html_bytes")]
1051 pub max_html_bytes: usize,
1052 #[serde(default)]
1056 pub require_byok_header: Option<String>,
1057 #[serde(default)]
1064 pub temperature: Option<f32>,
1065}
1066
1067impl Default for LlmConfig {
1068 fn default() -> Self {
1069 Self {
1070 provider: default_llm_provider(),
1071 api_key: String::new(),
1072 model: default_llm_model(),
1073 base_url: None,
1074 max_tokens: default_llm_max_tokens(),
1075 azure_api_version: None,
1076 max_concurrency: default_llm_max_concurrency(),
1077 max_html_bytes: default_llm_max_html_bytes(),
1078 require_byok_header: None,
1079 temperature: None,
1080 }
1081 }
1082}
1083
/// Default for `LlmConfig::max_concurrency`.
fn default_llm_max_concurrency() -> usize {
    4
}

/// Default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    "anthropic".into()
}

/// Default for `LlmConfig::model`.
fn default_llm_model() -> String {
    "claude-sonnet-4-20250514".into()
}

/// Default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4096
}

/// Default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    "markdown".into()
}

/// Default for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    true
}
1104
1105#[derive(Debug, Clone, Default, Deserialize)]
1106pub struct AuthConfig {
1107 #[serde(default)]
1108 pub api_keys: Vec<String>,
1109}
1110
/// Returns the path of the per-user `config.toml`.
///
/// Resolution order:
/// 1. `$CRW_USER_CONFIG_DIR/config.toml` when `CRW_USER_CONFIG_DIR` is set;
/// 2. `$HOME/.config/crw/config.toml` when `HOME` is set;
/// 3. `None` otherwise.
///
/// Both variables are read with `var_os`, so a directory whose name is not
/// valid UTF-8 is honored rather than silently skipped. The function does not
/// check that the file exists.
pub fn user_config_path() -> Option<std::path::PathBuf> {
    if let Some(dir) = std::env::var_os("CRW_USER_CONFIG_DIR") {
        return Some(std::path::PathBuf::from(dir).join("config.toml"));
    }
    let home = std::env::var_os("HOME")?;
    Some(
        std::path::PathBuf::from(home)
            .join(".config")
            .join("crw")
            .join("config.toml"),
    )
}
1127
1128impl AppConfig {
1129 pub fn load() -> Result<Self, config::ConfigError> {
1141 let mut builder = config::Config::builder()
1142 .add_source(config::File::with_name("config.default").required(false));
1143
1144 if let Some(user_cfg) = user_config_path()
1147 && user_cfg.exists()
1148 {
1149 builder = builder.add_source(config::File::from(user_cfg).required(false));
1150 }
1151
1152 if let Ok(extra) = std::env::var("CRW_CONFIG") {
1154 builder = builder.add_source(config::File::with_name(&extra).required(true));
1155 } else {
1156 builder = builder.add_source(config::File::with_name("config.local").required(false));
1157 }
1158
1159 let cfg = builder
1160 .add_source(
1161 config::Environment::with_prefix("CRW")
1162 .prefix_separator("_")
1163 .separator("__")
1164 .try_parsing(true),
1165 )
1166 .build()?;
1167 cfg.try_deserialize()
1168 }
1169
1170 pub fn effective_deadline_ms(
1187 &self,
1188 requested_deadline_ms: Option<u64>,
1189 wait_for_ms: Option<u64>,
1190 ) -> u64 {
1191 if let Some(explicit) = requested_deadline_ms {
1192 return explicit;
1193 }
1194 let default_ms = self.request.deadline_ms_default;
1195 if !self.request.auto_extend_deadline_for_ladder {
1196 return default_ms;
1197 }
1198 if self.renderer.cdp_tier_count() == 0 {
1205 return default_ms;
1206 }
1207 let ladder_min = self.renderer.min_deadline_for_full_ladder_ms();
1208 const SPA_DEFAULT_MS: u64 = 8_000;
1213 let extra = if let Some(w) = wait_for_ms {
1219 let bounded = w.min(MAX_WAIT_FOR_MS);
1220 let per_tier = bounded.saturating_sub(SPA_DEFAULT_MS);
1221 per_tier.saturating_mul(self.renderer.cdp_tier_count() as u64)
1222 } else {
1223 0
1224 };
1225 default_ms.max(ladder_min.saturating_add(extra))
1226 }
1227
1228 pub fn effective_request_timeout_secs(&self) -> u64 {
1241 let baseline = self.server.request_timeout_secs;
1242 if !self.request.auto_extend_deadline_for_ladder {
1243 return baseline;
1244 }
1245 const OUTER_BUFFER_SECS: u64 = 5;
1246 const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;
1250 let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));
1256
1257 let conc = (self.crawler.max_concurrency.max(1)) as u64;
1261 let max_results = self.search.max_limit as u64;
1262 let enrich_batches = max_results.div_ceil(conc);
1263 let search_enrichment_ms = enrich_batches.saturating_mul(scrape_ms);
1264 let search_ms = self.search.timeout_ms.saturating_add(search_enrichment_ms);
1265
1266 let max_handler_ms = scrape_ms.max(search_ms).max(MAP_REQUEST_TIMEOUT_CEILING_MS);
1267 let needed_secs = max_handler_ms
1268 .div_ceil(1_000)
1269 .saturating_add(OUTER_BUFFER_SECS);
1270 baseline.max(needed_secs)
1271 }
1272}
1273
1274#[cfg(test)]
1275mod tests {
1276 use super::*;
1277
1278 static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
1281
1282 fn clear_renderer_env() {
1283 for k in [
1284 "CRW_RENDERER__MODE",
1285 "CRW_RENDERER__FORCE_JS",
1286 "CRW_RENDERER__RENDER_JS_DEFAULT",
1287 "CRW_RENDERER__LIGHTPANDA__WS_URL",
1288 "CRW_SERVER__PORT",
1289 ] {
1290 unsafe { std::env::remove_var(k) };
1291 }
1292 }
1293
1294 #[test]
1295 fn renderer_mode_parses_variants() {
1296 #[derive(Deserialize)]
1297 struct Wrap {
1298 mode: RendererMode,
1299 }
1300 let cases = [
1301 ("mode = \"auto\"", RendererMode::Auto),
1302 ("mode = \"none\"", RendererMode::None),
1303 ("mode = \"lightpanda\"", RendererMode::Lightpanda),
1304 ("mode = \"chrome\"", RendererMode::Chrome),
1305 ("mode = \"playwright\"", RendererMode::Playwright),
1306 ];
1307 for (toml_str, expected) in cases {
1308 let w: Wrap = toml::from_str(toml_str).unwrap();
1309 assert_eq!(w.mode, expected, "toml: {toml_str}");
1310 }
1311 }
1312
1313 #[test]
1314 fn renderer_mode_bogus_errors() {
1315 #[derive(Deserialize)]
1316 struct Wrap {
1317 #[allow(dead_code)]
1318 mode: RendererMode,
1319 }
1320 let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
1321 assert!(err.is_err(), "bogus mode should fail to parse");
1322 }
1323
1324 #[test]
1325 fn renderer_config_default_mode_is_auto() {
1326 let cfg = RendererConfig::default();
1327 assert_eq!(cfg.mode, RendererMode::Auto);
1328 assert_eq!(cfg.render_js_default, None);
1329 }
1330
1331 #[test]
1332 fn render_js_default_force_js_alias() {
1333 let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
1334 assert_eq!(cfg.render_js_default, Some(true));
1335 }
1336
1337 #[test]
1338 fn render_js_default_direct_field() {
1339 let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
1340 assert_eq!(cfg.render_js_default, Some(false));
1341 }
1342
1343 #[test]
1344 fn env_var_renderer_mode_chrome() {
1345 let _g = ENV_LOCK.lock().unwrap();
1346 clear_renderer_env();
1347 unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
1348 let cfg = AppConfig::load().unwrap();
1349 clear_renderer_env();
1350 assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
1351 }
1352
1353 #[test]
1354 fn env_var_force_js_alias_works() {
1355 let _g = ENV_LOCK.lock().unwrap();
1356 clear_renderer_env();
1357 unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
1358 let cfg = AppConfig::load().unwrap();
1359 clear_renderer_env();
1360 assert_eq!(cfg.renderer.render_js_default, Some(true));
1361 }
1362
1363 #[test]
1364 fn env_var_render_js_default_direct() {
1365 let _g = ENV_LOCK.lock().unwrap();
1366 clear_renderer_env();
1367 unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
1368 let cfg = AppConfig::load().unwrap();
1369 clear_renderer_env();
1370 assert_eq!(cfg.renderer.render_js_default, Some(true));
1371 }
1372
1373 #[test]
1374 fn request_config_defaults_match_plan() {
1375 let r = RequestConfig::default();
1376 assert_eq!(r.deadline_ms_default, 8000);
1377 assert!(r.auto_extend_deadline_for_ladder);
1378 }
1379
1380 #[test]
1381 fn default_app_config_enables_auto_extend() {
1382 let cfg = AppConfig::default();
1384 assert!(cfg.request.auto_extend_deadline_for_ladder);
1385 assert_eq!(cfg.request.deadline_ms_default, 8000);
1386 }
1387
1388 fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
1389 RendererConfig {
1390 mode: RendererMode::Chrome,
1391 page_timeout_ms: chrome_ms,
1392 chrome_timeout_ms: Some(chrome_ms),
1393 chrome: Some(CdpEndpoint {
1394 ws_url: "ws://chrome:9222".into(),
1395 }),
1396 ..Default::default()
1397 }
1398 }
1399
1400 #[test]
1401 #[cfg(feature = "cdp")]
1402 fn min_deadline_full_ladder_chrome_only() {
1403 let r = renderer_with_chrome_only(30_000);
1405 assert_eq!(
1407 r.min_deadline_for_full_ladder_ms(),
1408 30_000 + 30_000 + 28_000
1409 );
1410 }
1411
1412 #[test]
1413 #[cfg(feature = "cdp")]
1414 fn min_deadline_full_ladder_auto_three_tiers() {
1415 let r = RendererConfig {
1416 mode: RendererMode::Auto,
1417 page_timeout_ms: 15_000,
1418 http_timeout_ms: Some(15_000),
1419 lightpanda_timeout_ms: Some(2_500),
1420 chrome_timeout_ms: Some(30_000),
1421 lightpanda: Some(CdpEndpoint {
1422 ws_url: "ws://lp:9222".into(),
1423 }),
1424 chrome: Some(CdpEndpoint {
1425 ws_url: "ws://chrome:9222".into(),
1426 }),
1427 ..Default::default()
1428 };
1429 assert_eq!(
1431 r.min_deadline_for_full_ladder_ms(),
1432 15_000 + 2_500 + 30_000 + 2 * 28_000
1433 );
1434 assert_eq!(r.cdp_tier_count(), 2);
1435 }
1436
1437 #[test]
1438 fn effective_deadline_explicit_bypasses_auto_extend() {
1439 let mut cfg = AppConfig::default();
1440 cfg.request.auto_extend_deadline_for_ladder = true;
1441 cfg.renderer = renderer_with_chrome_only(30_000);
1442 assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
1444 assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
1445 }
1446
1447 #[test]
1448 #[cfg(feature = "cdp")]
1449 fn effective_deadline_auto_extend_raises_to_ladder_min() {
1450 let mut cfg = AppConfig::default();
1451 cfg.request.auto_extend_deadline_for_ladder = true;
1452 cfg.request.deadline_ms_default = 8_000;
1453 cfg.renderer = renderer_with_chrome_only(30_000);
1454 let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
1455 assert!(expected > 8_000);
1456 assert_eq!(cfg.effective_deadline_ms(None, None), expected);
1457 }
1458
1459 #[test]
1460 fn effective_deadline_default_wins_when_higher_than_ladder() {
1461 let mut cfg = AppConfig::default();
1462 cfg.request.auto_extend_deadline_for_ladder = true;
1463 cfg.request.deadline_ms_default = 1_000_000;
1464 cfg.renderer = renderer_with_chrome_only(30_000);
1465 assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
1466 }
1467
1468 #[test]
1469 fn effective_deadline_auto_extend_disabled_returns_baseline() {
1470 let mut cfg = AppConfig::default();
1471 cfg.request.auto_extend_deadline_for_ladder = false;
1472 cfg.request.deadline_ms_default = 8_000;
1473 cfg.renderer = renderer_with_chrome_only(30_000);
1474 assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
1475 }
1476
1477 #[test]
1478 #[cfg(feature = "cdp")]
1479 fn effective_deadline_extends_for_long_wait_for() {
1480 let mut cfg = AppConfig::default();
1481 cfg.request.auto_extend_deadline_for_ladder = true;
1482 cfg.request.deadline_ms_default = 8_000;
1483 cfg.renderer = renderer_with_chrome_only(30_000);
1484 let base = cfg.renderer.min_deadline_for_full_ladder_ms();
1485 let tier_count = cfg.renderer.cdp_tier_count() as u64;
1486 let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
1488 assert_eq!(with_wait, base + 12_000 * tier_count);
1489 assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
1491 }
1492
1493 #[test]
1494 fn effective_request_timeout_covers_map_ceiling() {
1495 let mut cfg = AppConfig::default();
1496 cfg.request.auto_extend_deadline_for_ladder = true;
1497 cfg.request.deadline_ms_default = 8_000;
1498 cfg.renderer = renderer_with_chrome_only(30_000);
1499 cfg.search.timeout_ms = 15_000;
1500 cfg.crawler.max_concurrency = 10;
1501 cfg.search.max_limit = 20;
1502 cfg.server.request_timeout_secs = 60;
1503 assert!(cfg.effective_request_timeout_secs() >= 305);
1505 }
1506
1507 #[test]
1508 fn effective_request_timeout_disabled_returns_baseline() {
1509 let mut cfg = AppConfig::default();
1510 cfg.request.auto_extend_deadline_for_ladder = false;
1511 cfg.server.request_timeout_secs = 60;
1512 assert_eq!(cfg.effective_request_timeout_secs(), 60);
1513 }
1514
1515 #[test]
1516 fn effective_request_timeout_respects_operator_override() {
1517 let mut cfg = AppConfig::default();
1518 cfg.request.auto_extend_deadline_for_ladder = true;
1519 cfg.server.request_timeout_secs = 600; cfg.renderer = renderer_with_chrome_only(30_000);
1521 assert_eq!(cfg.effective_request_timeout_secs(), 600);
1523 }
1524
/// With a crawler concurrency of 1, the request timeout must match a
/// hand-computed worst case: the search timeout plus `max_limit` scrapes run
/// back to back, floored at 300_000 ms, rounded up to seconds, plus 5 s.
#[test]
fn effective_request_timeout_search_sequential_batching() {
    let mut config = AppConfig::default();
    config.server.request_timeout_secs = 60;
    config.renderer = renderer_with_chrome_only(30_000);
    config.request.deadline_ms_default = 8_000;
    config.request.auto_extend_deadline_for_ladder = true;
    config.search.timeout_ms = 15_000;
    config.search.max_limit = 20;
    config.crawler.max_concurrency = 1;

    let actual_secs = config.effective_request_timeout_secs();

    // Rebuild the expectation from the same inputs: 15_000 mirrors
    // `search.timeout_ms` and 20 mirrors `search.max_limit` set above.
    let per_scrape_ms = config.effective_deadline_ms(None, Some(60_000));
    let search_total_ms = 15_000 + 20 * per_scrape_ms;
    let worst_case_ms = per_scrape_ms.max(search_total_ms).max(300_000);
    let worst_case_secs = worst_case_ms.div_ceil(1_000) + 5;

    // The configured 60 s baseline still acts as a lower bound.
    assert_eq!(actual_secs, 60u64.max(worst_case_secs));
}
1546
/// Built without the `cdp` feature, configured Chrome and Lightpanda endpoints
/// count as zero CDP tiers and the full-ladder minimum is 15_000 ms.
#[test]
#[cfg(not(feature = "cdp"))]
fn cdp_tier_count_zero_without_cdp_feature() {
    let chrome_endpoint = CdpEndpoint {
        ws_url: "ws://chrome:9222".into(),
    };
    let lightpanda_endpoint = CdpEndpoint {
        ws_url: "ws://lp:9222".into(),
    };
    let renderer = RendererConfig {
        mode: RendererMode::Auto,
        page_timeout_ms: 15_000,
        chrome_timeout_ms: Some(30_000),
        chrome: Some(chrome_endpoint),
        lightpanda: Some(lightpanda_endpoint),
        ..Default::default()
    };

    assert_eq!(renderer.cdp_tier_count(), 0);
    // 15_000 equals the `page_timeout_ms` configured above.
    assert_eq!(renderer.min_deadline_for_full_ladder_ms(), 15_000);
}
1569
/// With no Chrome, Lightpanda or Playwright endpoint configured there are zero
/// CDP tiers, and the effective deadline stays at the 8_000 ms default whether
/// or not a second argument is supplied.
#[test]
fn effective_deadline_skipped_for_http_only_mode() {
    let http_only = RendererConfig {
        mode: RendererMode::Auto,
        page_timeout_ms: 30_000,
        chrome: None,
        lightpanda: None,
        playwright: None,
        ..Default::default()
    };

    let mut config = AppConfig::default();
    config.request.deadline_ms_default = 8_000;
    config.request.auto_extend_deadline_for_ladder = true;
    config.renderer = http_only;

    assert_eq!(config.renderer.cdp_tier_count(), 0);

    let without_wait = config.effective_deadline_ms(None, None);
    assert_eq!(without_wait, 8_000);

    let with_wait = config.effective_deadline_ms(None, Some(30_000));
    assert_eq!(with_wait, 8_000);
}
1592
/// In Playwright mode with a single Playwright endpoint, exactly one CDP tier
/// is counted and the full-ladder minimum is 15_000 + 30_000 + 28_000 ms.
#[test]
#[cfg(feature = "cdp")]
fn min_deadline_full_ladder_playwright_only() {
    let playwright_endpoint = CdpEndpoint {
        ws_url: "ws://playwright:9222".into(),
    };
    let renderer = RendererConfig {
        mode: RendererMode::Playwright,
        page_timeout_ms: 15_000,
        http_timeout_ms: Some(15_000),
        chrome_timeout_ms: Some(30_000),
        playwright: Some(playwright_endpoint),
        ..Default::default()
    };

    assert_eq!(renderer.cdp_tier_count(), 1);

    // 15_000 and 30_000 match the timeouts configured above; 28_000 matches
    // the value of `CDP_TIER_OVERHEAD_MS` (NOTE(review): confirm that is the
    // term the implementation adds).
    let expected_floor_ms = 15_000 + 30_000 + 28_000;
    assert_eq!(renderer.min_deadline_for_full_ladder_ms(), expected_floor_ms);
}
1615
/// Pins the defaults of the renderer toggles: interception, the context pool
/// and the predictor are off, the per-host intercept disable list is empty,
/// and the Chrome navigation budget is 12_000 ms.
#[test]
fn renderer_phase_toggles_default_off_or_safe() {
    let defaults = RendererConfig::default();

    // Resource / stylesheet interception.
    assert!(!defaults.chrome_intercept_resources);
    assert!(!defaults.chrome_intercept_stylesheets);
    assert!(defaults.chrome_host_intercept_disable.is_empty());

    // Navigation budget.
    assert_eq!(defaults.chrome_nav_budget_ms, 12_000);

    // Remaining opt-in features.
    assert!(!defaults.chrome_context_pool_enabled);
    assert!(!defaults.use_predictor);
}
1626
/// Pins the crawler's per-host limiter defaults: a minimum interval of 0 ms
/// and a maximum of 1 concurrent request.
#[test]
fn crawler_per_host_limiter_defaults() {
    let defaults = CrawlerConfig::default();

    assert_eq!(defaults.per_host_min_interval_ms, 0);
    assert_eq!(defaults.per_host_max_concurrent, 1);
}
1633
/// `CRW_`-prefixed environment variables (with `__` between path segments)
/// must override the corresponding values in the loaded configuration.
#[test]
fn env_var_overrides_toml_defaults() {
    // A poisoned lock only means another env test panicked while holding it.
    // Recover the guard so this test reports its own result instead of a
    // cascading `PoisonError`.
    let _guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    clear_renderer_env();

    // SAFETY: `ENV_LOCK` is held for the whole test, so this mutation is
    // serialized with every other test that takes the lock.
    unsafe {
        std::env::set_var("CRW_SERVER__PORT", "4444");
        std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
    }

    // Capture the result first and clean up before unwrapping, so a failed
    // load cannot leak these overrides into later tests.
    let loaded = AppConfig::load();

    // This test set both variables, so it removes both itself rather than
    // relying on the helper to cover them.
    // SAFETY: same as above — `ENV_LOCK` is still held.
    unsafe {
        std::env::remove_var("CRW_SERVER__PORT");
        std::env::remove_var("CRW_RENDERER__LIGHTPANDA__WS_URL");
    }
    clear_renderer_env();

    let cfg = loaded.unwrap();

    assert_eq!(cfg.server.port, 4444, "env var should override server.port");
    assert_eq!(
        cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
        "ws://test:9999/",
        "env var should override renderer.lightpanda.ws_url"
    );
}
1652
/// When `CRW_USER_CONFIG_DIR` is set, `user_config_path` must resolve to
/// `config.toml` inside that directory.
#[test]
fn user_config_path_honors_override_env() {
    // Recover from a poisoned lock so an unrelated failing env test does not
    // make this one fail with a `PoisonError`.
    let _guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());

    // The process id keeps the directory name unique per test run; the
    // directory itself is never created.
    let override_dir =
        std::env::temp_dir().join(format!("crw-cfg-test-{}", std::process::id()));

    // SAFETY: `ENV_LOCK` is held for the whole test, so this mutation is
    // serialized with every other test that takes the lock.
    unsafe {
        std::env::set_var("CRW_USER_CONFIG_DIR", &override_dir);
    }

    // Resolve first, restore the environment, and only then unwrap: a failure
    // here must not leave the override set for later tests.
    let resolved = user_config_path();

    // SAFETY: same as above — `ENV_LOCK` is still held.
    unsafe {
        std::env::remove_var("CRW_USER_CONFIG_DIR");
    }

    assert_eq!(resolved.unwrap(), override_dir.join("config.toml"));
}
1666
/// A `config.toml` placed in the directory named by `CRW_USER_CONFIG_DIR` must
/// be read by `AppConfig::load`, populating the `client`, `search` and
/// `extraction.llm` sections.
#[test]
fn user_config_file_is_picked_up_by_load() {
    // Recover from a poisoned lock so an unrelated failing env test does not
    // make this one fail with a `PoisonError`.
    let _guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    clear_renderer_env();

    let config_dir =
        std::env::temp_dir().join(format!("crw-load-test-{}", std::process::id()));
    std::fs::create_dir_all(&config_dir).unwrap();
    let config_file = config_dir.join("config.toml");
    std::fs::write(
        &config_file,
        r#"
[client]
api_url = "https://api.example.com"
api_key = "test-key-123"

[search]
searxng_url = "http://localhost:9999"

[extraction.llm]
provider = "deepseek"
api_key = "sk-test"
model = "deepseek-chat"
"#,
    )
    .unwrap();

    // SAFETY: `ENV_LOCK` is held for the whole test, so this mutation is
    // serialized with every other test that takes the lock.
    unsafe {
        std::env::set_var("CRW_USER_CONFIG_DIR", &config_dir);
    }

    // Capture the result, then restore the environment and delete the
    // fixture before unwrapping, so a failed load leaks neither.
    let loaded = AppConfig::load();

    // SAFETY: same as above — `ENV_LOCK` is still held.
    unsafe {
        std::env::remove_var("CRW_USER_CONFIG_DIR");
    }
    // Best-effort: a leftover temp directory must not fail the test.
    std::fs::remove_dir_all(&config_dir).ok();

    let cfg = loaded.unwrap();

    assert_eq!(
        cfg.client.api_url.as_deref(),
        Some("https://api.example.com")
    );
    assert_eq!(cfg.client.api_key.as_deref(), Some("test-key-123"));
    assert_eq!(
        cfg.search.searxng_url.as_deref(),
        Some("http://localhost:9999")
    );
    let llm = cfg.extraction.llm.expect("llm config present");
    assert_eq!(llm.provider, "deepseek");
    assert_eq!(llm.api_key, "sk-test");
}
1714
/// When both the user config file and an environment variable set
/// `search.searxng_url`, the environment variable must win.
#[test]
fn env_var_beats_user_config() {
    // Recover from a poisoned lock so an unrelated failing env test does not
    // make this one fail with a `PoisonError`.
    let _guard = ENV_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    clear_renderer_env();

    let config_dir =
        std::env::temp_dir().join(format!("crw-prec-test-{}", std::process::id()));
    std::fs::create_dir_all(&config_dir).unwrap();
    std::fs::write(
        config_dir.join("config.toml"),
        r#"
[search]
searxng_url = "http://from-file:8080"
"#,
    )
    .unwrap();

    // SAFETY: `ENV_LOCK` is held for the whole test, so this mutation is
    // serialized with every other test that takes the lock.
    unsafe {
        std::env::set_var("CRW_USER_CONFIG_DIR", &config_dir);
        std::env::set_var("CRW_SEARCH__SEARXNG_URL", "http://from-env:8080");
    }

    // Capture the result, then restore the environment and delete the
    // fixture before unwrapping, so a failed load leaks neither.
    let loaded = AppConfig::load();

    // SAFETY: same as above — `ENV_LOCK` is still held.
    unsafe {
        std::env::remove_var("CRW_USER_CONFIG_DIR");
        std::env::remove_var("CRW_SEARCH__SEARXNG_URL");
    }
    // Best-effort: a leftover temp directory must not fail the test.
    std::fs::remove_dir_all(&config_dir).ok();

    let cfg = loaded.unwrap();

    assert_eq!(
        cfg.search.searxng_url.as_deref(),
        Some("http://from-env:8080"),
        "env var must win over user config file"
    );
}
1747
/// With base proxy credentials configured, the username gains a
/// `__cr.<country>` suffix: the requested country when given (lower-cased),
/// otherwise the configured default country. The password is passed through.
#[test]
fn effective_proxy_credentials_appends_country_suffix() {
    let renderer = RendererConfig {
        proxy_base_user: Some("abc".into()),
        proxy_base_pass: Some("pw".into()),
        proxy_default_country: Some("de".into()),
        ..Default::default()
    };

    // Explicit lowercase country.
    let (explicit_user, explicit_pass) =
        renderer.effective_proxy_credentials(Some("us")).unwrap();
    assert_eq!(explicit_user, "abc__cr.us");
    assert_eq!(explicit_pass, "pw");

    // Explicit uppercase country.
    let (uppercase_user, _) = renderer.effective_proxy_credentials(Some("GB")).unwrap();
    assert_eq!(uppercase_user, "abc__cr.gb", "uppercase input is normalized");

    // No country requested: falls back to `proxy_default_country`.
    let (fallback_user, _) = renderer.effective_proxy_credentials(None).unwrap();
    assert_eq!(fallback_user, "abc__cr.de");
}
1766
/// Country inputs that are three letters long, contain a digit, or are only
/// whitespace must leave the base username without any suffix.
#[test]
fn effective_proxy_credentials_invalid_country_uses_global_pool() {
    let renderer = RendererConfig {
        proxy_base_user: Some("abc".into()),
        proxy_base_pass: Some("pw".into()),
        ..Default::default()
    };
    // Resolve only the username for a given requested country.
    let user_for = |country: &str| {
        renderer
            .effective_proxy_credentials(Some(country))
            .unwrap()
            .0
    };

    // Too long.
    assert_eq!(user_for("usa"), "abc");
    // Contains a digit.
    assert_eq!(user_for("u1"), "abc");
    // Whitespace only.
    assert_eq!(user_for(" "), "abc");
}
1784
/// No credentials are produced for a default renderer config, nor for one
/// that has a base user but no base password.
#[test]
fn effective_proxy_credentials_no_base_returns_none() {
    let unconfigured = RendererConfig::default();
    let from_unconfigured = unconfigured.effective_proxy_credentials(Some("us"));
    assert!(from_unconfigured.is_none());

    let user_without_pass = RendererConfig {
        proxy_base_user: Some("abc".into()),
        ..Default::default()
    };
    let from_partial = user_without_pass.effective_proxy_credentials(Some("us"));
    assert!(from_partial.is_none());
}
1796}