use serde::Deserialize;
/// Root configuration object for the application.
///
/// Each field is one configuration section. Every section is marked
/// `#[serde(default)]`, so a section that is absent from the input
/// deserializes to that section type's `Default` value.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct AppConfig {
/// Server section (host, port, request timeout, rate limit).
#[serde(default)]
pub server: ServerConfig,
/// Renderer section (mode, per-tier timeouts, CDP endpoints, proxy credentials).
#[serde(default)]
pub renderer: RendererConfig,
/// Crawler section (concurrency, request rate, robots.txt flag, stealth options).
#[serde(default)]
pub crawler: CrawlerConfig,
/// Extraction section (default format, optional LLM settings, retry thresholds).
#[serde(default)]
pub extraction: ExtractionConfig,
/// Authentication section (list of API keys).
#[serde(default)]
pub auth: AuthConfig,
/// Per-request deadline section.
#[serde(default)]
pub request: RequestConfig,
/// Search section (SearXNG URL, limits, engine lists, feature toggles).
#[serde(default)]
pub search: SearchConfig,
/// Map section (URL filter settings).
#[serde(default)]
pub map: MapConfig,
/// Client section (optional API URL and key).
#[serde(default)]
pub client: ClientConfig,
}
/// Client section of the configuration.
///
/// Both fields are optional and deserialize to `None` when absent.
/// NOTE(review): presumably the remote API endpoint and credential used when
/// acting as a client — the consumers are not visible in this file.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct ClientConfig {
#[serde(default)]
pub api_url: Option<String>,
#[serde(default)]
pub api_key: Option<String>,
}
/// Map section of the configuration; currently holds only the URL filter settings.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct MapConfig {
/// URL filter settings; falls back to `MapUrlFilterConfig::default()` when absent.
#[serde(default)]
pub url_filter: MapUrlFilterConfig,
}
/// URL filter settings for the map section.
///
/// The two leading flags default to `true`, `gov_tld_drop_actions` defaults to
/// `false`, and the three `extra_*` lists default to empty.
/// NOTE(review): the filtering logic that reads these fields is outside this
/// file; the field names suggest tracking-parameter stripping and action-URL
/// dropping — confirm against the consumer.
#[derive(Debug, Clone, Deserialize)]
pub struct MapUrlFilterConfig {
// Defaults to true via `default_true_filter`.
#[serde(default = "default_true_filter")]
pub strip_tracking_params: bool,
// Defaults to true via `default_true_filter`.
#[serde(default = "default_true_filter")]
pub drop_action_urls: bool,
// Defaults to false (plain `#[serde(default)]`).
#[serde(default)]
pub gov_tld_drop_actions: bool,
// The three lists below default to empty.
#[serde(default)]
pub extra_tracking_params: Vec<String>,
#[serde(default)]
pub extra_action_params: Vec<String>,
#[serde(default)]
pub extra_preserve_params: Vec<String>,
}
impl Default for MapUrlFilterConfig {
fn default() -> Self {
Self {
strip_tracking_params: true,
drop_action_urls: true,
gov_tld_drop_actions: false,
extra_tracking_params: Vec::new(),
extra_action_params: Vec::new(),
extra_preserve_params: Vec::new(),
}
}
}
/// Serde default for the `MapUrlFilterConfig` flags that are on unless disabled.
fn default_true_filter() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
/// Fixed overhead in milliseconds that
/// `RendererConfig::min_deadline_for_full_ladder_ms` adds once per counted CDP tier.
pub const CDP_TIER_OVERHEAD_MS: u64 = 28_000;
/// Upper bound in milliseconds applied to a request's `wait_for_ms` before
/// `AppConfig::effective_deadline_ms` uses it to extend the deadline.
pub const MAX_WAIT_FOR_MS: u64 = 60_000;
/// Search section of the configuration.
///
/// `timeout_ms` and `max_limit` feed
/// `AppConfig::effective_request_timeout_secs`; the consumers of the other
/// fields are outside this file.
#[derive(Debug, Clone, Deserialize)]
pub struct SearchConfig {
// Defaults to true.
#[serde(default = "default_true_search")]
pub enabled: bool,
// Optional; `None` when not configured.
#[serde(default)]
pub searxng_url: Option<String>,
// Defaults to 15_000.
#[serde(default = "default_search_timeout_ms")]
pub timeout_ms: u64,
// Defaults to 5.
#[serde(default = "default_search_limit")]
pub default_limit: u32,
// Defaults to 20.
#[serde(default = "default_search_max_limit")]
pub max_limit: u32,
// Defaults to arxiv, crossref, google scholar, semantic scholar.
#[serde(default = "default_research_engines")]
pub research_engines: Vec<String>,
// Defaults to a single "github" entry.
#[serde(default = "default_github_engines")]
pub github_engines: Vec<String>,
// Defaults to true.
#[serde(default = "default_true_search")]
pub rerank_enabled: bool,
// Defaults to false.
#[serde(default)]
pub query_expand: bool,
// Defaults to 1.
#[serde(default = "default_query_expand_variants")]
pub query_expand_variants: usize,
// Every toggle below defaults to false (plain `#[serde(default)]`).
#[serde(default)]
pub multi_round: bool,
#[serde(default)]
pub passage_select: bool,
#[serde(default)]
pub page2_fallback: bool,
#[serde(default)]
pub answer_calibrated: bool,
#[serde(default)]
pub answer_guarded: bool,
#[serde(default)]
pub use_structured_sources: bool,
#[serde(default)]
pub wikidata_lookup: bool,
#[serde(default)]
pub snippet_fallback: bool,
}
impl Default for SearchConfig {
fn default() -> Self {
Self {
enabled: true,
searxng_url: None,
timeout_ms: default_search_timeout_ms(),
default_limit: default_search_limit(),
max_limit: default_search_max_limit(),
research_engines: default_research_engines(),
github_engines: default_github_engines(),
rerank_enabled: true,
query_expand: false,
query_expand_variants: default_query_expand_variants(),
multi_round: false,
passage_select: false,
page2_fallback: false,
answer_calibrated: false,
answer_guarded: false,
use_structured_sources: false,
wikidata_lookup: false,
snippet_fallback: false,
}
}
}
/// Serde default for `SearchConfig::query_expand_variants`.
fn default_query_expand_variants() -> usize {
    const VARIANTS: usize = 1;
    VARIANTS
}
/// Serde default for the `SearchConfig` flags that are on unless disabled
/// (`enabled`, `rerank_enabled`).
fn default_true_search() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
/// Serde default for `SearchConfig::timeout_ms`: 15 seconds, in milliseconds.
fn default_search_timeout_ms() -> u64 {
    const SECONDS: u64 = 15;
    SECONDS * 1_000
}
/// Serde default for `SearchConfig::default_limit`.
fn default_search_limit() -> u32 {
    const LIMIT: u32 = 5;
    LIMIT
}
/// Serde default for `SearchConfig::max_limit`.
fn default_search_max_limit() -> u32 {
    const MAX_LIMIT: u32 = 20;
    MAX_LIMIT
}
/// Serde default for `SearchConfig::research_engines`.
fn default_research_engines() -> Vec<String> {
    const ENGINES: [&str; 4] = ["arxiv", "crossref", "google scholar", "semantic scholar"];
    ENGINES.iter().map(|name| String::from(*name)).collect()
}
/// Serde default for `SearchConfig::github_engines`: a single `"github"` entry.
fn default_github_engines() -> Vec<String> {
    let mut engines = Vec::with_capacity(1);
    engines.push(String::from("github"));
    engines
}
/// Per-request deadline section of the configuration.
#[derive(Debug, Clone, Deserialize)]
pub struct RequestConfig {
/// Baseline deadline in milliseconds (defaults to 8000). Returned by
/// `AppConfig::effective_deadline_ms` when no explicit deadline is requested
/// and no extension applies.
#[serde(default = "default_deadline_ms")]
pub deadline_ms_default: u64,
/// When true (the default), `AppConfig::effective_deadline_ms` may raise the
/// baseline to the renderer ladder minimum, and
/// `AppConfig::effective_request_timeout_secs` may raise the server timeout.
#[serde(default = "default_true_request")]
pub auto_extend_deadline_for_ladder: bool,
}
impl Default for RequestConfig {
fn default() -> Self {
Self {
deadline_ms_default: default_deadline_ms(),
auto_extend_deadline_for_ladder: true,
}
}
}
/// Serde default for `RequestConfig::auto_extend_deadline_for_ladder`.
fn default_true_request() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
/// Serde default for `RequestConfig::deadline_ms_default`: 8 seconds, in milliseconds.
fn default_deadline_ms() -> u64 {
    const SECONDS: u64 = 8;
    SECONDS * 1_000
}
/// Server section of the configuration.
#[derive(Debug, Clone, Deserialize)]
pub struct ServerConfig {
// Defaults to "0.0.0.0".
#[serde(default = "default_host")]
pub host: String,
// Defaults to 3000.
#[serde(default = "default_port")]
pub port: u16,
/// Baseline request timeout in seconds (defaults to 60);
/// `AppConfig::effective_request_timeout_secs` never returns less than this.
#[serde(default = "default_request_timeout")]
pub request_timeout_secs: u64,
// Defaults to 10.
#[serde(default = "default_rate_limit_rps")]
pub rate_limit_rps: u64,
}
impl Default for ServerConfig {
    /// Fills every field from the same helper serde uses for that field.
    fn default() -> Self {
        let host = default_host();
        let port = default_port();
        let request_timeout_secs = default_request_timeout();
        let rate_limit_rps = default_rate_limit_rps();
        Self {
            host,
            port,
            request_timeout_secs,
            rate_limit_rps,
        }
    }
}
/// Serde default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    const REQUESTS_PER_SECOND: u64 = 10;
    REQUESTS_PER_SECOND
}
/// Serde default for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Serde default for `ServerConfig::port`.
fn default_port() -> u16 {
    const PORT: u16 = 3000;
    PORT
}
/// Serde default for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    const SECONDS: u64 = 60;
    SECONDS
}
/// Which renderer tier(s) the configuration selects.
///
/// Deserialized from the lowercase variant name (`auto`, `none`,
/// `lightpanda`, `chrome`, `playwright`).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RendererMode {
/// Default. In `RendererConfig::cdp_tier_count` this selects every CDP tier
/// that has an endpoint configured.
#[default]
Auto,
/// `RendererConfig::min_deadline_for_full_ladder_ms` skips the HTTP timeout
/// in this mode and no CDP tier is selected.
None,
// Each variant below selects only the tier of the same name.
Lightpanda,
Chrome,
Playwright,
}
/// Renderer section of the configuration.
///
/// Only the fields read by the `impl RendererConfig` methods in this file are
/// described in detail; the consumers of the remaining fields are elsewhere.
#[derive(Debug, Clone, Deserialize)]
pub struct RendererConfig {
// Defaults to `RendererMode::Auto`.
#[serde(default)]
pub mode: RendererMode,
/// Fallback timeout in milliseconds (defaults to 30000) used by
/// `http_timeout`, `lightpanda_timeout` and `chrome_timeout` when their
/// specific override is `None`.
#[serde(default = "default_page_timeout")]
pub page_timeout_ms: u64,
// Per-tier overrides of `page_timeout_ms`; `None` means "use the fallback".
#[serde(default)]
pub http_timeout_ms: Option<u64>,
#[serde(default)]
pub lightpanda_timeout_ms: Option<u64>,
#[serde(default)]
pub chrome_timeout_ms: Option<u64>,
// Defaults to 4.
#[serde(default = "default_pool_size")]
pub pool_size: usize,
// Also accepted under the key `force_js`.
#[serde(default, alias = "force_js")]
pub render_js_default: Option<bool>,
// Optional endpoints. In `Auto` mode a tier is counted by `cdp_tier_count`
// only when its endpoint here is `Some`.
#[serde(default)]
pub lightpanda: Option<CdpEndpoint>,
#[serde(default)]
pub playwright: Option<CdpEndpoint>,
#[serde(default)]
pub chrome: Option<CdpEndpoint>,
#[serde(default)]
pub chrome_proxy: Option<CdpEndpoint>,
/// Override for `chrome_proxy_timeout`; when `None` that method returns
/// `chrome_timeout()` plus 15000 ms.
#[serde(default)]
pub chrome_proxy_timeout_ms: Option<u64>,
#[serde(default)]
pub chrome_intercept_resources: bool,
#[serde(default)]
pub chrome_intercept_stylesheets: bool,
#[serde(default)]
pub chrome_host_intercept_disable: Vec<String>,
// Defaults to 12_000.
#[serde(default = "default_chrome_nav_budget_ms")]
pub chrome_nav_budget_ms: u64,
#[serde(default)]
pub chrome_context_pool_enabled: bool,
#[serde(default)]
pub chrome_pool: ChromePoolConfig,
#[serde(default)]
pub chrome_backend: ChromeBackend,
#[serde(default)]
pub use_predictor: bool,
#[serde(default)]
pub escalation: EscalationConfig,
#[serde(default)]
pub antibot: AntibotConfig,
// Base proxy credentials; `effective_proxy_credentials` returns `None`
// unless both user and pass are set.
#[serde(default)]
pub proxy_base_user: Option<String>,
#[serde(default)]
pub proxy_base_pass: Option<String>,
// Country used by `effective_proxy_credentials` when the caller passes none.
#[serde(default)]
pub proxy_default_country: Option<String>,
}
/// Escalation sub-section of the renderer configuration.
///
/// NOTE(review): the escalation logic that reads these fields is outside this
/// file; only the defaults are documented here.
#[derive(Debug, Clone, Deserialize)]
pub struct EscalationConfig {
// Defaults to false.
#[serde(default)]
pub enabled: bool,
// Defaults to 8_000.
#[serde(default = "default_waterfall_timeout_ms")]
pub waterfall_timeout_ms: u64,
// Defaults to 60_000.
#[serde(default = "default_escalation_global_timeout_ms")]
pub global_timeout_ms: u64,
// Defaults to false.
#[serde(default)]
pub residential_proxy: bool,
// Defaults to "us".
#[serde(default = "default_proxy_country")]
pub proxy_country: String,
}
impl Default for EscalationConfig {
    /// Escalation and residential proxying start disabled; timeouts and the
    /// proxy country come from the same helpers serde uses.
    fn default() -> Self {
        let waterfall_timeout_ms = default_waterfall_timeout_ms();
        let global_timeout_ms = default_escalation_global_timeout_ms();
        let proxy_country = default_proxy_country();
        Self {
            enabled: false,
            waterfall_timeout_ms,
            global_timeout_ms,
            residential_proxy: false,
            proxy_country,
        }
    }
}
/// Serde default for `EscalationConfig::waterfall_timeout_ms`: 8 seconds, in milliseconds.
fn default_waterfall_timeout_ms() -> u64 {
    const SECONDS: u64 = 8;
    SECONDS * 1_000
}
/// Serde default for `EscalationConfig::global_timeout_ms`: 60 seconds, in milliseconds.
fn default_escalation_global_timeout_ms() -> u64 {
    const SECONDS: u64 = 60;
    SECONDS * 1_000
}
/// Serde default for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
/// Anti-bot sub-section of the renderer configuration.
///
/// NOTE(review): the detection/escalation code that reads these flags is
/// outside this file; only the defaults are documented here.
#[derive(Debug, Clone, Deserialize)]
pub struct AntibotConfig {
// Defaults to true.
#[serde(default = "default_true")]
pub enabled: bool,
// Defaults to false.
#[serde(default)]
pub escalate_on_signal: bool,
// Defaults to true.
#[serde(default = "default_true")]
pub escalate_in_failover: bool,
}
impl Default for AntibotConfig {
fn default() -> Self {
Self {
enabled: true,
escalate_on_signal: false,
escalate_in_failover: true,
}
}
}
/// Serde default for `RendererConfig::chrome_nav_budget_ms`: 12 seconds, in milliseconds.
fn default_chrome_nav_budget_ms() -> u64 {
    const SECONDS: u64 = 12;
    SECONDS * 1_000
}
/// Chrome pool sub-section of the renderer configuration.
///
/// NOTE(review): the pool implementation that reads these fields is outside
/// this file; only the defaults are documented here.
#[derive(Debug, Clone, Deserialize)]
pub struct ChromePoolConfig {
// Optional; `None` when not configured.
#[serde(default)]
pub size: Option<usize>,
// Defaults to 1.
#[serde(default = "default_recycle_after_navs")]
pub recycle_after_navs: u32,
// Defaults to 300.
#[serde(default = "default_idle_timeout_secs")]
pub idle_timeout_secs: u64,
// Defaults to 60.
#[serde(default = "default_health_check_secs")]
pub health_check_secs: u64,
// Defaults to 30.
#[serde(default = "default_shutdown_drain_secs")]
pub shutdown_drain_secs: u64,
}
impl Default for ChromePoolConfig {
    /// No explicit pool size; every other field comes from its serde helper.
    fn default() -> Self {
        let recycle_after_navs = default_recycle_after_navs();
        let idle_timeout_secs = default_idle_timeout_secs();
        let health_check_secs = default_health_check_secs();
        let shutdown_drain_secs = default_shutdown_drain_secs();
        Self {
            size: None,
            recycle_after_navs,
            idle_timeout_secs,
            health_check_secs,
            shutdown_drain_secs,
        }
    }
}
/// Serde default for `ChromePoolConfig::recycle_after_navs`.
fn default_recycle_after_navs() -> u32 {
    const NAVIGATIONS: u32 = 1;
    NAVIGATIONS
}
/// Serde default for `ChromePoolConfig::idle_timeout_secs`: five minutes, in seconds.
fn default_idle_timeout_secs() -> u64 {
    const MINUTES: u64 = 5;
    MINUTES * 60
}
/// Serde default for `ChromePoolConfig::health_check_secs`.
fn default_health_check_secs() -> u64 {
    const SECONDS: u64 = 60;
    SECONDS
}
/// Serde default for `ChromePoolConfig::shutdown_drain_secs`.
fn default_shutdown_drain_secs() -> u64 {
    const SECONDS: u64 = 30;
    SECONDS
}
/// Chrome backend selector, deserialized from the lowercase variant name
/// (`vanilla`, `browserless`).
///
/// NOTE(review): how each backend is driven is decided outside this file.
#[derive(Debug, Clone, Copy, Default, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum ChromeBackend {
/// Default backend.
#[default]
Vanilla,
Browserless,
}
impl Default for RendererConfig {
    /// Mirrors the serde field defaults: `Auto` mode, helper-provided
    /// timeouts and pool size, no endpoints or proxy credentials, every
    /// boolean toggle off, and default sub-sections.
    fn default() -> Self {
        Self {
            // Mode and timeouts.
            mode: Default::default(),
            page_timeout_ms: default_page_timeout(),
            http_timeout_ms: None,
            lightpanda_timeout_ms: None,
            chrome_timeout_ms: None,
            pool_size: default_pool_size(),
            render_js_default: None,
            // Endpoints are all unset until configured.
            lightpanda: None,
            playwright: None,
            chrome: None,
            chrome_proxy: None,
            chrome_proxy_timeout_ms: None,
            // Chrome behaviour toggles.
            chrome_intercept_resources: false,
            chrome_intercept_stylesheets: false,
            chrome_host_intercept_disable: Vec::default(),
            chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
            chrome_context_pool_enabled: false,
            chrome_pool: Default::default(),
            chrome_backend: Default::default(),
            use_predictor: false,
            // Nested sub-sections.
            escalation: Default::default(),
            antibot: Default::default(),
            // Proxy credentials.
            proxy_base_user: None,
            proxy_base_pass: None,
            proxy_default_country: None,
        }
    }
}
/// Serde default for `RendererConfig::page_timeout_ms`: 30 seconds, in milliseconds.
fn default_page_timeout() -> u64 {
    const SECONDS: u64 = 30;
    SECONDS * 1_000
}
impl RendererConfig {
    /// Timeout in milliseconds for the HTTP tier; falls back to `page_timeout_ms`.
    pub fn http_timeout(&self) -> u64 {
        self.http_timeout_ms.unwrap_or(self.page_timeout_ms)
    }

    /// Timeout in milliseconds for the Lightpanda tier; falls back to `page_timeout_ms`.
    pub fn lightpanda_timeout(&self) -> u64 {
        self.lightpanda_timeout_ms.unwrap_or(self.page_timeout_ms)
    }

    /// Timeout in milliseconds for the Chrome tier; falls back to `page_timeout_ms`.
    pub fn chrome_timeout(&self) -> u64 {
        self.chrome_timeout_ms.unwrap_or(self.page_timeout_ms)
    }

    /// Timeout in milliseconds for the Chrome-via-proxy path.
    ///
    /// Uses `chrome_proxy_timeout_ms` when set, otherwise the Chrome timeout
    /// plus 15 000 ms (saturating).
    pub fn chrome_proxy_timeout(&self) -> u64 {
        self.chrome_proxy_timeout_ms
            .unwrap_or_else(|| self.chrome_timeout().saturating_add(15_000))
    }

    /// Builds the proxy `(user, pass)` pair, optionally pinned to a country.
    ///
    /// Returns `None` unless both `proxy_base_user` and `proxy_base_pass` are
    /// set. The country is `country` if given, otherwise
    /// `proxy_default_country`. After trimming, it must be exactly two ASCII
    /// letters; it is then lowercased and appended to the user as
    /// `{user}__cr.{cc}`. Any other value yields the bare base user. An
    /// invalid explicit `country` does not fall back to the default country.
    pub fn effective_proxy_credentials(&self, country: Option<&str>) -> Option<(String, String)> {
        let user = self.proxy_base_user.as_ref()?;
        let pass = self.proxy_base_pass.as_ref()?;
        // Validate BEFORE lowercasing: Unicode case folding maps some
        // non-ASCII characters (e.g. U+212A KELVIN SIGN) onto ASCII letters,
        // which would otherwise slip past the ASCII check.
        let cc = country
            .or(self.proxy_default_country.as_deref())
            .map(str::trim)
            .filter(|s| s.len() == 2 && s.bytes().all(|b| b.is_ascii_alphabetic()))
            .map(str::to_ascii_lowercase);
        Some(match cc {
            Some(cc) => (format!("{user}__cr.{cc}"), pass.clone()),
            None => (user.clone(), pass.clone()),
        })
    }

    /// Timeout of each CDP tier that is selected, in the order Lightpanda,
    /// Playwright, Chrome; `None` marks a tier that is not selected.
    ///
    /// A tier is selected when the `cdp` feature is compiled in, the mode is
    /// `Auto` or names that tier, and the tier's endpoint is configured.
    /// Single source of truth for `cdp_tier_count` and
    /// `min_deadline_for_full_ladder_ms`.
    fn cdp_tier_timeouts(&self) -> [Option<u64>; 3] {
        if !cfg!(feature = "cdp") {
            return [None; 3];
        }
        let selected = |m: RendererMode| -> bool {
            matches!(self.mode, RendererMode::Auto) || self.mode == m
        };
        [
            (selected(RendererMode::Lightpanda) && self.lightpanda.is_some())
                .then(|| self.lightpanda_timeout()),
            // The Playwright tier is budgeted with the Chrome timeout.
            (selected(RendererMode::Playwright) && self.playwright.is_some())
                .then(|| self.chrome_timeout()),
            (selected(RendererMode::Chrome) && self.chrome.is_some())
                .then(|| self.chrome_timeout()),
        ]
    }

    /// Number of CDP tiers that are selected (always 0 without the `cdp` feature).
    pub fn cdp_tier_count(&self) -> usize {
        self.cdp_tier_timeouts().iter().flatten().count()
    }

    /// Sum in milliseconds of the timeouts of every selected tier, plus
    /// `CDP_TIER_OVERHEAD_MS` per selected CDP tier.
    ///
    /// The HTTP timeout is included unless the mode is `None`. All arithmetic
    /// saturates instead of overflowing.
    pub fn min_deadline_for_full_ladder_ms(&self) -> u64 {
        let mut sum: u64 = if matches!(self.mode, RendererMode::None) {
            0
        } else {
            self.http_timeout()
        };
        let mut cdp_tier_count: u64 = 0;
        for timeout in self.cdp_tier_timeouts().iter().flatten() {
            sum = sum.saturating_add(*timeout);
            cdp_tier_count += 1;
        }
        sum.saturating_add(cdp_tier_count.saturating_mul(CDP_TIER_OVERHEAD_MS))
    }
}
/// Serde default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    const POOL_SIZE: usize = 4;
    POOL_SIZE
}
/// Endpoint description used for the renderer's `lightpanda`, `playwright`,
/// `chrome` and `chrome_proxy` settings.
#[derive(Debug, Clone, Deserialize)]
pub struct CdpEndpoint {
/// Required (no serde default). The tests in this file use `ws://host:port`
/// values — presumably a WebSocket URL; confirm against the connecting code.
pub ws_url: String,
}
/// Stealth sub-section of the crawler configuration.
///
/// NOTE(review): the code that applies these settings is outside this file;
/// only the defaults are documented here.
#[derive(Debug, Clone, Deserialize)]
pub struct StealthConfig {
// Defaults to false.
#[serde(default)]
pub enabled: bool,
// Defaults to an empty list.
#[serde(default)]
pub user_agents: Vec<String>,
// Defaults to 0.2.
#[serde(default = "default_jitter")]
pub jitter_factor: f64,
// Defaults to true.
#[serde(default = "default_true")]
pub inject_headers: bool,
}
impl Default for StealthConfig {
fn default() -> Self {
Self {
enabled: false,
user_agents: vec![],
jitter_factor: default_jitter(),
inject_headers: true,
}
}
}
/// Serde default for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    const JITTER_FACTOR: f64 = 0.2;
    JITTER_FACTOR
}
/// Built-in list of five browser User-Agent strings (Chrome on Windows, macOS
/// and Linux; Firefox on Windows; Safari on macOS).
///
/// NOTE(review): not referenced elsewhere in this file — presumably the
/// fallback pool when `StealthConfig::user_agents` is empty; confirm against
/// the consumer.
pub const BUILTIN_UA_POOL: &[&str] = &[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
/// Crawler section of the configuration.
///
/// `max_concurrency` also feeds `AppConfig::effective_request_timeout_secs`
/// (clamped to at least 1 there); the consumers of the other fields are
/// outside this file.
#[derive(Debug, Clone, Deserialize)]
pub struct CrawlerConfig {
// Defaults to 10.
#[serde(default = "default_concurrency")]
pub max_concurrency: usize,
// Defaults to 10.0.
#[serde(default = "default_rps")]
pub requests_per_second: f64,
// Defaults to true.
#[serde(default = "default_true")]
pub respect_robots_txt: bool,
// Defaults to a desktop Chrome-on-macOS User-Agent string (see `default_ua`).
#[serde(default = "default_ua")]
pub user_agent: String,
// Defaults to 2.
#[serde(default = "default_depth")]
pub default_max_depth: u32,
// Defaults to 100.
#[serde(default = "default_max_pages")]
pub default_max_pages: u32,
// Optional; `None` when not configured.
#[serde(default)]
pub proxy: Option<String>,
// Defaults to 3600.
#[serde(default = "default_job_ttl")]
pub job_ttl_secs: u64,
// Defaults to `StealthConfig::default()`.
#[serde(default)]
pub stealth: StealthConfig,
// Defaults to 0.
#[serde(default)]
pub per_host_min_interval_ms: u64,
// Defaults to 1.
#[serde(default = "default_per_host_max_concurrent")]
pub per_host_max_concurrent: u32,
}
/// Serde default for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    const PER_HOST: u32 = 1;
    PER_HOST
}
impl Default for CrawlerConfig {
fn default() -> Self {
Self {
max_concurrency: default_concurrency(),
requests_per_second: default_rps(),
respect_robots_txt: true,
user_agent: default_ua(),
default_max_depth: default_depth(),
default_max_pages: default_max_pages(),
proxy: None,
job_ttl_secs: default_job_ttl(),
stealth: StealthConfig::default(),
per_host_min_interval_ms: 0,
per_host_max_concurrent: default_per_host_max_concurrent(),
}
}
}
/// Serde default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    const WORKERS: usize = 10;
    WORKERS
}
/// Serde default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    const REQUESTS_PER_SECOND: f64 = 10.0;
    REQUESTS_PER_SECOND
}
/// Shared serde default for flags that are on unless disabled
/// (used by `AntibotConfig`, `StealthConfig` and `CrawlerConfig`).
fn default_true() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
/// Serde default for `CrawlerConfig::user_agent`: a desktop Chrome-on-macOS
/// User-Agent string.
fn default_ua() -> String {
    String::from(concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ",
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    ))
}
/// Serde default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    const MAX_DEPTH: u32 = 2;
    MAX_DEPTH
}
/// Serde default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    const MAX_PAGES: u32 = 100;
    MAX_PAGES
}
/// Serde default for `CrawlerConfig::job_ttl_secs`: one hour, in seconds.
fn default_job_ttl() -> u64 {
    const MINUTES: u64 = 60;
    MINUTES * 60
}
/// Extraction section of the configuration.
///
/// NOTE(review): the extraction pipeline that reads these fields is outside
/// this file; only the defaults are documented here.
#[derive(Debug, Clone, Deserialize)]
pub struct ExtractionConfig {
// Defaults to "markdown".
#[serde(default = "default_format")]
pub default_format: String,
// Defaults to true.
#[serde(default = "default_true_ext")]
pub only_main_content: bool,
// Optional LLM settings; `None` when the sub-section is absent.
#[serde(default)]
pub llm: Option<LlmConfig>,
// Defaults to an empty map.
#[serde(default)]
pub domain_selectors: std::collections::HashMap<String, String>,
// Defaults to `LlmFallbackConfig::default()`.
#[serde(default)]
pub llm_fallback: LlmFallbackConfig,
// Defaults to 100.
#[serde(default = "default_http_retry_threshold")]
pub http_retry_threshold_bytes: usize,
// Defaults to 2000.
#[serde(default = "default_lightpanda_retry_threshold")]
pub lightpanda_retry_threshold_bytes: usize,
}
/// Serde default for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    const THRESHOLD_BYTES: usize = 100;
    THRESHOLD_BYTES
}
/// Serde default for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    const THRESHOLD_BYTES: usize = 2000;
    THRESHOLD_BYTES
}
impl Default for ExtractionConfig {
fn default() -> Self {
Self {
default_format: default_format(),
only_main_content: true,
llm: None,
domain_selectors: std::collections::HashMap::new(),
llm_fallback: LlmFallbackConfig::default(),
http_retry_threshold_bytes: default_http_retry_threshold(),
lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
}
}
}
/// LLM fallback sub-section of the extraction configuration.
///
/// NOTE(review): the fallback logic that reads these fields is outside this
/// file; only the defaults are documented here.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmFallbackConfig {
// Defaults to false.
#[serde(default)]
pub enable: bool,
// Defaults to 0.3.
#[serde(default = "default_llm_quality_threshold")]
pub quality_threshold: f32,
// Defaults to 100_000.
#[serde(default = "default_llm_max_html_bytes")]
pub max_html_bytes: usize,
// Defaults to false.
#[serde(default)]
pub always_run: bool,
}
impl Default for LlmFallbackConfig {
    /// Fallback starts disabled and not forced; the threshold and size limit
    /// come from the same helpers serde uses.
    fn default() -> Self {
        let quality_threshold = default_llm_quality_threshold();
        let max_html_bytes = default_llm_max_html_bytes();
        Self {
            enable: false,
            quality_threshold,
            max_html_bytes,
            always_run: false,
        }
    }
}
/// Serde default for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    const QUALITY_THRESHOLD: f32 = 0.3;
    QUALITY_THRESHOLD
}
/// Serde default for the `max_html_bytes` field of both `LlmFallbackConfig`
/// and `LlmConfig`.
fn default_llm_max_html_bytes() -> usize {
    const MAX_HTML_BYTES: usize = 100_000;
    MAX_HTML_BYTES
}
/// LLM sub-section of the extraction configuration.
///
/// NOTE(review): the LLM client that reads these fields is outside this file;
/// only the defaults are documented here.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmConfig {
// Defaults to "anthropic".
#[serde(default = "default_llm_provider")]
pub provider: String,
/// Required when this sub-section is present (no serde default).
pub api_key: String,
// Defaults to "claude-sonnet-4-20250514".
#[serde(default = "default_llm_model")]
pub model: String,
// Optional; `None` when not configured.
#[serde(default)]
pub base_url: Option<String>,
// Defaults to 4096.
#[serde(default = "default_llm_max_tokens")]
pub max_tokens: u32,
// Optional; `None` when not configured.
#[serde(default)]
pub azure_api_version: Option<String>,
// Defaults to 4.
#[serde(default = "default_llm_max_concurrency")]
pub max_concurrency: usize,
// Defaults to 100_000.
#[serde(default = "default_llm_max_html_bytes")]
pub max_html_bytes: usize,
// Optional; `None` when not configured.
#[serde(default)]
pub require_byok_header: Option<String>,
// Optional; `None` when not configured.
#[serde(default)]
pub temperature: Option<f32>,
}
impl Default for LlmConfig {
    /// Helper-provided provider, model and limits; an empty API key; every
    /// optional field unset.
    fn default() -> Self {
        let provider = default_llm_provider();
        let model = default_llm_model();
        let max_tokens = default_llm_max_tokens();
        let max_concurrency = default_llm_max_concurrency();
        let max_html_bytes = default_llm_max_html_bytes();
        Self {
            provider,
            api_key: String::default(),
            model,
            base_url: None,
            max_tokens,
            azure_api_version: None,
            max_concurrency,
            max_html_bytes,
            require_byok_header: None,
            temperature: None,
        }
    }
}
/// Serde default for `LlmConfig::max_concurrency`.
fn default_llm_max_concurrency() -> usize {
    const MAX_IN_FLIGHT: usize = 4;
    MAX_IN_FLIGHT
}
/// Serde default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Serde default for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Serde default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    const MAX_TOKENS: u32 = 4096;
    MAX_TOKENS
}
/// Serde default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Serde default for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    const ON_BY_DEFAULT: bool = true;
    ON_BY_DEFAULT
}
/// Authentication section of the configuration.
#[derive(Debug, Clone, Default, Deserialize)]
pub struct AuthConfig {
/// Configured API keys; defaults to an empty list.
/// NOTE(review): how an empty list is treated (auth disabled vs. deny all)
/// is decided by the consumer, which is not visible in this file.
#[serde(default)]
pub api_keys: Vec<String>,
}
/// Location of the per-user `config.toml`.
///
/// Resolution order:
/// 1. `$CRW_USER_CONFIG_DIR/config.toml` when `CRW_USER_CONFIG_DIR` is set;
/// 2. `$HOME/.config/crw/config.toml` when `HOME` is set;
/// 3. `None` otherwise.
///
/// Both variables are read with `var_os`, so a directory whose name is not
/// valid UTF-8 is still honored instead of being silently skipped. The
/// returned path is not checked for existence.
pub fn user_config_path() -> Option<std::path::PathBuf> {
    let config_dir = match std::env::var_os("CRW_USER_CONFIG_DIR") {
        Some(overridden) => std::path::PathBuf::from(overridden),
        None => std::path::PathBuf::from(std::env::var_os("HOME")?)
            .join(".config")
            .join("crw"),
    };
    Some(config_dir.join("config.toml"))
}
impl AppConfig {
    /// Loads the configuration by layering sources; later sources override
    /// earlier ones.
    ///
    /// Order: optional `config.default`, the user config file if it exists,
    /// then either the file named by `CRW_CONFIG` (required) or an optional
    /// `config.local`, and finally `CRW_`-prefixed environment variables
    /// with `__` as the nesting separator.
    ///
    /// # Errors
    /// Returns the `config::ConfigError` from building or deserializing.
    pub fn load() -> Result<Self, config::ConfigError> {
        let mut builder = config::Config::builder()
            .add_source(config::File::with_name("config.default").required(false));

        // Only add the per-user file when it is actually present on disk.
        if let Some(user_cfg) = user_config_path().filter(|path| path.exists()) {
            builder = builder.add_source(config::File::from(user_cfg).required(false));
        }

        // An explicit CRW_CONFIG replaces (and is stricter than) config.local.
        builder = match std::env::var("CRW_CONFIG") {
            Ok(extra) => builder.add_source(config::File::with_name(&extra).required(true)),
            Err(_) => builder.add_source(config::File::with_name("config.local").required(false)),
        };

        let env_source = config::Environment::with_prefix("CRW")
            .prefix_separator("_")
            .separator("__")
            .try_parsing(true);

        builder.add_source(env_source).build()?.try_deserialize()
    }

    /// Deadline in milliseconds to apply to a request.
    ///
    /// An explicit `requested_deadline_ms` is returned unchanged. Otherwise
    /// the configured default is used, raised — when auto-extension is on
    /// and at least one CDP tier is selected — to the renderer's full-ladder
    /// minimum plus a per-tier allowance for `wait_for_ms`.
    pub fn effective_deadline_ms(
        &self,
        requested_deadline_ms: Option<u64>,
        wait_for_ms: Option<u64>,
    ) -> u64 {
        // Only the part of the (capped) wait above this threshold extends the deadline.
        const SPA_DEFAULT_MS: u64 = 8_000;

        if let Some(explicit) = requested_deadline_ms {
            return explicit;
        }

        let fallback = self.request.deadline_ms_default;
        if !self.request.auto_extend_deadline_for_ladder {
            return fallback;
        }

        let tiers = self.renderer.cdp_tier_count();
        if tiers == 0 {
            return fallback;
        }

        let wait_allowance = wait_for_ms.map_or(0, |requested_wait| {
            requested_wait
                .min(MAX_WAIT_FOR_MS)
                .saturating_sub(SPA_DEFAULT_MS)
                .saturating_mul(tiers as u64)
        });
        let ladder_floor = self
            .renderer
            .min_deadline_for_full_ladder_ms()
            .saturating_add(wait_allowance);

        std::cmp::max(fallback, ladder_floor)
    }

    /// Outer request timeout in seconds for the server.
    ///
    /// With auto-extension off this is `server.request_timeout_secs`.
    /// Otherwise it is the larger of that baseline and the slowest handler
    /// estimate (single scrape, search plus batched enrichment, or the fixed
    /// 300 000 ms map ceiling), rounded up to seconds, plus a 5-second buffer.
    pub fn effective_request_timeout_secs(&self) -> u64 {
        const OUTER_BUFFER_SECS: u64 = 5;
        const MAP_REQUEST_TIMEOUT_CEILING_MS: u64 = 300_000;

        let baseline = self.server.request_timeout_secs;
        if !self.request.auto_extend_deadline_for_ladder {
            return baseline;
        }

        // Worst-case single scrape: default deadline with the longest allowed wait.
        let scrape_ms = self.effective_deadline_ms(None, Some(MAX_WAIT_FOR_MS));

        // Search enrichment scrapes results in batches of `max_concurrency`.
        let workers = self.crawler.max_concurrency.max(1) as u64;
        let batches = (self.search.max_limit as u64).div_ceil(workers);
        let search_ms = self
            .search
            .timeout_ms
            .saturating_add(batches.saturating_mul(scrape_ms));

        let slowest_handler_ms = scrape_ms
            .max(search_ms)
            .max(MAP_REQUEST_TIMEOUT_CEILING_MS);
        let needed_secs = slowest_handler_ms
            .div_ceil(1_000)
            .saturating_add(OUTER_BUFFER_SECS);

        std::cmp::max(baseline, needed_secs)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes the tests in this module that mutate process environment
    /// variables, since the environment is shared by all test threads.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Acquires `ENV_LOCK`, recovering the guard if an earlier test panicked
    /// while holding it. Without this, one failing env test poisons the lock
    /// and every later env test fails with an unrelated `PoisonError`.
    fn env_guard() -> std::sync::MutexGuard<'static, ()> {
        ENV_LOCK
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Removes every environment variable the tests in this module set, so a
    /// test that panicked mid-way cannot leak state into the next one.
    fn clear_renderer_env() {
        for k in [
            "CRW_RENDERER__MODE",
            "CRW_RENDERER__FORCE_JS",
            "CRW_RENDERER__RENDER_JS_DEFAULT",
            "CRW_RENDERER__LIGHTPANDA__WS_URL",
            "CRW_SERVER__PORT",
            "CRW_USER_CONFIG_DIR",
            "CRW_SEARCH__SEARXNG_URL",
        ] {
            // SAFETY: every caller holds `ENV_LOCK`, which serializes env
            // mutation among the tests in this module.
            unsafe { std::env::remove_var(k) };
        }
    }

    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let _g = env_guard();
        clear_renderer_env();
        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let _g = env_guard();
        clear_renderer_env();
        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let _g = env_guard();
        clear_renderer_env();
        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
        assert!(r.auto_extend_deadline_for_ladder);
    }

    #[test]
    fn default_app_config_enables_auto_extend() {
        let cfg = AppConfig::default();
        assert!(cfg.request.auto_extend_deadline_for_ladder);
        assert_eq!(cfg.request.deadline_ms_default, 8000);
    }

    /// Renderer fixture: Chrome mode with only the Chrome endpoint configured
    /// and both the page and Chrome timeouts set to `chrome_ms`.
    fn renderer_with_chrome_only(chrome_ms: u64) -> RendererConfig {
        RendererConfig {
            mode: RendererMode::Chrome,
            page_timeout_ms: chrome_ms,
            chrome_timeout_ms: Some(chrome_ms),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        }
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_chrome_only() {
        let r = renderer_with_chrome_only(30_000);
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            30_000 + 30_000 + 28_000
        );
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_auto_three_tiers() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            lightpanda_timeout_ms: Some(2_500),
            chrome_timeout_ms: Some(30_000),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 2_500 + 30_000 + 2 * 28_000
        );
        assert_eq!(r.cdp_tier_count(), 2);
    }

    #[test]
    fn effective_deadline_explicit_bypasses_auto_extend() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(Some(5_000), None), 5_000);
        assert_eq!(cfg.effective_deadline_ms(Some(500_000), None), 500_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_auto_extend_raises_to_ladder_min() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let expected = cfg.renderer.min_deadline_for_full_ladder_ms();
        assert!(expected > 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), expected);
    }

    #[test]
    fn effective_deadline_default_wins_when_higher_than_ladder() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 1_000_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 1_000_000);
    }

    #[test]
    fn effective_deadline_auto_extend_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn effective_deadline_extends_for_long_wait_for() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        let base = cfg.renderer.min_deadline_for_full_ladder_ms();
        let tier_count = cfg.renderer.cdp_tier_count() as u64;
        let with_wait = cfg.effective_deadline_ms(None, Some(20_000));
        assert_eq!(with_wait, base + 12_000 * tier_count);
        assert_eq!(cfg.effective_deadline_ms(None, Some(2_000)), base);
    }

    #[test]
    fn effective_request_timeout_covers_map_ceiling() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.crawler.max_concurrency = 10;
        cfg.search.max_limit = 20;
        cfg.server.request_timeout_secs = 60;
        assert!(cfg.effective_request_timeout_secs() >= 305);
    }

    #[test]
    fn effective_request_timeout_disabled_returns_baseline() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = false;
        cfg.server.request_timeout_secs = 60;
        assert_eq!(cfg.effective_request_timeout_secs(), 60);
    }

    #[test]
    fn effective_request_timeout_respects_operator_override() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.server.request_timeout_secs = 600;
        cfg.renderer = renderer_with_chrome_only(30_000);
        assert_eq!(cfg.effective_request_timeout_secs(), 600);
    }

    #[test]
    fn effective_request_timeout_search_sequential_batching() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = renderer_with_chrome_only(30_000);
        cfg.search.timeout_ms = 15_000;
        cfg.search.max_limit = 20;
        cfg.crawler.max_concurrency = 1;
        cfg.server.request_timeout_secs = 60;
        let secs = cfg.effective_request_timeout_secs();
        let scrape_ms = cfg.effective_deadline_ms(None, Some(60_000));
        let expected_search_ms = 15_000 + 20 * scrape_ms;
        let expected_max_ms = scrape_ms.max(expected_search_ms).max(300_000);
        let expected_secs = expected_max_ms.div_ceil(1_000) + 5;
        assert_eq!(secs, 60u64.max(expected_secs));
    }

    #[test]
    #[cfg(not(feature = "cdp"))]
    fn cdp_tier_count_zero_without_cdp_feature() {
        let r = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 15_000,
            chrome_timeout_ms: Some(30_000),
            chrome: Some(CdpEndpoint {
                ws_url: "ws://chrome:9222".into(),
            }),
            lightpanda: Some(CdpEndpoint {
                ws_url: "ws://lp:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 0);
        assert_eq!(r.min_deadline_for_full_ladder_ms(), 15_000);
    }

    #[test]
    fn effective_deadline_skipped_for_http_only_mode() {
        let mut cfg = AppConfig::default();
        cfg.request.auto_extend_deadline_for_ladder = true;
        cfg.request.deadline_ms_default = 8_000;
        cfg.renderer = RendererConfig {
            mode: RendererMode::Auto,
            page_timeout_ms: 30_000,
            lightpanda: None,
            playwright: None,
            chrome: None,
            ..Default::default()
        };
        assert_eq!(cfg.renderer.cdp_tier_count(), 0);
        assert_eq!(cfg.effective_deadline_ms(None, None), 8_000);
        assert_eq!(cfg.effective_deadline_ms(None, Some(30_000)), 8_000);
    }

    #[test]
    #[cfg(feature = "cdp")]
    fn min_deadline_full_ladder_playwright_only() {
        let r = RendererConfig {
            mode: RendererMode::Playwright,
            page_timeout_ms: 15_000,
            http_timeout_ms: Some(15_000),
            chrome_timeout_ms: Some(30_000),
            playwright: Some(CdpEndpoint {
                ws_url: "ws://playwright:9222".into(),
            }),
            ..Default::default()
        };
        assert_eq!(r.cdp_tier_count(), 1);
        assert_eq!(
            r.min_deadline_for_full_ladder_ms(),
            15_000 + 30_000 + 28_000
        );
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let _g = env_guard();
        clear_renderer_env();
        unsafe {
            std::env::set_var("CRW_SERVER__PORT", "4444");
            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        }
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }

    #[test]
    fn user_config_path_honors_override_env() {
        let _g = env_guard();
        let tmp = std::env::temp_dir().join(format!("crw-cfg-test-{}", std::process::id()));
        unsafe {
            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
        }
        let p = user_config_path().unwrap();
        unsafe {
            std::env::remove_var("CRW_USER_CONFIG_DIR");
        }
        assert_eq!(p, tmp.join("config.toml"));
    }

    #[test]
    fn user_config_file_is_picked_up_by_load() {
        let _g = env_guard();
        clear_renderer_env();
        let tmp = std::env::temp_dir().join(format!("crw-load-test-{}", std::process::id()));
        std::fs::create_dir_all(&tmp).unwrap();
        let cfg_path = tmp.join("config.toml");
        std::fs::write(
            &cfg_path,
            r#"
[client]
api_url = "https://api.example.com"
api_key = "test-key-123"
[search]
searxng_url = "http://localhost:9999"
[extraction.llm]
provider = "deepseek"
api_key = "sk-test"
model = "deepseek-chat"
"#,
        )
        .unwrap();
        unsafe {
            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
        }
        let cfg = AppConfig::load().unwrap();
        unsafe {
            std::env::remove_var("CRW_USER_CONFIG_DIR");
        }
        std::fs::remove_dir_all(&tmp).ok();
        assert_eq!(
            cfg.client.api_url.as_deref(),
            Some("https://api.example.com")
        );
        assert_eq!(cfg.client.api_key.as_deref(), Some("test-key-123"));
        assert_eq!(
            cfg.search.searxng_url.as_deref(),
            Some("http://localhost:9999")
        );
        let llm = cfg.extraction.llm.expect("llm config present");
        assert_eq!(llm.provider, "deepseek");
        assert_eq!(llm.api_key, "sk-test");
    }

    #[test]
    fn env_var_beats_user_config() {
        let _g = env_guard();
        clear_renderer_env();
        let tmp = std::env::temp_dir().join(format!("crw-prec-test-{}", std::process::id()));
        std::fs::create_dir_all(&tmp).unwrap();
        std::fs::write(
            tmp.join("config.toml"),
            r#"
[search]
searxng_url = "http://from-file:8080"
"#,
        )
        .unwrap();
        unsafe {
            std::env::set_var("CRW_USER_CONFIG_DIR", &tmp);
            std::env::set_var("CRW_SEARCH__SEARXNG_URL", "http://from-env:8080");
        }
        let cfg = AppConfig::load().unwrap();
        unsafe {
            std::env::remove_var("CRW_USER_CONFIG_DIR");
            std::env::remove_var("CRW_SEARCH__SEARXNG_URL");
        }
        std::fs::remove_dir_all(&tmp).ok();
        assert_eq!(
            cfg.search.searxng_url.as_deref(),
            Some("http://from-env:8080"),
            "env var must win over user config file"
        );
    }

    #[test]
    fn effective_proxy_credentials_appends_country_suffix() {
        let cfg = RendererConfig {
            proxy_base_user: Some("abc".into()),
            proxy_base_pass: Some("pw".into()),
            proxy_default_country: Some("de".into()),
            ..Default::default()
        };
        let (u, p) = cfg.effective_proxy_credentials(Some("us")).unwrap();
        assert_eq!(u, "abc__cr.us");
        assert_eq!(p, "pw");
        let (u, _) = cfg.effective_proxy_credentials(Some("GB")).unwrap();
        assert_eq!(u, "abc__cr.gb", "uppercase input is normalized");
        let (u, _) = cfg.effective_proxy_credentials(None).unwrap();
        assert_eq!(u, "abc__cr.de");
    }

    #[test]
    fn effective_proxy_credentials_invalid_country_uses_global_pool() {
        let cfg = RendererConfig {
            proxy_base_user: Some("abc".into()),
            proxy_base_pass: Some("pw".into()),
            ..Default::default()
        };
        let (u, _) = cfg.effective_proxy_credentials(Some("usa")).unwrap();
        assert_eq!(u, "abc");
        let (u, _) = cfg.effective_proxy_credentials(Some("u1")).unwrap();
        assert_eq!(u, "abc");
        let (u, _) = cfg.effective_proxy_credentials(Some(" ")).unwrap();
        assert_eq!(u, "abc");
    }

    #[test]
    fn effective_proxy_credentials_no_base_returns_none() {
        let cfg = RendererConfig::default();
        assert!(cfg.effective_proxy_credentials(Some("us")).is_none());
        let only_user = RendererConfig {
            proxy_base_user: Some("abc".into()),
            ..Default::default()
        };
        assert!(only_user.effective_proxy_credentials(Some("us")).is_none());
    }
}