use serde::Deserialize;
/// Root configuration object for the application.
///
/// Every section carries `#[serde(default)]`, so a section that is missing
/// from the input is filled in from that section type's `Default` impl.
#[derive(Debug, Clone, Deserialize)]
pub struct AppConfig {
/// `server` section; see `ServerConfig`.
#[serde(default)]
pub server: ServerConfig,
/// `renderer` section; see `RendererConfig`.
#[serde(default)]
pub renderer: RendererConfig,
/// `crawler` section; see `CrawlerConfig`.
#[serde(default)]
pub crawler: CrawlerConfig,
/// `extraction` section; see `ExtractionConfig`.
#[serde(default)]
pub extraction: ExtractionConfig,
/// `auth` section; see `AuthConfig`.
#[serde(default)]
pub auth: AuthConfig,
/// `request` section; see `RequestConfig`.
#[serde(default)]
pub request: RequestConfig,
}
/// Settings in the `request` section of `AppConfig`.
#[derive(Debug, Clone, Deserialize)]
pub struct RequestConfig {
/// Default deadline; `default_deadline_ms` supplies the value when the key
/// is absent.
/// NOTE(review): the name suggests milliseconds — confirm against the consumer.
#[serde(default = "default_deadline_ms")]
pub deadline_ms_default: u64,
}
impl Default for RequestConfig {
fn default() -> Self {
Self {
deadline_ms_default: default_deadline_ms(),
}
}
}
/// Fallback for `RequestConfig::deadline_ms_default`.
fn default_deadline_ms() -> u64 {
    const FALLBACK: u64 = 8_000;
    FALLBACK
}
/// Settings in the `server` section of `AppConfig`.
///
/// Each field falls back to its own `default_*` function when absent.
#[derive(Debug, Clone, Deserialize)]
pub struct ServerConfig {
/// Host string; `default_host` supplies `"0.0.0.0"`.
/// NOTE(review): presumably the listen address — confirm at the bind site.
#[serde(default = "default_host")]
pub host: String,
/// Port number; `default_port` supplies `3000`.
#[serde(default = "default_port")]
pub port: u16,
/// Request timeout; `default_request_timeout` supplies `60`.
/// The field name indicates the unit is seconds.
#[serde(default = "default_request_timeout")]
pub request_timeout_secs: u64,
/// Rate limit; `default_rate_limit_rps` supplies `10`.
/// NOTE(review): "rps" presumably means requests per second — confirm.
#[serde(default = "default_rate_limit_rps")]
pub rate_limit_rps: u64,
}
impl Default for ServerConfig {
fn default() -> Self {
Self {
host: default_host(),
port: default_port(),
request_timeout_secs: default_request_timeout(),
rate_limit_rps: default_rate_limit_rps(),
}
}
}
/// Fallback for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    const FALLBACK: u64 = 10;
    FALLBACK
}
/// Fallback for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Fallback for `ServerConfig::port`.
fn default_port() -> u16 {
    const FALLBACK: u16 = 3_000;
    FALLBACK
}
/// Fallback for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    const FALLBACK_SECS: u64 = 60;
    FALLBACK_SECS
}
/// Renderer selection, parsed from the lowercase variant name
/// (`"auto"`, `"none"`, `"lightpanda"`, `"chrome"`, `"playwright"`).
///
/// Any other string fails to deserialize. What each variant actually selects
/// is decided by the code that consumes this enum, not by this definition.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RendererMode {
/// The value used when no mode is configured.
#[default]
Auto,
/// Parsed from `"none"`.
None,
/// Parsed from `"lightpanda"`.
Lightpanda,
/// Parsed from `"chrome"`.
Chrome,
/// Parsed from `"playwright"`.
Playwright,
}
/// Settings in the `renderer` section of `AppConfig`.
///
/// Every field is optional in the input. Fields without an explicit
/// `default = "..."` function fall back to their type's `Default`
/// (`false`, `None`, empty `Vec`, or the nested section's `Default`).
#[derive(Debug, Clone, Deserialize)]
pub struct RendererConfig {
/// Renderer mode; `RendererMode::Auto` when absent.
#[serde(default)]
pub mode: RendererMode,
/// Base page timeout; `default_page_timeout` supplies `30000`. Also the
/// fallback returned by `http_timeout`, `lightpanda_timeout` and
/// `chrome_timeout` when the matching override below is `None`.
#[serde(default = "default_page_timeout")]
pub page_timeout_ms: u64,
/// Optional override read by `http_timeout`.
#[serde(default)]
pub http_timeout_ms: Option<u64>,
/// Optional override read by `lightpanda_timeout`.
#[serde(default)]
pub lightpanda_timeout_ms: Option<u64>,
/// Optional override read by `chrome_timeout`.
#[serde(default)]
pub chrome_timeout_ms: Option<u64>,
/// Pool size; `default_pool_size` supplies `4`.
/// NOTE(review): what is pooled is not visible here — confirm at the use site.
#[serde(default = "default_pool_size")]
pub pool_size: usize,
/// Tri-state JS-rendering default: `None` when not configured. The key
/// `force_js` is accepted as an alias for this field.
#[serde(default, alias = "force_js")]
pub render_js_default: Option<bool>,
/// Optional endpoint for the `lightpanda` renderer.
#[serde(default)]
pub lightpanda: Option<CdpEndpoint>,
/// Optional endpoint for the `playwright` renderer.
#[serde(default)]
pub playwright: Option<CdpEndpoint>,
/// Optional endpoint for the `chrome` renderer.
#[serde(default)]
pub chrome: Option<CdpEndpoint>,
/// Off unless set.
/// NOTE(review): presumably enables resource interception in Chrome — confirm.
#[serde(default)]
pub chrome_intercept_resources: bool,
/// Off unless set.
/// NOTE(review): presumably extends interception to stylesheets — confirm.
#[serde(default)]
pub chrome_intercept_stylesheets: bool,
/// Empty unless set.
/// NOTE(review): presumably hosts for which interception is disabled — confirm.
#[serde(default)]
pub chrome_host_intercept_disable: Vec<String>,
/// Chrome navigation budget; `default_chrome_nav_budget_ms` supplies `12_000`.
#[serde(default = "default_chrome_nav_budget_ms")]
pub chrome_nav_budget_ms: u64,
/// Off unless set.
#[serde(default)]
pub chrome_context_pool_enabled: bool,
/// Off unless set.
#[serde(default)]
pub use_predictor: bool,
/// Nested `escalation` section; see `EscalationConfig`.
#[serde(default)]
pub escalation: EscalationConfig,
/// Nested `antibot` section; see `AntibotConfig`.
#[serde(default)]
pub antibot: AntibotConfig,
}
/// Settings in the `renderer.escalation` section.
#[derive(Debug, Clone, Deserialize)]
pub struct EscalationConfig {
/// Off unless set.
#[serde(default)]
pub enabled: bool,
/// `default_waterfall_timeout_ms` supplies `8_000` when absent.
#[serde(default = "default_waterfall_timeout_ms")]
pub waterfall_timeout_ms: u64,
/// `default_escalation_global_timeout_ms` supplies `60_000` when absent.
#[serde(default = "default_escalation_global_timeout_ms")]
pub global_timeout_ms: u64,
/// Off unless set.
#[serde(default)]
pub residential_proxy: bool,
/// `default_proxy_country` supplies `"us"` when absent.
/// NOTE(review): presumably a country code for proxy selection — confirm.
#[serde(default = "default_proxy_country")]
pub proxy_country: String,
}
impl Default for EscalationConfig {
fn default() -> Self {
Self {
enabled: false,
waterfall_timeout_ms: default_waterfall_timeout_ms(),
global_timeout_ms: default_escalation_global_timeout_ms(),
residential_proxy: false,
proxy_country: default_proxy_country(),
}
}
}
/// Fallback for `EscalationConfig::waterfall_timeout_ms`.
fn default_waterfall_timeout_ms() -> u64 {
    const FALLBACK_MS: u64 = 8000;
    FALLBACK_MS
}
/// Fallback for `EscalationConfig::global_timeout_ms`.
fn default_escalation_global_timeout_ms() -> u64 {
    const FALLBACK_MS: u64 = 60000;
    FALLBACK_MS
}
/// Fallback for `EscalationConfig::proxy_country`.
fn default_proxy_country() -> String {
    String::from("us")
}
/// Settings in the `renderer.antibot` section.
#[derive(Debug, Clone, Deserialize)]
pub struct AntibotConfig {
/// On unless explicitly disabled (`default_true`).
#[serde(default = "default_true")]
pub enabled: bool,
/// Off unless set.
/// NOTE(review): presumably triggers escalation when an anti-bot signal is
/// seen — confirm at the use site.
#[serde(default)]
pub escalate_on_signal: bool,
}
impl Default for AntibotConfig {
fn default() -> Self {
Self {
enabled: true,
escalate_on_signal: false,
}
}
}
/// Fallback for `RendererConfig::chrome_nav_budget_ms`.
fn default_chrome_nav_budget_ms() -> u64 {
    const FALLBACK_MS: u64 = 12000;
    FALLBACK_MS
}
impl Default for RendererConfig {
fn default() -> Self {
Self {
mode: RendererMode::default(),
page_timeout_ms: default_page_timeout(),
http_timeout_ms: None,
lightpanda_timeout_ms: None,
chrome_timeout_ms: None,
pool_size: default_pool_size(),
render_js_default: None,
lightpanda: None,
playwright: None,
chrome: None,
chrome_intercept_resources: false,
chrome_intercept_stylesheets: false,
chrome_host_intercept_disable: Vec::new(),
chrome_nav_budget_ms: default_chrome_nav_budget_ms(),
chrome_context_pool_enabled: false,
use_predictor: false,
escalation: EscalationConfig::default(),
antibot: AntibotConfig::default(),
}
}
}
/// Fallback for `RendererConfig::page_timeout_ms`.
fn default_page_timeout() -> u64 {
    const FALLBACK_MS: u64 = 30_000;
    FALLBACK_MS
}
/// Effective-timeout accessors: each returns its specific override when one
/// is configured and `page_timeout_ms` otherwise.
impl RendererConfig {
    /// Returns `http_timeout_ms` if set, else `page_timeout_ms`.
    pub fn http_timeout(&self) -> u64 {
        match self.http_timeout_ms {
            Some(override_ms) => override_ms,
            None => self.page_timeout_ms,
        }
    }

    /// Returns `lightpanda_timeout_ms` if set, else `page_timeout_ms`.
    pub fn lightpanda_timeout(&self) -> u64 {
        match self.lightpanda_timeout_ms {
            Some(override_ms) => override_ms,
            None => self.page_timeout_ms,
        }
    }

    /// Returns `chrome_timeout_ms` if set, else `page_timeout_ms`.
    pub fn chrome_timeout(&self) -> u64 {
        match self.chrome_timeout_ms {
            Some(override_ms) => override_ms,
            None => self.page_timeout_ms,
        }
    }
}
/// Fallback for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    const FALLBACK: usize = 4;
    FALLBACK
}
/// Endpoint description used by the optional `lightpanda`, `playwright` and
/// `chrome` entries of `RendererConfig`.
#[derive(Debug, Clone, Deserialize)]
pub struct CdpEndpoint {
/// Required (no serde default): deserialization fails if the endpoint table
/// is present without it.
/// NOTE(review): presumably a CDP WebSocket URL such as `ws://host:port/` — confirm.
pub ws_url: String,
}
/// Settings in the `crawler.stealth` section.
#[derive(Debug, Clone, Deserialize)]
pub struct StealthConfig {
/// Off unless set.
#[serde(default)]
pub enabled: bool,
/// Empty unless set.
/// NOTE(review): presumably `BUILTIN_UA_POOL` is used when this is empty —
/// not visible in this file, confirm at the use site.
#[serde(default)]
pub user_agents: Vec<String>,
/// `default_jitter` supplies `0.2` when absent.
#[serde(default = "default_jitter")]
pub jitter_factor: f64,
/// On unless explicitly disabled (`default_true`).
#[serde(default = "default_true")]
pub inject_headers: bool,
}
impl Default for StealthConfig {
fn default() -> Self {
Self {
enabled: false,
user_agents: vec![],
jitter_factor: default_jitter(),
inject_headers: true,
}
}
}
/// Fallback for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    const FALLBACK: f64 = 0.2;
    FALLBACK
}
/// Built-in list of five desktop browser User-Agent strings (Chrome on
/// Windows/macOS/Linux, Firefox on Windows, Safari on macOS).
///
/// NOTE(review): how and when callers pick from this pool is not visible in
/// this file — confirm at the use site.
pub const BUILTIN_UA_POOL: &[&str] = &[
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
/// Settings in the `crawler` section of `AppConfig`.
#[derive(Debug, Clone, Deserialize)]
pub struct CrawlerConfig {
/// `default_concurrency` supplies `10` when absent.
#[serde(default = "default_concurrency")]
pub max_concurrency: usize,
/// `default_rps` supplies `10.0` when absent.
#[serde(default = "default_rps")]
pub requests_per_second: f64,
/// On unless explicitly disabled (`default_true`).
#[serde(default = "default_true")]
pub respect_robots_txt: bool,
/// `default_ua` supplies a Chrome-on-macOS User-Agent string when absent.
#[serde(default = "default_ua")]
pub user_agent: String,
/// `default_depth` supplies `2` when absent.
#[serde(default = "default_depth")]
pub default_max_depth: u32,
/// `default_max_pages` supplies `100` when absent.
#[serde(default = "default_max_pages")]
pub default_max_pages: u32,
/// Optional proxy setting; `None` when absent.
/// NOTE(review): expected format (URL vs host:port) is not visible here — confirm.
#[serde(default)]
pub proxy: Option<String>,
/// `default_job_ttl` supplies `3600` when absent; the field name indicates seconds.
#[serde(default = "default_job_ttl")]
pub job_ttl_secs: u64,
/// Nested `stealth` section; see `StealthConfig`.
#[serde(default)]
pub stealth: StealthConfig,
/// `0` when absent.
/// NOTE(review): presumably `0` means no minimum interval — confirm.
#[serde(default)]
pub per_host_min_interval_ms: u64,
/// `default_per_host_max_concurrent` supplies `1` when absent.
#[serde(default = "default_per_host_max_concurrent")]
pub per_host_max_concurrent: u32,
}
/// Fallback for `CrawlerConfig::per_host_max_concurrent`.
fn default_per_host_max_concurrent() -> u32 {
    const FALLBACK: u32 = 1;
    FALLBACK
}
impl Default for CrawlerConfig {
fn default() -> Self {
Self {
max_concurrency: default_concurrency(),
requests_per_second: default_rps(),
respect_robots_txt: true,
user_agent: default_ua(),
default_max_depth: default_depth(),
default_max_pages: default_max_pages(),
proxy: None,
job_ttl_secs: default_job_ttl(),
stealth: StealthConfig::default(),
per_host_min_interval_ms: 0,
per_host_max_concurrent: default_per_host_max_concurrent(),
}
}
}
/// Fallback for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    const FALLBACK: usize = 10;
    FALLBACK
}
/// Fallback for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    const FALLBACK: f64 = 10.0;
    FALLBACK
}
/// Serde fallback for boolean fields that should be on unless disabled.
fn default_true() -> bool {
    const ON: bool = true;
    ON
}
/// Fallback for `CrawlerConfig::user_agent`: a Chrome 131 on macOS
/// User-Agent string.
fn default_ua() -> String {
    concat!(
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ",
        "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    )
    .to_owned()
}
/// Fallback for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    const FALLBACK: u32 = 2;
    FALLBACK
}
/// Fallback for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    const FALLBACK: u32 = 100;
    FALLBACK
}
/// Fallback for `CrawlerConfig::job_ttl_secs`.
fn default_job_ttl() -> u64 {
    const FALLBACK_SECS: u64 = 3_600;
    FALLBACK_SECS
}
/// Settings in the `extraction` section of `AppConfig`.
#[derive(Debug, Clone, Deserialize)]
pub struct ExtractionConfig {
/// `default_format` supplies `"markdown"` when absent.
#[serde(default = "default_format")]
pub default_format: String,
/// On unless explicitly disabled (`default_true_ext`).
#[serde(default = "default_true_ext")]
pub only_main_content: bool,
/// Optional LLM settings; `None` when the table is absent. See `LlmConfig`.
#[serde(default)]
pub llm: Option<LlmConfig>,
/// String-to-string map; empty when absent.
/// NOTE(review): presumably domain -> CSS selector — confirm at the use site.
#[serde(default)]
pub domain_selectors: std::collections::HashMap<String, String>,
/// Nested `llm_fallback` section; see `LlmFallbackConfig`.
#[serde(default)]
pub llm_fallback: LlmFallbackConfig,
/// `default_http_retry_threshold` supplies `100` when absent.
#[serde(default = "default_http_retry_threshold")]
pub http_retry_threshold_bytes: usize,
/// `default_lightpanda_retry_threshold` supplies `2000` when absent.
#[serde(default = "default_lightpanda_retry_threshold")]
pub lightpanda_retry_threshold_bytes: usize,
}
/// Fallback for `ExtractionConfig::http_retry_threshold_bytes`.
fn default_http_retry_threshold() -> usize {
    const FALLBACK_BYTES: usize = 100;
    FALLBACK_BYTES
}
/// Fallback for `ExtractionConfig::lightpanda_retry_threshold_bytes`.
fn default_lightpanda_retry_threshold() -> usize {
    const FALLBACK_BYTES: usize = 2_000;
    FALLBACK_BYTES
}
impl Default for ExtractionConfig {
fn default() -> Self {
Self {
default_format: default_format(),
only_main_content: true,
llm: None,
domain_selectors: std::collections::HashMap::new(),
llm_fallback: LlmFallbackConfig::default(),
http_retry_threshold_bytes: default_http_retry_threshold(),
lightpanda_retry_threshold_bytes: default_lightpanda_retry_threshold(),
}
}
}
/// Settings in the `extraction.llm_fallback` section.
#[derive(Debug, Clone, Deserialize)]
pub struct LlmFallbackConfig {
/// Off unless set.
#[serde(default)]
pub enable: bool,
/// `default_llm_quality_threshold` supplies `0.3` when absent.
/// NOTE(review): scale and comparison direction are not visible here — confirm.
#[serde(default = "default_llm_quality_threshold")]
pub quality_threshold: f32,
/// `default_llm_max_html_bytes` supplies `100_000` when absent.
#[serde(default = "default_llm_max_html_bytes")]
pub max_html_bytes: usize,
/// Off unless set.
#[serde(default)]
pub always_run: bool,
}
impl Default for LlmFallbackConfig {
fn default() -> Self {
Self {
enable: false,
quality_threshold: default_llm_quality_threshold(),
max_html_bytes: default_llm_max_html_bytes(),
always_run: false,
}
}
}
/// Fallback for `LlmFallbackConfig::quality_threshold`.
fn default_llm_quality_threshold() -> f32 {
    const FALLBACK: f32 = 0.3;
    FALLBACK
}
/// Fallback for `LlmFallbackConfig::max_html_bytes`.
fn default_llm_max_html_bytes() -> usize {
    const FALLBACK_BYTES: usize = 100000;
    FALLBACK_BYTES
}
/// Settings in the optional `extraction.llm` section.
///
/// `Debug` is implemented by hand rather than derived so that formatting this
/// value (or any struct containing it, such as `AppConfig`) with `{:?}` never
/// writes the API key to logs or panic messages.
#[derive(Clone, Deserialize)]
pub struct LlmConfig {
    /// `default_llm_provider` supplies `"anthropic"` when absent.
    #[serde(default = "default_llm_provider")]
    pub provider: String,
    /// Required credential (no serde default). Redacted in `Debug` output.
    pub api_key: String,
    /// `default_llm_model` supplies the model name when absent.
    #[serde(default = "default_llm_model")]
    pub model: String,
    /// Optional base URL override; `None` when absent.
    #[serde(default)]
    pub base_url: Option<String>,
    /// `default_llm_max_tokens` supplies `4096` when absent.
    #[serde(default = "default_llm_max_tokens")]
    pub max_tokens: u32,
    /// Optional API version string; `None` when absent.
    /// NOTE(review): presumably only consulted for an Azure provider — confirm.
    #[serde(default)]
    pub azure_api_version: Option<String>,
}

impl std::fmt::Debug for LlmConfig {
    /// Same field layout as a derived `Debug`, with `api_key` replaced by a
    /// fixed placeholder.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("LlmConfig")
            .field("provider", &self.provider)
            .field("api_key", &format_args!("<redacted>"))
            .field("model", &self.model)
            .field("base_url", &self.base_url)
            .field("max_tokens", &self.max_tokens)
            .field("azure_api_version", &self.azure_api_version)
            .finish()
    }
}
/// Fallback for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Fallback for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Fallback for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    const FALLBACK: u32 = 4_096;
    FALLBACK
}
/// Fallback for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Fallback for `ExtractionConfig::only_main_content`; always `true`.
fn default_true_ext() -> bool {
    const ON: bool = true;
    ON
}
/// Settings in the `auth` section of `AppConfig`.
///
/// `Debug` is implemented by hand rather than derived so that formatting the
/// configuration with `{:?}` reports only how many keys are configured, never
/// the key values themselves.
#[derive(Clone, Default, Deserialize)]
pub struct AuthConfig {
    /// Configured API keys; empty when absent. Redacted in `Debug` output.
    /// NOTE(review): presumably an empty list disables authentication —
    /// confirm at the use site.
    #[serde(default)]
    pub api_keys: Vec<String>,
}

impl std::fmt::Debug for AuthConfig {
    /// Prints the number of keys in place of their contents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("AuthConfig")
            .field(
                "api_keys",
                &format_args!("<{} redacted>", self.api_keys.len()),
            )
            .finish()
    }
}
impl AppConfig {
    /// Loads the configuration by layering three sources, later ones
    /// overriding earlier ones:
    ///
    /// 1. the optional file `config.default`;
    /// 2. the file named by the `CRW_CONFIG` environment variable (required
    ///    when the variable is readable), otherwise the optional file
    ///    `config.local`;
    /// 3. environment variables prefixed `CRW_`, using `__` as the nesting
    ///    separator and with value parsing enabled.
    ///
    /// # Errors
    ///
    /// Returns the `config::ConfigError` produced while building the layered
    /// configuration or while deserializing it into `AppConfig`.
    pub fn load() -> Result<Self, config::ConfigError> {
        // An explicitly requested file must exist; the local file may not.
        let overlay = match std::env::var("CRW_CONFIG") {
            Ok(path) => config::File::with_name(&path).required(true),
            Err(_) => config::File::with_name("config.local").required(false),
        };

        let env_overrides = config::Environment::with_prefix("CRW")
            .prefix_separator("_")
            .separator("__")
            .try_parsing(true);

        config::Config::builder()
            .add_source(config::File::with_name("config.default").required(false))
            .add_source(overlay)
            .add_source(env_overrides)
            .build()?
            .try_deserialize()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes the tests in this module that mutate process-wide
    /// environment variables.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Acquires `ENV_LOCK`, recovering the guard when the mutex is poisoned.
    ///
    /// The lock protects no data (`()`), so poisoning carries no information.
    /// Recovering here keeps one failing env test from turning every later
    /// env test into an unrelated `PoisonError` panic.
    fn lock_env() -> std::sync::MutexGuard<'static, ()> {
        ENV_LOCK
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner)
    }

    /// Removes every environment variable the tests in this module set.
    /// Callers must hold the guard returned by `lock_env`.
    fn clear_renderer_env() {
        for k in [
            "CRW_RENDERER__MODE",
            "CRW_RENDERER__FORCE_JS",
            "CRW_RENDERER__RENDER_JS_DEFAULT",
            "CRW_RENDERER__LIGHTPANDA__WS_URL",
            "CRW_SERVER__PORT",
        ] {
            // SAFETY: callers hold `ENV_LOCK`, so no other test in this module
            // mutates the environment concurrently.
            unsafe { std::env::remove_var(k) };
        }
    }

    #[test]
    fn renderer_mode_parses_variants() {
        #[derive(Deserialize)]
        struct Wrap {
            mode: RendererMode,
        }
        let cases = [
            ("mode = \"auto\"", RendererMode::Auto),
            ("mode = \"none\"", RendererMode::None),
            ("mode = \"lightpanda\"", RendererMode::Lightpanda),
            ("mode = \"chrome\"", RendererMode::Chrome),
            ("mode = \"playwright\"", RendererMode::Playwright),
        ];
        for (toml_str, expected) in cases {
            let w: Wrap = toml::from_str(toml_str).unwrap();
            assert_eq!(w.mode, expected, "toml: {toml_str}");
        }
    }

    #[test]
    fn renderer_mode_bogus_errors() {
        #[derive(Deserialize)]
        struct Wrap {
            // Only deserialized, never read.
            #[allow(dead_code)]
            mode: RendererMode,
        }
        let err: Result<Wrap, _> = toml::from_str("mode = \"bogus\"");
        assert!(err.is_err(), "bogus mode should fail to parse");
    }

    #[test]
    fn renderer_config_default_mode_is_auto() {
        let cfg = RendererConfig::default();
        assert_eq!(cfg.mode, RendererMode::Auto);
        assert_eq!(cfg.render_js_default, None);
    }

    #[test]
    fn render_js_default_force_js_alias() {
        let cfg: RendererConfig = toml::from_str("force_js = true").unwrap();
        assert_eq!(cfg.render_js_default, Some(true));
    }

    #[test]
    fn render_js_default_direct_field() {
        let cfg: RendererConfig = toml::from_str("render_js_default = false").unwrap();
        assert_eq!(cfg.render_js_default, Some(false));
    }

    #[test]
    fn env_var_renderer_mode_chrome() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held for the duration of this test.
        unsafe { std::env::set_var("CRW_RENDERER__MODE", "chrome") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.mode, RendererMode::Chrome);
    }

    #[test]
    fn env_var_force_js_alias_works() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held for the duration of this test.
        unsafe { std::env::set_var("CRW_RENDERER__FORCE_JS", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn env_var_render_js_default_direct() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held for the duration of this test.
        unsafe { std::env::set_var("CRW_RENDERER__RENDER_JS_DEFAULT", "true") };
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.renderer.render_js_default, Some(true));
    }

    #[test]
    fn request_config_defaults_match_plan() {
        let r = RequestConfig::default();
        assert_eq!(r.deadline_ms_default, 8000);
    }

    #[test]
    fn renderer_phase_toggles_default_off_or_safe() {
        let r = RendererConfig::default();
        assert!(!r.chrome_intercept_resources);
        assert!(!r.chrome_intercept_stylesheets);
        assert!(r.chrome_host_intercept_disable.is_empty());
        assert_eq!(r.chrome_nav_budget_ms, 12_000);
        assert!(!r.chrome_context_pool_enabled);
        assert!(!r.use_predictor);
    }

    #[test]
    fn crawler_per_host_limiter_defaults() {
        let c = CrawlerConfig::default();
        assert_eq!(c.per_host_min_interval_ms, 0);
        assert_eq!(c.per_host_max_concurrent, 1);
    }

    #[test]
    fn env_var_overrides_toml_defaults() {
        let _g = lock_env();
        clear_renderer_env();
        // SAFETY: `ENV_LOCK` is held for the duration of this test.
        unsafe {
            std::env::set_var("CRW_SERVER__PORT", "4444");
            std::env::set_var("CRW_RENDERER__LIGHTPANDA__WS_URL", "ws://test:9999/");
        }
        let cfg = AppConfig::load().unwrap();
        clear_renderer_env();
        assert_eq!(cfg.server.port, 4444, "env var should override server.port");
        assert_eq!(
            cfg.renderer.lightpanda.as_ref().unwrap().ws_url,
            "ws://test:9999/",
            "env var should override renderer.lightpanda.ws_url"
        );
    }
}