Skip to main content

crw_core/
config.rs

1use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize)]
4pub struct AppConfig {
5    #[serde(default)]
6    pub server: ServerConfig,
7    #[serde(default)]
8    pub renderer: RendererConfig,
9    #[serde(default)]
10    pub crawler: CrawlerConfig,
11    #[serde(default)]
12    pub extraction: ExtractionConfig,
13    #[serde(default)]
14    pub auth: AuthConfig,
15}
16
17#[derive(Debug, Clone, Deserialize)]
18pub struct ServerConfig {
19    #[serde(default = "default_host")]
20    pub host: String,
21    #[serde(default = "default_port")]
22    pub port: u16,
23    #[serde(default = "default_request_timeout")]
24    pub request_timeout_secs: u64,
25}
26
27impl Default for ServerConfig {
28    fn default() -> Self {
29        Self {
30            host: default_host(),
31            port: default_port(),
32            request_timeout_secs: default_request_timeout(),
33        }
34    }
35}
36
/// Fallback for `ServerConfig::host`: the wildcard IPv4 address string.
fn default_host() -> String {
    String::from("0.0.0.0")
}

/// Fallback for `ServerConfig::port`.
fn default_port() -> u16 {
    const PORT: u16 = 3000;
    PORT
}

/// Fallback for `ServerConfig::request_timeout_secs`, in seconds.
fn default_request_timeout() -> u64 {
    const TIMEOUT_SECS: u64 = 60;
    TIMEOUT_SECS
}
46
47#[derive(Debug, Clone, Deserialize)]
48pub struct RendererConfig {
49    #[serde(default = "default_renderer_mode")]
50    pub mode: String,
51    #[serde(default = "default_page_timeout")]
52    pub page_timeout_ms: u64,
53    #[serde(default = "default_pool_size")]
54    pub pool_size: usize,
55    #[serde(default)]
56    pub lightpanda: Option<CdpEndpoint>,
57    #[serde(default)]
58    pub playwright: Option<CdpEndpoint>,
59    #[serde(default)]
60    pub chrome: Option<CdpEndpoint>,
61}
62
63impl Default for RendererConfig {
64    fn default() -> Self {
65        Self {
66            mode: default_renderer_mode(),
67            page_timeout_ms: default_page_timeout(),
68            pool_size: default_pool_size(),
69            lightpanda: None,
70            playwright: None,
71            chrome: None,
72        }
73    }
74}
75
/// Fallback for `RendererConfig::mode`.
fn default_renderer_mode() -> String {
    "auto".to_owned()
}

/// Fallback for `RendererConfig::page_timeout_ms`, in milliseconds.
fn default_page_timeout() -> u64 {
    30_000
}

/// Fallback for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    const POOL_SIZE: usize = 4;
    POOL_SIZE
}
85
86#[derive(Debug, Clone, Deserialize)]
87pub struct CdpEndpoint {
88    pub ws_url: String,
89}
90
91/// Stealth mode configuration for evading bot detection.
92#[derive(Debug, Clone, Deserialize)]
93pub struct StealthConfig {
94    /// Enable stealth mode globally.
95    #[serde(default)]
96    pub enabled: bool,
97    /// Custom user-agent pool. Empty = use built-in pool.
98    #[serde(default)]
99    pub user_agents: Vec<String>,
100    /// Jitter factor for rate limiting (0.0–1.0, default 0.2 = ±20%).
101    #[serde(default = "default_jitter")]
102    pub jitter_factor: f64,
103    /// Inject realistic browser headers (Accept, Sec-Fetch-*, etc.).
104    #[serde(default = "default_true")]
105    pub inject_headers: bool,
106}
107
108impl Default for StealthConfig {
109    fn default() -> Self {
110        Self {
111            enabled: false,
112            user_agents: vec![],
113            jitter_factor: default_jitter(),
114            inject_headers: true,
115        }
116    }
117}
118
/// Fallback for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    const JITTER: f64 = 0.2;
    JITTER
}
122
/// Built-in realistic user-agent pool used when stealth is enabled.
///
/// Entries must match strings real browsers actually send. In particular,
/// Safari and Chrome on macOS report the frozen platform token
/// `Intel Mac OS X 10_15_7` regardless of the installed macOS version, so
/// every macOS entry uses that token; a newer OS version in the token would
/// be a string no genuine browser produces.
pub const BUILTIN_UA_POOL: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
131
132#[derive(Debug, Clone, Deserialize)]
133pub struct CrawlerConfig {
134    #[serde(default = "default_concurrency")]
135    pub max_concurrency: usize,
136    #[serde(default = "default_rps")]
137    pub requests_per_second: f64,
138    #[serde(default = "default_true")]
139    pub respect_robots_txt: bool,
140    #[serde(default = "default_ua")]
141    pub user_agent: String,
142    #[serde(default = "default_depth")]
143    pub default_max_depth: u32,
144    #[serde(default = "default_max_pages")]
145    pub default_max_pages: u32,
146    /// HTTP/HTTPS proxy URL (e.g. "http://proxy:8080")
147    #[serde(default)]
148    pub proxy: Option<String>,
149    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
150    #[serde(default = "default_job_ttl")]
151    pub job_ttl_secs: u64,
152    #[serde(default)]
153    pub stealth: StealthConfig,
154}
155
156impl Default for CrawlerConfig {
157    fn default() -> Self {
158        Self {
159            max_concurrency: default_concurrency(),
160            requests_per_second: default_rps(),
161            respect_robots_txt: true,
162            user_agent: default_ua(),
163            default_max_depth: default_depth(),
164            default_max_pages: default_max_pages(),
165            proxy: None,
166            job_ttl_secs: default_job_ttl(),
167            stealth: StealthConfig::default(),
168        }
169    }
170}
171
/// Fallback for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    const CONCURRENCY: usize = 10;
    CONCURRENCY
}

/// Fallback for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    const RPS: f64 = 10.0;
    RPS
}

/// Fallback for boolean options that are on unless explicitly disabled.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}

/// Fallback for `CrawlerConfig::user_agent`.
fn default_ua() -> String {
    String::from("CRW/0.1")
}

/// Fallback for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    const DEPTH: u32 = 2;
    DEPTH
}

/// Fallback for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    const MAX_PAGES: u32 = 100;
    MAX_PAGES
}

/// Fallback for `CrawlerConfig::job_ttl_secs`, in seconds (one hour).
fn default_job_ttl() -> u64 {
    3_600
}
193
194#[derive(Debug, Clone, Deserialize)]
195pub struct ExtractionConfig {
196    #[serde(default = "default_format")]
197    pub default_format: String,
198    #[serde(default = "default_true_ext")]
199    pub only_main_content: bool,
200    #[serde(default)]
201    pub llm: Option<LlmConfig>,
202}
203
204impl Default for ExtractionConfig {
205    fn default() -> Self {
206        Self {
207            default_format: default_format(),
208            only_main_content: true,
209            llm: None,
210        }
211    }
212}
213
214#[derive(Debug, Clone, Deserialize)]
215pub struct LlmConfig {
216    #[serde(default = "default_llm_provider")]
217    pub provider: String,
218    pub api_key: String,
219    #[serde(default = "default_llm_model")]
220    pub model: String,
221    #[serde(default)]
222    pub base_url: Option<String>,
223    #[serde(default = "default_llm_max_tokens")]
224    pub max_tokens: u32,
225}
226
/// Fallback for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}

/// Fallback for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}

/// Fallback for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4_096
}
236
/// Fallback for `ExtractionConfig::default_format`.
fn default_format() -> String {
    "markdown".to_owned()
}

/// Fallback for `ExtractionConfig::only_main_content` (on unless disabled).
fn default_true_ext() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
243
244#[derive(Debug, Clone, Default, Deserialize)]
245pub struct AuthConfig {
246    #[serde(default)]
247    pub api_keys: Vec<String>,
248}
249
250impl AppConfig {
251    /// Load config from config.default.toml + environment variable overrides.
252    /// Env vars use `CRW_` prefix, `__` as separator. E.g. `CRW_SERVER__PORT=8080`.
253    pub fn load() -> Result<Self, config::ConfigError> {
254        let mut builder = config::Config::builder()
255            .add_source(config::File::with_name("config.default").required(false));
256
257        // Load optional override config file (e.g. config.docker.toml in containers).
258        if let Ok(extra) = std::env::var("CRW_CONFIG") {
259            builder = builder.add_source(config::File::with_name(&extra).required(true));
260        } else {
261            builder = builder.add_source(config::File::with_name("config.local").required(false));
262        }
263
264        let cfg = builder
265            .add_source(
266                config::Environment::with_prefix("CRW")
267                    .separator("__")
268                    .try_parsing(true),
269            )
270            .build()?;
271        cfg.try_deserialize()
272    }
273}