Skip to main content

crw_core/
config.rs

1use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize)]
4pub struct AppConfig {
5    #[serde(default)]
6    pub server: ServerConfig,
7    #[serde(default)]
8    pub renderer: RendererConfig,
9    #[serde(default)]
10    pub crawler: CrawlerConfig,
11    #[serde(default)]
12    pub extraction: ExtractionConfig,
13    #[serde(default)]
14    pub auth: AuthConfig,
15}
16
17#[derive(Debug, Clone, Deserialize)]
18pub struct ServerConfig {
19    #[serde(default = "default_host")]
20    pub host: String,
21    #[serde(default = "default_port")]
22    pub port: u16,
23    #[serde(default = "default_request_timeout")]
24    pub request_timeout_secs: u64,
25    /// Maximum requests per second (global). 0 = unlimited.
26    #[serde(default = "default_rate_limit_rps")]
27    pub rate_limit_rps: u64,
28}
29
30impl Default for ServerConfig {
31    fn default() -> Self {
32        Self {
33            host: default_host(),
34            port: default_port(),
35            request_timeout_secs: default_request_timeout(),
36            rate_limit_rps: default_rate_limit_rps(),
37        }
38    }
39}
40
/// Serde fallback for `ServerConfig::rate_limit_rps`: 10 requests per second.
fn default_rate_limit_rps() -> u64 {
    10_u64
}
44
/// Serde fallback for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}

/// Serde fallback for `ServerConfig::port`.
fn default_port() -> u16 {
    3_000
}

/// Serde fallback for `ServerConfig::request_timeout_secs` (seconds).
fn default_request_timeout() -> u64 {
    60_u64
}
54
55#[derive(Debug, Clone, Deserialize)]
56pub struct RendererConfig {
57    #[serde(default = "default_renderer_mode")]
58    pub mode: String,
59    #[serde(default = "default_page_timeout")]
60    pub page_timeout_ms: u64,
61    #[serde(default = "default_pool_size")]
62    pub pool_size: usize,
63    #[serde(default)]
64    pub lightpanda: Option<CdpEndpoint>,
65    #[serde(default)]
66    pub playwright: Option<CdpEndpoint>,
67    #[serde(default)]
68    pub chrome: Option<CdpEndpoint>,
69}
70
71impl Default for RendererConfig {
72    fn default() -> Self {
73        Self {
74            mode: default_renderer_mode(),
75            page_timeout_ms: default_page_timeout(),
76            pool_size: default_pool_size(),
77            lightpanda: None,
78            playwright: None,
79            chrome: None,
80        }
81    }
82}
83
/// Serde fallback for `RendererConfig::mode`.
fn default_renderer_mode() -> String {
    String::from("auto")
}

/// Serde fallback for `RendererConfig::page_timeout_ms` (milliseconds).
fn default_page_timeout() -> u64 {
    30_000
}

/// Serde fallback for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    4_usize
}
93
94#[derive(Debug, Clone, Deserialize)]
95pub struct CdpEndpoint {
96    pub ws_url: String,
97}
98
99/// Stealth mode configuration for evading bot detection.
100#[derive(Debug, Clone, Deserialize)]
101pub struct StealthConfig {
102    /// Enable stealth mode globally.
103    #[serde(default)]
104    pub enabled: bool,
105    /// Custom user-agent pool. Empty = use built-in pool.
106    #[serde(default)]
107    pub user_agents: Vec<String>,
108    /// Jitter factor for rate limiting (0.0–1.0, default 0.2 = ±20%).
109    #[serde(default = "default_jitter")]
110    pub jitter_factor: f64,
111    /// Inject realistic browser headers (Accept, Sec-Fetch-*, etc.).
112    #[serde(default = "default_true")]
113    pub inject_headers: bool,
114}
115
116impl Default for StealthConfig {
117    fn default() -> Self {
118        Self {
119            enabled: false,
120            user_agents: vec![],
121            jitter_factor: default_jitter(),
122            inject_headers: true,
123        }
124    }
125}
126
/// Serde fallback for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    0.2_f64
}
130
/// Built-in pool of realistic user-agent strings, used when stealth is enabled.
///
/// Contains three Chrome 131 entries (Windows, macOS, Linux), one Firefox 133 entry
/// (Windows) and one Safari 18.2 entry (macOS).
pub const BUILTIN_UA_POOL: &[&str] = &[
    // Chrome 131 on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 on Linux
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133 on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.2 on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
139
140#[derive(Debug, Clone, Deserialize)]
141pub struct CrawlerConfig {
142    #[serde(default = "default_concurrency")]
143    pub max_concurrency: usize,
144    #[serde(default = "default_rps")]
145    pub requests_per_second: f64,
146    #[serde(default = "default_true")]
147    pub respect_robots_txt: bool,
148    #[serde(default = "default_ua")]
149    pub user_agent: String,
150    #[serde(default = "default_depth")]
151    pub default_max_depth: u32,
152    #[serde(default = "default_max_pages")]
153    pub default_max_pages: u32,
154    /// HTTP/HTTPS proxy URL (e.g. "http://proxy:8080")
155    #[serde(default)]
156    pub proxy: Option<String>,
157    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
158    #[serde(default = "default_job_ttl")]
159    pub job_ttl_secs: u64,
160    #[serde(default)]
161    pub stealth: StealthConfig,
162}
163
164impl Default for CrawlerConfig {
165    fn default() -> Self {
166        Self {
167            max_concurrency: default_concurrency(),
168            requests_per_second: default_rps(),
169            respect_robots_txt: true,
170            user_agent: default_ua(),
171            default_max_depth: default_depth(),
172            default_max_pages: default_max_pages(),
173            proxy: None,
174            job_ttl_secs: default_job_ttl(),
175            stealth: StealthConfig::default(),
176        }
177    }
178}
179
/// Serde fallback for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    10_usize
}

/// Serde fallback for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    10.0_f64
}

/// Serde fallback for boolean fields that are on unless configured otherwise.
fn default_true() -> bool {
    true
}

/// Serde fallback for `CrawlerConfig::user_agent`.
fn default_ua() -> String {
    String::from("CRW/0.1")
}

/// Serde fallback for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    2_u32
}

/// Serde fallback for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    100_u32
}

/// Serde fallback for `CrawlerConfig::job_ttl_secs` (seconds).
fn default_job_ttl() -> u64 {
    3_600
}
201
202#[derive(Debug, Clone, Deserialize)]
203pub struct ExtractionConfig {
204    #[serde(default = "default_format")]
205    pub default_format: String,
206    #[serde(default = "default_true_ext")]
207    pub only_main_content: bool,
208    #[serde(default)]
209    pub llm: Option<LlmConfig>,
210}
211
212impl Default for ExtractionConfig {
213    fn default() -> Self {
214        Self {
215            default_format: default_format(),
216            only_main_content: true,
217            llm: None,
218        }
219    }
220}
221
222#[derive(Debug, Clone, Deserialize)]
223pub struct LlmConfig {
224    #[serde(default = "default_llm_provider")]
225    pub provider: String,
226    pub api_key: String,
227    #[serde(default = "default_llm_model")]
228    pub model: String,
229    #[serde(default)]
230    pub base_url: Option<String>,
231    #[serde(default = "default_llm_max_tokens")]
232    pub max_tokens: u32,
233}
234
/// Serde fallback for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}

/// Serde fallback for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}

/// Serde fallback for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4_096
}
244
/// Serde fallback for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}

/// Serde fallback for `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    true
}
251
252#[derive(Debug, Clone, Default, Deserialize)]
253pub struct AuthConfig {
254    #[serde(default)]
255    pub api_keys: Vec<String>,
256}
257
258impl AppConfig {
259    /// Load config from config.default.toml + environment variable overrides.
260    /// Env vars use `CRW_` prefix, `__` as separator. E.g. `CRW_SERVER__PORT=8080`.
261    pub fn load() -> Result<Self, config::ConfigError> {
262        let mut builder = config::Config::builder()
263            .add_source(config::File::with_name("config.default").required(false));
264
265        // Load optional override config file (e.g. config.docker.toml in containers).
266        if let Ok(extra) = std::env::var("CRW_CONFIG") {
267            builder = builder.add_source(config::File::with_name(&extra).required(true));
268        } else {
269            builder = builder.add_source(config::File::with_name("config.local").required(false));
270        }
271
272        let cfg = builder
273            .add_source(
274                config::Environment::with_prefix("CRW")
275                    .separator("__")
276                    .try_parsing(true),
277            )
278            .build()?;
279        cfg.try_deserialize()
280    }
281}