//! `crw_core/config.rs` — application configuration types and loader.

use std::fmt;

use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize)]
4pub struct AppConfig {
5    #[serde(default)]
6    pub server: ServerConfig,
7    #[serde(default)]
8    pub renderer: RendererConfig,
9    #[serde(default)]
10    pub crawler: CrawlerConfig,
11    #[serde(default)]
12    pub extraction: ExtractionConfig,
13    #[serde(default)]
14    pub auth: AuthConfig,
15}
16
17#[derive(Debug, Clone, Deserialize)]
18pub struct ServerConfig {
19    #[serde(default = "default_host")]
20    pub host: String,
21    #[serde(default = "default_port")]
22    pub port: u16,
23    #[serde(default = "default_request_timeout")]
24    pub request_timeout_secs: u64,
25}
26
27impl Default for ServerConfig {
28    fn default() -> Self {
29        Self {
30            host: default_host(),
31            port: default_port(),
32            request_timeout_secs: default_request_timeout(),
33        }
34    }
35}
36
/// Fallback for `ServerConfig::host`: the IPv4 unspecified address `0.0.0.0`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Fallback for `ServerConfig::port`.
fn default_port() -> u16 {
    3000
}
/// Fallback for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    60
}
46
47#[derive(Debug, Clone, Deserialize)]
48pub struct RendererConfig {
49    #[serde(default = "default_renderer_mode")]
50    pub mode: String,
51    #[serde(default = "default_page_timeout")]
52    pub page_timeout_ms: u64,
53    #[serde(default = "default_pool_size")]
54    pub pool_size: usize,
55    #[serde(default)]
56    pub lightpanda: Option<CdpEndpoint>,
57    #[serde(default)]
58    pub playwright: Option<CdpEndpoint>,
59    #[serde(default)]
60    pub chrome: Option<CdpEndpoint>,
61}
62
63impl Default for RendererConfig {
64    fn default() -> Self {
65        Self {
66            mode: default_renderer_mode(),
67            page_timeout_ms: default_page_timeout(),
68            pool_size: default_pool_size(),
69            lightpanda: None,
70            playwright: None,
71            chrome: None,
72        }
73    }
74}
75
/// Fallback for `RendererConfig::mode`.
fn default_renderer_mode() -> String {
    String::from("auto")
}
/// Fallback for `RendererConfig::page_timeout_ms`.
fn default_page_timeout() -> u64 {
    30_000
}
/// Fallback for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    4
}
85
86#[derive(Debug, Clone, Deserialize)]
87pub struct CdpEndpoint {
88    pub ws_url: String,
89}
90
91#[derive(Debug, Clone, Deserialize)]
92pub struct CrawlerConfig {
93    #[serde(default = "default_concurrency")]
94    pub max_concurrency: usize,
95    #[serde(default = "default_rps")]
96    pub requests_per_second: f64,
97    #[serde(default = "default_true")]
98    pub respect_robots_txt: bool,
99    #[serde(default = "default_ua")]
100    pub user_agent: String,
101    #[serde(default = "default_depth")]
102    pub default_max_depth: u32,
103    #[serde(default = "default_max_pages")]
104    pub default_max_pages: u32,
105    /// HTTP/HTTPS proxy URL (e.g. "http://proxy:8080")
106    #[serde(default)]
107    pub proxy: Option<String>,
108    /// TTL in seconds for completed crawl jobs before cleanup (default: 3600)
109    #[serde(default = "default_job_ttl")]
110    pub job_ttl_secs: u64,
111}
112
113impl Default for CrawlerConfig {
114    fn default() -> Self {
115        Self {
116            max_concurrency: default_concurrency(),
117            requests_per_second: default_rps(),
118            respect_robots_txt: true,
119            user_agent: default_ua(),
120            default_max_depth: default_depth(),
121            default_max_pages: default_max_pages(),
122            proxy: None,
123            job_ttl_secs: default_job_ttl(),
124        }
125    }
126}
127
/// Fallback for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    10
}
/// Fallback for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    10.0
}
/// Fallback for `CrawlerConfig::respect_robots_txt`.
///
/// Needed because `#[serde(default)]` on a `bool` would yield `false`.
fn default_true() -> bool {
    true
}
/// Fallback for `CrawlerConfig::user_agent`.
fn default_ua() -> String {
    String::from("CRW/0.1")
}
/// Fallback for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    2
}
/// Fallback for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    100
}
/// Fallback for `CrawlerConfig::job_ttl_secs`.
fn default_job_ttl() -> u64 {
    3600
}
149
150#[derive(Debug, Clone, Deserialize)]
151pub struct ExtractionConfig {
152    #[serde(default = "default_format")]
153    pub default_format: String,
154    #[serde(default = "default_true_ext")]
155    pub only_main_content: bool,
156    #[serde(default)]
157    pub llm: Option<LlmConfig>,
158}
159
160impl Default for ExtractionConfig {
161    fn default() -> Self {
162        Self {
163            default_format: default_format(),
164            only_main_content: true,
165            llm: None,
166        }
167    }
168}
169
170#[derive(Debug, Clone, Deserialize)]
171pub struct LlmConfig {
172    #[serde(default = "default_llm_provider")]
173    pub provider: String,
174    pub api_key: String,
175    #[serde(default = "default_llm_model")]
176    pub model: String,
177    #[serde(default)]
178    pub base_url: Option<String>,
179    #[serde(default = "default_llm_max_tokens")]
180    pub max_tokens: u32,
181}
182
/// Fallback for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Fallback for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Fallback for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4096
}
192
/// Fallback for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Fallback for `ExtractionConfig::only_main_content`.
///
/// Needed because `#[serde(default)]` on a `bool` would yield `false`.
fn default_true_ext() -> bool {
    true
}
199
200#[derive(Debug, Clone, Default, Deserialize)]
201pub struct AuthConfig {
202    #[serde(default)]
203    pub api_keys: Vec<String>,
204}
205
206impl AppConfig {
207    /// Load config from config.default.toml + environment variable overrides.
208    /// Env vars use `CRW_` prefix, `__` as separator. E.g. `CRW_SERVER__PORT=8080`.
209    pub fn load() -> Result<Self, config::ConfigError> {
210        let mut builder = config::Config::builder()
211            .add_source(config::File::with_name("config.default").required(false));
212
213        // Load optional override config file (e.g. config.docker.toml in containers).
214        if let Ok(extra) = std::env::var("CRW_CONFIG") {
215            builder = builder.add_source(config::File::with_name(&extra).required(true));
216        } else {
217            builder = builder.add_source(config::File::with_name("config.local").required(false));
218        }
219
220        let cfg = builder
221            .add_source(
222                config::Environment::with_prefix("CRW")
223                    .separator("__")
224                    .try_parsing(true),
225            )
226            .build()?;
227        cfg.try_deserialize()
228    }
229}