1use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize)]
4pub struct AppConfig {
5 #[serde(default)]
6 pub server: ServerConfig,
7 #[serde(default)]
8 pub renderer: RendererConfig,
9 #[serde(default)]
10 pub crawler: CrawlerConfig,
11 #[serde(default)]
12 pub extraction: ExtractionConfig,
13 #[serde(default)]
14 pub auth: AuthConfig,
15}
16
17#[derive(Debug, Clone, Deserialize)]
18pub struct ServerConfig {
19 #[serde(default = "default_host")]
20 pub host: String,
21 #[serde(default = "default_port")]
22 pub port: u16,
23 #[serde(default = "default_request_timeout")]
24 pub request_timeout_secs: u64,
25}
26
27impl Default for ServerConfig {
28 fn default() -> Self {
29 Self {
30 host: default_host(),
31 port: default_port(),
32 request_timeout_secs: default_request_timeout(),
33 }
34 }
35}
36
/// Fallback for `ServerConfig::host`: the string `"0.0.0.0"`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Fallback for `ServerConfig::port`: `3000`.
fn default_port() -> u16 {
    3000_u16
}
/// Fallback for `ServerConfig::request_timeout_secs`: `60`.
fn default_request_timeout() -> u64 {
    60_u64
}
46
47#[derive(Debug, Clone, Deserialize)]
48pub struct RendererConfig {
49 #[serde(default = "default_renderer_mode")]
50 pub mode: String,
51 #[serde(default = "default_page_timeout")]
52 pub page_timeout_ms: u64,
53 #[serde(default = "default_pool_size")]
54 pub pool_size: usize,
55 #[serde(default)]
56 pub lightpanda: Option<CdpEndpoint>,
57 #[serde(default)]
58 pub playwright: Option<CdpEndpoint>,
59 #[serde(default)]
60 pub chrome: Option<CdpEndpoint>,
61}
62
63impl Default for RendererConfig {
64 fn default() -> Self {
65 Self {
66 mode: default_renderer_mode(),
67 page_timeout_ms: default_page_timeout(),
68 pool_size: default_pool_size(),
69 lightpanda: None,
70 playwright: None,
71 chrome: None,
72 }
73 }
74}
75
/// Fallback for `RendererConfig::mode`: the string `"auto"`.
fn default_renderer_mode() -> String {
    "auto".to_owned()
}
/// Fallback for `RendererConfig::page_timeout_ms`: `30000`.
fn default_page_timeout() -> u64 {
    30_000
}
/// Fallback for `RendererConfig::pool_size`: `4`.
fn default_pool_size() -> usize {
    4_usize
}
85
86#[derive(Debug, Clone, Deserialize)]
87pub struct CdpEndpoint {
88 pub ws_url: String,
89}
90
91#[derive(Debug, Clone, Deserialize)]
93pub struct StealthConfig {
94 #[serde(default)]
96 pub enabled: bool,
97 #[serde(default)]
99 pub user_agents: Vec<String>,
100 #[serde(default = "default_jitter")]
102 pub jitter_factor: f64,
103 #[serde(default = "default_true")]
105 pub inject_headers: bool,
106}
107
108impl Default for StealthConfig {
109 fn default() -> Self {
110 Self {
111 enabled: false,
112 user_agents: vec![],
113 jitter_factor: default_jitter(),
114 inject_headers: true,
115 }
116 }
117}
118
/// Fallback for `StealthConfig::jitter_factor`: `0.2`.
fn default_jitter() -> f64 {
    0.2_f64
}
122
/// Built-in list of five desktop browser user-agent strings.
///
/// NOTE(review): presumably used by stealth mode when no custom
/// `user_agents` are configured — confirm against the caller.
pub const BUILTIN_UA_POOL: &[&str] = &[
    // Chrome 131 — Windows 10, x64
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 — macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Chrome 131 — Linux, x86_64
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133 — Windows 10, x64
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.2 — macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
131
132#[derive(Debug, Clone, Deserialize)]
133pub struct CrawlerConfig {
134 #[serde(default = "default_concurrency")]
135 pub max_concurrency: usize,
136 #[serde(default = "default_rps")]
137 pub requests_per_second: f64,
138 #[serde(default = "default_true")]
139 pub respect_robots_txt: bool,
140 #[serde(default = "default_ua")]
141 pub user_agent: String,
142 #[serde(default = "default_depth")]
143 pub default_max_depth: u32,
144 #[serde(default = "default_max_pages")]
145 pub default_max_pages: u32,
146 #[serde(default)]
148 pub proxy: Option<String>,
149 #[serde(default = "default_job_ttl")]
151 pub job_ttl_secs: u64,
152 #[serde(default)]
153 pub stealth: StealthConfig,
154}
155
156impl Default for CrawlerConfig {
157 fn default() -> Self {
158 Self {
159 max_concurrency: default_concurrency(),
160 requests_per_second: default_rps(),
161 respect_robots_txt: true,
162 user_agent: default_ua(),
163 default_max_depth: default_depth(),
164 default_max_pages: default_max_pages(),
165 proxy: None,
166 job_ttl_secs: default_job_ttl(),
167 stealth: StealthConfig::default(),
168 }
169 }
170}
171
/// Fallback for `CrawlerConfig::max_concurrency`: `10`.
fn default_concurrency() -> usize {
    10_usize
}
/// Fallback for `CrawlerConfig::requests_per_second`: `10.0`.
fn default_rps() -> f64 {
    10.0_f64
}
/// Always returns `true`.
///
/// Serde fallback for boolean fields that are on unless explicitly disabled
/// (`StealthConfig::inject_headers`, `CrawlerConfig::respect_robots_txt`).
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
/// Fallback for `CrawlerConfig::user_agent`: the string `"CRW/0.1"`.
fn default_ua() -> String {
    String::from("CRW/0.1")
}
/// Fallback for `CrawlerConfig::default_max_depth`: `2`.
fn default_depth() -> u32 {
    2_u32
}
/// Fallback for `CrawlerConfig::default_max_pages`: `100`.
fn default_max_pages() -> u32 {
    100_u32
}
/// Fallback for `CrawlerConfig::job_ttl_secs`: `3600` (one hour in seconds).
fn default_job_ttl() -> u64 {
    // 60 seconds * 60 minutes.
    60 * 60
}
193
194#[derive(Debug, Clone, Deserialize)]
195pub struct ExtractionConfig {
196 #[serde(default = "default_format")]
197 pub default_format: String,
198 #[serde(default = "default_true_ext")]
199 pub only_main_content: bool,
200 #[serde(default)]
201 pub llm: Option<LlmConfig>,
202}
203
204impl Default for ExtractionConfig {
205 fn default() -> Self {
206 Self {
207 default_format: default_format(),
208 only_main_content: true,
209 llm: None,
210 }
211 }
212}
213
214#[derive(Debug, Clone, Deserialize)]
215pub struct LlmConfig {
216 #[serde(default = "default_llm_provider")]
217 pub provider: String,
218 pub api_key: String,
219 #[serde(default = "default_llm_model")]
220 pub model: String,
221 #[serde(default)]
222 pub base_url: Option<String>,
223 #[serde(default = "default_llm_max_tokens")]
224 pub max_tokens: u32,
225}
226
/// Fallback for `LlmConfig::provider`: the string `"anthropic"`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Fallback for `LlmConfig::model`: the string `"claude-sonnet-4-20250514"`.
fn default_llm_model() -> String {
    "claude-sonnet-4-20250514".to_owned()
}
/// Fallback for `LlmConfig::max_tokens`: `4096`.
fn default_llm_max_tokens() -> u32 {
    4_096
}
236
/// Fallback for `ExtractionConfig::default_format`: the string `"markdown"`.
fn default_format() -> String {
    String::from("markdown")
}
/// Always returns `true`; serde fallback for
/// `ExtractionConfig::only_main_content`.
fn default_true_ext() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
243
244#[derive(Debug, Clone, Default, Deserialize)]
245pub struct AuthConfig {
246 #[serde(default)]
247 pub api_keys: Vec<String>,
248}
249
250impl AppConfig {
251 pub fn load() -> Result<Self, config::ConfigError> {
254 let mut builder = config::Config::builder()
255 .add_source(config::File::with_name("config.default").required(false));
256
257 if let Ok(extra) = std::env::var("CRW_CONFIG") {
259 builder = builder.add_source(config::File::with_name(&extra).required(true));
260 } else {
261 builder = builder.add_source(config::File::with_name("config.local").required(false));
262 }
263
264 let cfg = builder
265 .add_source(
266 config::Environment::with_prefix("CRW")
267 .separator("__")
268 .try_parsing(true),
269 )
270 .build()?;
271 cfg.try_deserialize()
272 }
273}