1use serde::Deserialize;
2
3#[derive(Debug, Clone, Deserialize)]
4pub struct AppConfig {
5 #[serde(default)]
6 pub server: ServerConfig,
7 #[serde(default)]
8 pub renderer: RendererConfig,
9 #[serde(default)]
10 pub crawler: CrawlerConfig,
11 #[serde(default)]
12 pub extraction: ExtractionConfig,
13 #[serde(default)]
14 pub auth: AuthConfig,
15}
16
17#[derive(Debug, Clone, Deserialize)]
18pub struct ServerConfig {
19 #[serde(default = "default_host")]
20 pub host: String,
21 #[serde(default = "default_port")]
22 pub port: u16,
23 #[serde(default = "default_request_timeout")]
24 pub request_timeout_secs: u64,
25 #[serde(default = "default_rate_limit_rps")]
27 pub rate_limit_rps: u64,
28}
29
30impl Default for ServerConfig {
31 fn default() -> Self {
32 Self {
33 host: default_host(),
34 port: default_port(),
35 request_timeout_secs: default_request_timeout(),
36 rate_limit_rps: default_rate_limit_rps(),
37 }
38 }
39}
40
/// Default for `ServerConfig::rate_limit_rps`.
fn default_rate_limit_rps() -> u64 {
    10_u64
}
44
/// Default for `ServerConfig::host`.
fn default_host() -> String {
    String::from("0.0.0.0")
}
/// Default for `ServerConfig::port`.
fn default_port() -> u16 {
    3_000
}
/// Default for `ServerConfig::request_timeout_secs`.
fn default_request_timeout() -> u64 {
    60_u64
}
54
55#[derive(Debug, Clone, Deserialize)]
56pub struct RendererConfig {
57 #[serde(default = "default_renderer_mode")]
58 pub mode: String,
59 #[serde(default = "default_page_timeout")]
60 pub page_timeout_ms: u64,
61 #[serde(default = "default_pool_size")]
62 pub pool_size: usize,
63 #[serde(default)]
64 pub lightpanda: Option<CdpEndpoint>,
65 #[serde(default)]
66 pub playwright: Option<CdpEndpoint>,
67 #[serde(default)]
68 pub chrome: Option<CdpEndpoint>,
69}
70
71impl Default for RendererConfig {
72 fn default() -> Self {
73 Self {
74 mode: default_renderer_mode(),
75 page_timeout_ms: default_page_timeout(),
76 pool_size: default_pool_size(),
77 lightpanda: None,
78 playwright: None,
79 chrome: None,
80 }
81 }
82}
83
/// Default for `RendererConfig::mode`.
fn default_renderer_mode() -> String {
    String::from("auto")
}
/// Default for `RendererConfig::page_timeout_ms`.
fn default_page_timeout() -> u64 {
    30_000
}
/// Default for `RendererConfig::pool_size`.
fn default_pool_size() -> usize {
    4_usize
}
93
94#[derive(Debug, Clone, Deserialize)]
95pub struct CdpEndpoint {
96 pub ws_url: String,
97}
98
99#[derive(Debug, Clone, Deserialize)]
101pub struct StealthConfig {
102 #[serde(default)]
104 pub enabled: bool,
105 #[serde(default)]
107 pub user_agents: Vec<String>,
108 #[serde(default = "default_jitter")]
110 pub jitter_factor: f64,
111 #[serde(default = "default_true")]
113 pub inject_headers: bool,
114}
115
116impl Default for StealthConfig {
117 fn default() -> Self {
118 Self {
119 enabled: false,
120 user_agents: vec![],
121 jitter_factor: default_jitter(),
122 inject_headers: true,
123 }
124 }
125}
126
/// Default for `StealthConfig::jitter_factor`.
fn default_jitter() -> f64 {
    0.2_f64
}
130
/// Built-in pool of five browser user-agent strings: three Chrome 131
/// variants (Windows, macOS, Linux), one Firefox 133 and one Safari 18.2.
// NOTE(review): presumably the fallback when `StealthConfig::user_agents` is empty — confirm at the use site.
pub const BUILTIN_UA_POOL: &[&str] = &[
    // Chrome 131
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    // Firefox 133
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
    // Safari 18.2
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
];
139
140#[derive(Debug, Clone, Deserialize)]
141pub struct CrawlerConfig {
142 #[serde(default = "default_concurrency")]
143 pub max_concurrency: usize,
144 #[serde(default = "default_rps")]
145 pub requests_per_second: f64,
146 #[serde(default = "default_true")]
147 pub respect_robots_txt: bool,
148 #[serde(default = "default_ua")]
149 pub user_agent: String,
150 #[serde(default = "default_depth")]
151 pub default_max_depth: u32,
152 #[serde(default = "default_max_pages")]
153 pub default_max_pages: u32,
154 #[serde(default)]
156 pub proxy: Option<String>,
157 #[serde(default = "default_job_ttl")]
159 pub job_ttl_secs: u64,
160 #[serde(default)]
161 pub stealth: StealthConfig,
162}
163
164impl Default for CrawlerConfig {
165 fn default() -> Self {
166 Self {
167 max_concurrency: default_concurrency(),
168 requests_per_second: default_rps(),
169 respect_robots_txt: true,
170 user_agent: default_ua(),
171 default_max_depth: default_depth(),
172 default_max_pages: default_max_pages(),
173 proxy: None,
174 job_ttl_secs: default_job_ttl(),
175 stealth: StealthConfig::default(),
176 }
177 }
178}
179
/// Default for `CrawlerConfig::max_concurrency`.
fn default_concurrency() -> usize {
    10_usize
}
/// Default for `CrawlerConfig::requests_per_second`.
fn default_rps() -> f64 {
    10.0_f64
}
/// Serde default helper returning `true`; referenced by
/// `StealthConfig::inject_headers` and `CrawlerConfig::respect_robots_txt`.
fn default_true() -> bool {
    true
}
/// Default for `CrawlerConfig::user_agent`.
fn default_ua() -> String {
    String::from("CRW/0.1")
}
/// Default for `CrawlerConfig::default_max_depth`.
fn default_depth() -> u32 {
    2_u32
}
/// Default for `CrawlerConfig::default_max_pages`.
fn default_max_pages() -> u32 {
    100_u32
}
/// Default for `CrawlerConfig::job_ttl_secs`.
fn default_job_ttl() -> u64 {
    3_600
}
201
202#[derive(Debug, Clone, Deserialize)]
203pub struct ExtractionConfig {
204 #[serde(default = "default_format")]
205 pub default_format: String,
206 #[serde(default = "default_true_ext")]
207 pub only_main_content: bool,
208 #[serde(default)]
209 pub llm: Option<LlmConfig>,
210}
211
212impl Default for ExtractionConfig {
213 fn default() -> Self {
214 Self {
215 default_format: default_format(),
216 only_main_content: true,
217 llm: None,
218 }
219 }
220}
221
222#[derive(Debug, Clone, Deserialize)]
223pub struct LlmConfig {
224 #[serde(default = "default_llm_provider")]
225 pub provider: String,
226 pub api_key: String,
227 #[serde(default = "default_llm_model")]
228 pub model: String,
229 #[serde(default)]
230 pub base_url: Option<String>,
231 #[serde(default = "default_llm_max_tokens")]
232 pub max_tokens: u32,
233}
234
/// Default for `LlmConfig::provider`.
fn default_llm_provider() -> String {
    String::from("anthropic")
}
/// Default for `LlmConfig::model`.
fn default_llm_model() -> String {
    String::from("claude-sonnet-4-20250514")
}
/// Default for `LlmConfig::max_tokens`.
fn default_llm_max_tokens() -> u32 {
    4_096
}
244
/// Default for `ExtractionConfig::default_format`.
fn default_format() -> String {
    String::from("markdown")
}
/// Default for `ExtractionConfig::only_main_content`; always `true`.
fn default_true_ext() -> bool {
    true
}
251
252#[derive(Debug, Clone, Default, Deserialize)]
253pub struct AuthConfig {
254 #[serde(default)]
255 pub api_keys: Vec<String>,
256}
257
258impl AppConfig {
259 pub fn load() -> Result<Self, config::ConfigError> {
262 let mut builder = config::Config::builder()
263 .add_source(config::File::with_name("config.default").required(false));
264
265 if let Ok(extra) = std::env::var("CRW_CONFIG") {
267 builder = builder.add_source(config::File::with_name(&extra).required(true));
268 } else {
269 builder = builder.add_source(config::File::with_name("config.local").required(false));
270 }
271
272 let cfg = builder
273 .add_source(
274 config::Environment::with_prefix("CRW")
275 .separator("__")
276 .try_parsing(true),
277 )
278 .build()?;
279 cfg.try_deserialize()
280 }
281}