1use anyhow::{Context, Result};
2use serde::{Deserialize, Serialize};
3use std::fs;
4use std::path::PathBuf;
5use std::env;
6
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub struct OmnivoreConfig {
9 #[serde(default)]
10 pub ai: AiConfig,
11
12 #[serde(default)]
13 pub extraction: ExtractionConfig,
14
15 #[serde(default)]
16 pub browser: BrowserConfig,
17
18 #[serde(default)]
19 pub output: OutputConfig,
20
21 #[serde(default)]
22 pub templates: TemplateConfig,
23
24 #[serde(default)]
25 pub advanced: AdvancedConfig,
26}
27
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct AiConfig {
30 pub openai_api_key: Option<String>,
31 pub model: String,
32 pub temperature: f32,
33 pub max_tokens: u32,
34 pub enable_natural_language: bool,
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct ExtractionConfig {
39 pub auto_detect_tables: bool,
40 pub auto_detect_forms: bool,
41 pub auto_detect_dropdowns: bool,
42 pub auto_detect_pagination: bool,
43 pub auto_detect_downloads: bool,
44 pub auto_extract_emails: bool,
45 pub auto_extract_phones: bool,
46 pub auto_extract_addresses: bool,
47 pub auto_follow_pagination: bool,
48 pub max_pagination_pages: u32,
49}
50
51#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct BrowserConfig {
53 pub driver_path: Option<String>,
54 pub headless: bool,
55 pub timeout: u32,
56 pub wait_for_dynamic: bool,
57 pub screenshot_errors: bool,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct OutputConfig {
62 pub default_format: String,
63 pub organize_output: bool,
64 pub compress_output: bool,
65 pub save_raw_html: bool,
66 pub save_screenshots: bool,
67 pub database_export: Option<DatabaseConfig>,
68 pub cloud_export: Option<CloudConfig>,
69 pub webhook_url: Option<String>,
70}
71
/// Database export settings, used as the optional `database_export` entry of
/// `OutputConfig`.
///
/// There is no `Default` impl: when the entry is present, every field must be
/// given.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatabaseConfig {
    /// Connection string for the target database (format not specified in
    /// this file).
    pub connection_string: String,
    /// Name of the table to export into.
    pub table_name: String,
    /// Batch size; presumably the number of rows written per batch — confirm
    /// against the exporter.
    pub batch_size: u32,
}
78
/// Cloud export settings, used as the optional `cloud_export` entry of
/// `OutputConfig`.
///
/// There is no `Default` impl: when the entry is present, `provider` and
/// `bucket` must be given.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CloudConfig {
    /// Cloud provider name (accepted values are not defined in this file).
    pub provider: String,
    /// Name of the target bucket.
    pub bucket: String,
    /// Credentials; `None` when not configured. NOTE(review): whether this is
    /// an inline secret or a path to a credentials file is not visible here.
    pub credentials: Option<String>,
}
85
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct TemplateConfig {
88 pub templates_dir: PathBuf,
89 pub default_template: Option<String>,
90}
91
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct AdvancedConfig {
94 pub max_workers: usize,
95 pub max_depth: u32,
96 pub respect_robots: bool,
97 pub user_agent: String,
98 pub rate_limit_ms: u64,
99 pub retry_attempts: u32,
100 pub deduplication: bool,
101}
102
103impl Default for OmnivoreConfig {
104 fn default() -> Self {
105 Self {
106 ai: AiConfig::default(),
107 extraction: ExtractionConfig::default(),
108 browser: BrowserConfig::default(),
109 output: OutputConfig::default(),
110 templates: TemplateConfig::default(),
111 advanced: AdvancedConfig::default(),
112 }
113 }
114}
115
116impl Default for AiConfig {
117 fn default() -> Self {
118 Self {
119 openai_api_key: None,
120 model: "gpt-4-turbo-preview".to_string(),
121 temperature: 0.3,
122 max_tokens: 2000,
123 enable_natural_language: false,
124 }
125 }
126}
127
128impl Default for ExtractionConfig {
129 fn default() -> Self {
130 Self {
131 auto_detect_tables: true,
132 auto_detect_forms: true,
133 auto_detect_dropdowns: true,
134 auto_detect_pagination: true,
135 auto_detect_downloads: true,
136 auto_extract_emails: true,
137 auto_extract_phones: true,
138 auto_extract_addresses: true,
139 auto_follow_pagination: false,
140 max_pagination_pages: 10,
141 }
142 }
143}
144
145impl Default for BrowserConfig {
146 fn default() -> Self {
147 Self {
148 driver_path: None,
149 headless: true,
150 timeout: 30,
151 wait_for_dynamic: true,
152 screenshot_errors: false,
153 }
154 }
155}
156
157impl Default for OutputConfig {
158 fn default() -> Self {
159 Self {
160 default_format: "json".to_string(),
161 organize_output: true,
162 compress_output: false,
163 save_raw_html: false,
164 save_screenshots: false,
165 database_export: None,
166 cloud_export: None,
167 webhook_url: None,
168 }
169 }
170}
171
172impl Default for TemplateConfig {
173 fn default() -> Self {
174 Self {
175 templates_dir: dirs::home_dir()
176 .unwrap_or_else(|| PathBuf::from("."))
177 .join(".omnivore")
178 .join("templates"),
179 default_template: None,
180 }
181 }
182}
183
184impl Default for AdvancedConfig {
185 fn default() -> Self {
186 Self {
187 max_workers: 10,
188 max_depth: 5,
189 respect_robots: true,
190 user_agent: "Omnivore/1.0".to_string(),
191 rate_limit_ms: 100,
192 retry_attempts: 3,
193 deduplication: true,
194 }
195 }
196}
197
198impl OmnivoreConfig {
199 pub fn load() -> Result<Self> {
200 let config_path = Self::config_path()?;
201
202 if !config_path.exists() {
203 return Ok(Self::default());
205 }
206
207 let content = fs::read_to_string(&config_path)
208 .context("Failed to read config file")?;
209
210 let config: Self = toml::from_str(&content)
211 .context("Failed to parse config file")?;
212
213 Ok(config)
214 }
215
216 pub fn save(&self) -> Result<()> {
217 let config_path = Self::config_path()?;
218
219 if let Some(parent) = config_path.parent() {
221 fs::create_dir_all(parent)
222 .context("Failed to create config directory")?;
223 }
224
225 let content = toml::to_string_pretty(self)
226 .context("Failed to serialize config")?;
227
228 fs::write(&config_path, content)
229 .context("Failed to write config file")?;
230
231 Ok(())
232 }
233
234 pub fn config_path() -> Result<PathBuf> {
235 if let Ok(path) = env::var("OMNIVORE_CONFIG") {
237 return Ok(PathBuf::from(path));
238 }
239
240 let home = dirs::home_dir()
242 .context("Failed to get home directory")?;
243
244 Ok(home.join(".omnivore").join("config.toml"))
245 }
246
247 pub fn merge_with_env(&mut self) {
248 if let Ok(key) = env::var("OMNIVORE_OPENAI_API_KEY") {
250 self.ai.openai_api_key = Some(key);
251 }
252
253 if let Ok(model) = env::var("OMNIVORE_AI_MODEL") {
254 self.ai.model = model;
255 }
256
257 if let Ok(webhook) = env::var("OMNIVORE_WEBHOOK_URL") {
258 self.output.webhook_url = Some(webhook);
259 }
260
261 if let Ok(ua) = env::var("OMNIVORE_USER_AGENT") {
262 self.advanced.user_agent = ua;
263 }
264 }
265
266 pub fn validate(&self) -> Result<()> {
267 if self.ai.enable_natural_language && self.ai.openai_api_key.is_none() {
269 anyhow::bail!("OpenAI API key is required when natural language mode is enabled");
270 }
271
272 if self.advanced.max_workers == 0 {
273 anyhow::bail!("max_workers must be greater than 0");
274 }
275
276 if self.advanced.max_depth == 0 {
277 anyhow::bail!("max_depth must be greater than 0");
278 }
279
280 Ok(())
281 }
282
283 pub fn is_configured(&self) -> bool {
284 self.ai.openai_api_key.is_some() || !self.ai.enable_natural_language
286 }
287
288 pub fn get_api_key(&self) -> Option<&str> {
289 self.ai.openai_api_key.as_deref()
290 }
291}
292
/// A named extraction template.
///
/// Templates are stored as YAML, one `<name>.yaml` file per template, in the
/// configured templates directory (see the `load` / `save` /
/// `list_templates` functions of this type).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractionTemplate {
    /// Template name; `save` uses it as the file name (without extension).
    pub name: String,
    /// Free-form description of the template.
    pub description: String,
    /// Version string of the template (format not defined in this file).
    pub version: String,
    /// Author of the template; `None` when not given.
    pub author: Option<String>,
    /// Pattern rules that make up the template.
    pub patterns: Vec<PatternRule>,
    /// Pipeline entries. NOTE(review): what each string refers to is not
    /// visible in this file.
    pub pipelines: Vec<String>,
    /// Optional output schema as an arbitrary JSON value.
    pub output_schema: Option<serde_json::Value>,
}
303
/// One pattern rule of an `ExtractionTemplate`.
///
/// The code that evaluates rules is not part of this file; the field
/// descriptions below follow the field names.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PatternRule {
    /// Name of the rule.
    pub name: String,
    /// Kind of pattern (accepted values are not defined in this file).
    pub pattern_type: String,
    /// Selector string; presumably interpreted according to `pattern_type` —
    /// confirm against the rule evaluator.
    pub selector: String,
    /// Items to extract. NOTE(review): whether these are attribute names,
    /// field names or something else is not visible here.
    pub extract: Vec<String>,
    /// Optional transform to apply; `None` when not given.
    pub transform: Option<String>,
    /// Marks the rule as required; presumably a missing match is then an
    /// error — confirm against the rule evaluator.
    pub required: bool,
}
313
314impl ExtractionTemplate {
315 pub fn load(name: &str) -> Result<Self> {
316 let config = OmnivoreConfig::load()?;
317 let template_path = config.templates.templates_dir.join(format!("{}.yaml", name));
318
319 if !template_path.exists() {
320 anyhow::bail!("Template '{}' not found", name);
321 }
322
323 let content = fs::read_to_string(&template_path)
324 .context("Failed to read template file")?;
325
326 let template: Self = serde_yaml::from_str(&content)
327 .context("Failed to parse template file")?;
328
329 Ok(template)
330 }
331
332 pub fn save(&self) -> Result<()> {
333 let config = OmnivoreConfig::load()?;
334 let template_path = config.templates.templates_dir.join(format!("{}.yaml", self.name));
335
336 fs::create_dir_all(&config.templates.templates_dir)
338 .context("Failed to create templates directory")?;
339
340 let content = serde_yaml::to_string(self)
341 .context("Failed to serialize template")?;
342
343 fs::write(&template_path, content)
344 .context("Failed to write template file")?;
345
346 Ok(())
347 }
348
349 pub fn list_templates() -> Result<Vec<String>> {
350 let config = OmnivoreConfig::load()?;
351 let templates_dir = &config.templates.templates_dir;
352
353 if !templates_dir.exists() {
354 return Ok(Vec::new());
355 }
356
357 let mut templates = Vec::new();
358
359 for entry in fs::read_dir(templates_dir)? {
360 let entry = entry?;
361 let path = entry.path();
362
363 if path.extension().and_then(|s| s.to_str()) == Some("yaml") {
364 if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
365 templates.push(stem.to_string());
366 }
367 }
368 }
369
370 Ok(templates)
371 }
372}