omnivore_core/
config.rs

1use anyhow::{Context, Result};
2use serde::{Deserialize, Serialize};
3use std::fs;
4use std::path::PathBuf;
5use std::env;
6
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub struct OmnivoreConfig {
9    #[serde(default)]
10    pub ai: AiConfig,
11    
12    #[serde(default)]
13    pub extraction: ExtractionConfig,
14    
15    #[serde(default)]
16    pub browser: BrowserConfig,
17    
18    #[serde(default)]
19    pub output: OutputConfig,
20    
21    #[serde(default)]
22    pub templates: TemplateConfig,
23    
24    #[serde(default)]
25    pub advanced: AdvancedConfig,
26}
27
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct AiConfig {
30    pub openai_api_key: Option<String>,
31    pub model: String,
32    pub temperature: f32,
33    pub max_tokens: u32,
34    pub enable_natural_language: bool,
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct ExtractionConfig {
39    pub auto_detect_tables: bool,
40    pub auto_detect_forms: bool,
41    pub auto_detect_dropdowns: bool,
42    pub auto_detect_pagination: bool,
43    pub auto_detect_downloads: bool,
44    pub auto_extract_emails: bool,
45    pub auto_extract_phones: bool,
46    pub auto_extract_addresses: bool,
47    pub auto_follow_pagination: bool,
48    pub max_pagination_pages: u32,
49}
50
51#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct BrowserConfig {
53    pub driver_path: Option<String>,
54    pub headless: bool,
55    pub timeout: u32,
56    pub wait_for_dynamic: bool,
57    pub screenshot_errors: bool,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct OutputConfig {
62    pub default_format: String,
63    pub organize_output: bool,
64    pub compress_output: bool,
65    pub save_raw_html: bool,
66    pub save_screenshots: bool,
67    pub database_export: Option<DatabaseConfig>,
68    pub cloud_export: Option<CloudConfig>,
69    pub webhook_url: Option<String>,
70}
71
72#[derive(Debug, Clone, Serialize, Deserialize)]
73pub struct DatabaseConfig {
74    pub connection_string: String,
75    pub table_name: String,
76    pub batch_size: u32,
77}
78
79#[derive(Debug, Clone, Serialize, Deserialize)]
80pub struct CloudConfig {
81    pub provider: String, // "s3", "gcs", "azure"
82    pub bucket: String,
83    pub credentials: Option<String>,
84}
85
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct TemplateConfig {
88    pub templates_dir: PathBuf,
89    pub default_template: Option<String>,
90}
91
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct AdvancedConfig {
94    pub max_workers: usize,
95    pub max_depth: u32,
96    pub respect_robots: bool,
97    pub user_agent: String,
98    pub rate_limit_ms: u64,
99    pub retry_attempts: u32,
100    pub deduplication: bool,
101}
102
103impl Default for OmnivoreConfig {
104    fn default() -> Self {
105        Self {
106            ai: AiConfig::default(),
107            extraction: ExtractionConfig::default(),
108            browser: BrowserConfig::default(),
109            output: OutputConfig::default(),
110            templates: TemplateConfig::default(),
111            advanced: AdvancedConfig::default(),
112        }
113    }
114}
115
116impl Default for AiConfig {
117    fn default() -> Self {
118        Self {
119            openai_api_key: None,
120            model: "gpt-4-turbo-preview".to_string(),
121            temperature: 0.3,
122            max_tokens: 2000,
123            enable_natural_language: false,
124        }
125    }
126}
127
128impl Default for ExtractionConfig {
129    fn default() -> Self {
130        Self {
131            auto_detect_tables: true,
132            auto_detect_forms: true,
133            auto_detect_dropdowns: true,
134            auto_detect_pagination: true,
135            auto_detect_downloads: true,
136            auto_extract_emails: true,
137            auto_extract_phones: true,
138            auto_extract_addresses: true,
139            auto_follow_pagination: false,
140            max_pagination_pages: 10,
141        }
142    }
143}
144
145impl Default for BrowserConfig {
146    fn default() -> Self {
147        Self {
148            driver_path: None,
149            headless: true,
150            timeout: 30,
151            wait_for_dynamic: true,
152            screenshot_errors: false,
153        }
154    }
155}
156
157impl Default for OutputConfig {
158    fn default() -> Self {
159        Self {
160            default_format: "json".to_string(),
161            organize_output: true,
162            compress_output: false,
163            save_raw_html: false,
164            save_screenshots: false,
165            database_export: None,
166            cloud_export: None,
167            webhook_url: None,
168        }
169    }
170}
171
172impl Default for TemplateConfig {
173    fn default() -> Self {
174        Self {
175            templates_dir: dirs::home_dir()
176                .unwrap_or_else(|| PathBuf::from("."))
177                .join(".omnivore")
178                .join("templates"),
179            default_template: None,
180        }
181    }
182}
183
184impl Default for AdvancedConfig {
185    fn default() -> Self {
186        Self {
187            max_workers: 10,
188            max_depth: 5,
189            respect_robots: true,
190            user_agent: "Omnivore/1.0".to_string(),
191            rate_limit_ms: 100,
192            retry_attempts: 3,
193            deduplication: true,
194        }
195    }
196}
197
198impl OmnivoreConfig {
199    pub fn load() -> Result<Self> {
200        let config_path = Self::config_path()?;
201        
202        if !config_path.exists() {
203            // Return default config if file doesn't exist
204            return Ok(Self::default());
205        }
206        
207        let content = fs::read_to_string(&config_path)
208            .context("Failed to read config file")?;
209        
210        let config: Self = toml::from_str(&content)
211            .context("Failed to parse config file")?;
212        
213        Ok(config)
214    }
215    
216    pub fn save(&self) -> Result<()> {
217        let config_path = Self::config_path()?;
218        
219        // Create config directory if it doesn't exist
220        if let Some(parent) = config_path.parent() {
221            fs::create_dir_all(parent)
222                .context("Failed to create config directory")?;
223        }
224        
225        let content = toml::to_string_pretty(self)
226            .context("Failed to serialize config")?;
227        
228        fs::write(&config_path, content)
229            .context("Failed to write config file")?;
230        
231        Ok(())
232    }
233    
234    pub fn config_path() -> Result<PathBuf> {
235        // Check for OMNIVORE_CONFIG env var first
236        if let Ok(path) = env::var("OMNIVORE_CONFIG") {
237            return Ok(PathBuf::from(path));
238        }
239        
240        // Default to ~/.omnivore/config.toml
241        let home = dirs::home_dir()
242            .context("Failed to get home directory")?;
243        
244        Ok(home.join(".omnivore").join("config.toml"))
245    }
246    
247    pub fn merge_with_env(&mut self) {
248        // Override with environment variables if present
249        if let Ok(key) = env::var("OMNIVORE_OPENAI_API_KEY") {
250            self.ai.openai_api_key = Some(key);
251        }
252        
253        if let Ok(model) = env::var("OMNIVORE_AI_MODEL") {
254            self.ai.model = model;
255        }
256        
257        if let Ok(webhook) = env::var("OMNIVORE_WEBHOOK_URL") {
258            self.output.webhook_url = Some(webhook);
259        }
260        
261        if let Ok(ua) = env::var("OMNIVORE_USER_AGENT") {
262            self.advanced.user_agent = ua;
263        }
264    }
265    
266    pub fn validate(&self) -> Result<()> {
267        // Validate configuration
268        if self.ai.enable_natural_language && self.ai.openai_api_key.is_none() {
269            anyhow::bail!("OpenAI API key is required when natural language mode is enabled");
270        }
271        
272        if self.advanced.max_workers == 0 {
273            anyhow::bail!("max_workers must be greater than 0");
274        }
275        
276        if self.advanced.max_depth == 0 {
277            anyhow::bail!("max_depth must be greater than 0");
278        }
279        
280        Ok(())
281    }
282    
283    pub fn is_configured(&self) -> bool {
284        // Check if basic configuration is complete
285        self.ai.openai_api_key.is_some() || !self.ai.enable_natural_language
286    }
287    
288    pub fn get_api_key(&self) -> Option<&str> {
289        self.ai.openai_api_key.as_deref()
290    }
291}
292
293#[derive(Debug, Clone, Serialize, Deserialize)]
294pub struct ExtractionTemplate {
295    pub name: String,
296    pub description: String,
297    pub version: String,
298    pub author: Option<String>,
299    pub patterns: Vec<PatternRule>,
300    pub pipelines: Vec<String>,
301    pub output_schema: Option<serde_json::Value>,
302}
303
304#[derive(Debug, Clone, Serialize, Deserialize)]
305pub struct PatternRule {
306    pub name: String,
307    pub pattern_type: String, // "css", "xpath", "regex", "json_path"
308    pub selector: String,
309    pub extract: Vec<String>,
310    pub transform: Option<String>,
311    pub required: bool,
312}
313
314impl ExtractionTemplate {
315    pub fn load(name: &str) -> Result<Self> {
316        let config = OmnivoreConfig::load()?;
317        let template_path = config.templates.templates_dir.join(format!("{}.yaml", name));
318        
319        if !template_path.exists() {
320            anyhow::bail!("Template '{}' not found", name);
321        }
322        
323        let content = fs::read_to_string(&template_path)
324            .context("Failed to read template file")?;
325        
326        let template: Self = serde_yaml::from_str(&content)
327            .context("Failed to parse template file")?;
328        
329        Ok(template)
330    }
331    
332    pub fn save(&self) -> Result<()> {
333        let config = OmnivoreConfig::load()?;
334        let template_path = config.templates.templates_dir.join(format!("{}.yaml", self.name));
335        
336        // Create templates directory if it doesn't exist
337        fs::create_dir_all(&config.templates.templates_dir)
338            .context("Failed to create templates directory")?;
339        
340        let content = serde_yaml::to_string(self)
341            .context("Failed to serialize template")?;
342        
343        fs::write(&template_path, content)
344            .context("Failed to write template file")?;
345        
346        Ok(())
347    }
348    
349    pub fn list_templates() -> Result<Vec<String>> {
350        let config = OmnivoreConfig::load()?;
351        let templates_dir = &config.templates.templates_dir;
352        
353        if !templates_dir.exists() {
354            return Ok(Vec::new());
355        }
356        
357        let mut templates = Vec::new();
358        
359        for entry in fs::read_dir(templates_dir)? {
360            let entry = entry?;
361            let path = entry.path();
362            
363            if path.extension().and_then(|s| s.to_str()) == Some("yaml") {
364                if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
365                    templates.push(stem.to_string());
366                }
367            }
368        }
369        
370        Ok(templates)
371    }
372}