omnivore_core/
lib.rs

1pub mod crawler;
2pub mod error;
3pub mod extractor;
4pub mod graph;
5pub mod intelligence;
6pub mod parser;
7pub mod storage;
8pub mod table_extractor;
9pub mod config;
10pub mod ai;
11pub mod detector;
12
13#[cfg(feature = "browser")]
14pub mod browser;
15
16pub use error::{Error, Result};
17
18use serde::{Deserialize, Serialize};
19
20#[derive(Debug, Clone, Serialize, Deserialize)]
21pub struct CrawlConfig {
22    pub max_workers: usize,
23    pub max_depth: u32,
24    pub user_agent: String,
25    pub respect_robots_txt: bool,
26    pub politeness: PolitenessConfig,
27    pub timeout_ms: u64,
28    pub max_retries: u32,
29}
30
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct PolitenessConfig {
33    pub default_delay_ms: u64,
34    pub max_requests_per_second: f64,
35    pub backoff_multiplier: f64,
36}
37
38impl Default for CrawlConfig {
39    fn default() -> Self {
40        Self {
41            max_workers: 10,
42            max_depth: 10,
43            user_agent: "Omnivore/1.0".to_string(),
44            respect_robots_txt: true,
45            politeness: PolitenessConfig::default(),
46            timeout_ms: 30000,
47            max_retries: 3,
48        }
49    }
50}
51
52impl Default for PolitenessConfig {
53    fn default() -> Self {
54        Self {
55            default_delay_ms: 100,
56            max_requests_per_second: 10.0,
57            backoff_multiplier: 2.0,
58        }
59    }
60}
61
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct CrawlResult {
64    pub url: String,
65    pub status_code: u16,
66    pub content: String,
67    pub cleaned_content: Option<extractor::CleanedContent>,
68    pub headers: std::collections::HashMap<String, String>,
69    pub extracted_data: serde_json::Value,
70    pub links: Vec<String>,
71    pub crawled_at: chrono::DateTime<chrono::Utc>,
72}
73
74#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct CrawlStats {
76    pub total_urls: usize,
77    pub successful: usize,
78    pub failed: usize,
79    pub in_progress: usize,
80    pub average_response_time_ms: f64,
81    pub start_time: chrono::DateTime<chrono::Utc>,
82    pub elapsed_time: std::time::Duration,
83}