//! Core types for the Omnivore crawler: configuration, results, and stats.

pub mod ai;
pub mod config;
pub mod crawler;
pub mod detector;
pub mod error;
pub mod extractor;
pub mod graph;
pub mod intelligence;
pub mod parser;
pub mod storage;
pub mod table_extractor;

#[cfg(feature = "browser")]
pub mod browser;

pub use error::{Error, Result};

use serde::{Deserialize, Serialize};

/// Top-level configuration for a crawl session.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlConfig {
    /// Maximum number of concurrent worker tasks.
    pub max_workers: usize,
    /// Maximum link depth to follow from the seed URLs.
    pub max_depth: u32,
    pub user_agent: String,
    pub respect_robots_txt: bool,
    pub politeness: PolitenessConfig,
    /// Per-request timeout in milliseconds.
    pub timeout_ms: u64,
    pub max_retries: u32,
}

/// Rate-limiting settings that govern how politely hosts are crawled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PolitenessConfig {
    /// Default delay between successive requests, in milliseconds.
    pub default_delay_ms: u64,
    pub max_requests_per_second: f64,
    /// Factor applied to the delay when backing off.
    pub backoff_multiplier: f64,
}

impl Default for CrawlConfig {
    fn default() -> Self {
        Self {
            max_workers: 10,
            max_depth: 10,
            user_agent: "Omnivore/1.0".to_string(),
            respect_robots_txt: true,
            politeness: PolitenessConfig::default(),
            timeout_ms: 30_000,
            max_retries: 3,
        }
    }
}

impl Default for PolitenessConfig {
    fn default() -> Self {
        Self {
            default_delay_ms: 100,
            max_requests_per_second: 10.0,
            backoff_multiplier: 2.0,
        }
    }
}
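
// Illustrative sketch, not part of the original API: the `Default` impls
// above combine naturally with struct-update syntax, so callers can override
// a few fields and inherit the rest. The helper name `example_polite_config`
// is hypothetical.
#[allow(dead_code)]
fn example_polite_config() -> CrawlConfig {
    CrawlConfig {
        max_workers: 4, // fewer concurrent workers than the default 10
        politeness: PolitenessConfig {
            default_delay_ms: 500,        // slow down to half a second per request
            ..PolitenessConfig::default() // keep the default rate cap and backoff
        },
        ..CrawlConfig::default() // keep all remaining defaults
    }
}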

/// The outcome of fetching and processing a single URL.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlResult {
    pub url: String,
    pub status_code: u16,
    /// Raw response body.
    pub content: String,
    /// Cleaned content produced by the extractor, when available.
    pub cleaned_content: Option<extractor::CleanedContent>,
    pub headers: std::collections::HashMap<String, String>,
    pub extracted_data: serde_json::Value,
    /// Outbound links discovered on the page.
    pub links: Vec<String>,
    pub crawled_at: chrono::DateTime<chrono::Utc>,
}
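
// Illustrative sketch, not part of the original API: because `CrawlResult`
// derives `Serialize`/`Deserialize`, it round-trips through JSON via the
// `serde_json` crate (already a dependency, given the `serde_json::Value`
// field above). The helper name `result_to_json` is hypothetical.
#[allow(dead_code)]
fn result_to_json(result: &CrawlResult) -> serde_json::Result<String> {
    serde_json::to_string_pretty(result)
}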

/// Aggregate progress counters for a crawl in flight.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlStats {
    pub total_urls: usize,
    pub successful: usize,
    pub failed: usize,
    pub in_progress: usize,
    pub average_response_time_ms: f64,
    pub start_time: chrono::DateTime<chrono::Utc>,
    pub elapsed_time: std::time::Duration,
}
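
// Illustrative sketch, not part of the original API: derived metrics that
// follow directly from the counters above. Both helpers are hypothetical.
impl CrawlStats {
    /// Fraction of finished URLs that succeeded, in `0.0..=1.0`.
    #[allow(dead_code)]
    fn success_rate(&self) -> f64 {
        let finished = self.successful + self.failed;
        if finished == 0 {
            0.0
        } else {
            self.successful as f64 / finished as f64
        }
    }

    /// Average throughput in URLs per second over the elapsed time.
    #[allow(dead_code)]
    fn urls_per_second(&self) -> f64 {
        let secs = self.elapsed_time.as_secs_f64();
        if secs == 0.0 {
            0.0
        } else {
            (self.successful + self.failed) as f64 / secs
        }
    }
}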