use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
use serde::{Deserialize, Serialize};
use super::AssetCategory;
/// Bookkeeping recorded alongside an extraction result.
///
/// All fields except `chunks_processed` are optional and deserialize to `None`
/// when absent. Keys that do not match a field are rejected on deserialization
/// (`deny_unknown_fields`).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ExtractionMeta {
/// Cost attributed to the extraction, if known.
/// NOTE(review): currency/unit is not established in this file — confirm with the producer.
pub cost: Option<f64>,
/// Number of prompt tokens, if reported (presumably by the model backend — verify).
pub prompt_tokens: Option<u64>,
/// Number of completion tokens, if reported (presumably by the model backend — verify).
pub completion_tokens: Option<u64>,
/// Identifier of the model used, if known.
pub model: Option<String>,
/// Count of chunks processed. Not optional, so it must be present when deserializing.
pub chunks_processed: usize,
}
/// Selects whether a browser is used, as configured in `BrowserConfig::mode`.
///
/// Serialized in snake_case: `auto`, `always`, `never`. The default is `Auto`.
/// NOTE(review): the exact decision logic for each variant lives outside this
/// file; the per-variant notes below are inferred from the names — confirm.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum BrowserMode {
/// Default variant; presumably lets the crawler decide per page.
#[default]
Auto,
/// Presumably forces browser use for every page.
Always,
/// Presumably disables browser use entirely.
Never,
}
/// Wait strategy stored in `BrowserConfig::wait`.
///
/// Serialized in snake_case: `network_idle`, `selector`, `fixed`. The default
/// is `NetworkIdle`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum BrowserWait {
/// Default variant; presumably waits until network activity settles — confirm.
#[default]
NetworkIdle,
/// Wait keyed on a selector. `CrawlConfig::validate` requires
/// `browser.wait_selector` to be set when this variant is chosen.
Selector,
/// Presumably waits for a fixed amount of time — confirm which field supplies it.
Fixed,
}
/// Serde adapter (`#[serde(with = "duration_ms")]`) that represents a
/// [`Duration`](std::time::Duration) as a whole number of milliseconds.
pub(crate) mod duration_ms {
    use serde::{Deserialize, Deserializer, Serialize, Serializer};
    use std::time::Duration;

    /// Writes `d` as a `u64` count of milliseconds.
    ///
    /// `Duration::as_millis` returns `u128`, but `deserialize` below reads a
    /// `u64`, and serializers that do not override `serialize_u128` reject
    /// 128-bit integers outright. Emitting `u64` keeps both directions
    /// symmetric. Durations longer than `u64::MAX` ms saturate instead of
    /// wrapping.
    ///
    /// # Errors
    /// Returns whatever error the serializer reports for the integer.
    pub fn serialize<S: Serializer>(d: &Duration, s: S) -> Result<S::Ok, S::Error> {
        let millis = u64::try_from(d.as_millis()).unwrap_or(u64::MAX);
        millis.serialize(s)
    }

    /// Reads a `u64` count of milliseconds and converts it to a `Duration`.
    ///
    /// # Errors
    /// Fails if the input is not representable as a `u64`.
    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<Duration, D::Error> {
        let ms = u64::deserialize(d)?;
        Ok(Duration::from_millis(ms))
    }
}
/// Serde adapter (`#[serde(with = "option_duration_ms")]`) that represents an
/// `Option<Duration>` as an optional whole number of milliseconds.
pub(crate) mod option_duration_ms {
    use serde::{Deserialize, Deserializer, Serialize, Serializer};
    use std::time::Duration;

    /// Writes `Some(d)` as a `u64` count of milliseconds and `None` as the
    /// format's "none" value.
    ///
    /// The millisecond count is narrowed from `u128` with a saturating
    /// conversion: a plain `as u64` cast would silently wrap for durations
    /// longer than `u64::MAX` ms and serialize an unrelated small number.
    ///
    /// # Errors
    /// Returns whatever error the serializer reports.
    pub fn serialize<S: Serializer>(d: &Option<Duration>, s: S) -> Result<S::Ok, S::Error> {
        let millis: Option<u64> = d
            .as_ref()
            .map(|d| u64::try_from(d.as_millis()).unwrap_or(u64::MAX));
        millis.serialize(s)
    }

    /// Reads an optional `u64` count of milliseconds and converts it to an
    /// `Option<Duration>`.
    ///
    /// # Errors
    /// Fails if a present value is not representable as a `u64`.
    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<Option<Duration>, D::Error> {
        let ms: Option<u64> = Option::deserialize(d)?;
        Ok(ms.map(Duration::from_millis))
    }
}
/// Proxy settings: a proxy URL plus optional credentials.
///
/// Keys that do not match a field are rejected on deserialization
/// (`deny_unknown_fields`).
///
/// NOTE(review): `Debug` is derived, so `{:?}` output includes `password`
/// verbatim — take care when logging this value.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ProxyConfig {
/// Proxy URL. Not optional, so it must be present when deserializing.
/// NOTE(review): accepted schemes are not established in this file — confirm.
pub url: String,
/// Optional user name for the proxy.
pub username: Option<String>,
/// Optional password for the proxy.
pub password: Option<String>,
}
/// Authentication settings, serialized as an internally tagged enum.
///
/// The variant is selected by a `type` key whose value is the snake_case
/// variant name (`basic`, `bearer`, `header`); the variant's fields sit next to
/// that key. Keys that do not belong to the selected variant are rejected on
/// deserialization (`deny_unknown_fields`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields, tag = "type", rename_all = "snake_case")]
pub enum AuthConfig {
    /// User name and password pair (`type = "basic"`).
    Basic { username: String, password: String },
    /// Single token (`type = "bearer"`).
    Bearer { token: String },
    /// Arbitrary name/value pair (`type = "header"`).
    Header { name: String, value: String },
}
impl Default for AuthConfig {
    /// Returns the `Basic` variant with an empty user name and an empty password.
    fn default() -> Self {
        let (username, password) = (String::new(), String::new());
        Self::Basic { username, password }
    }
}
/// Browser-related settings, nested in `CrawlConfig::browser`.
///
/// The container-level `default` attribute makes every key optional in the
/// input; keys that do not match a field are rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct BrowserConfig {
/// Browser usage mode; `Default` sets `BrowserMode::Auto`.
pub mode: BrowserMode,
/// Optional endpoint string.
/// NOTE(review): presumably the address of a remote browser to attach to — confirm.
pub endpoint: Option<String>,
/// Browser timeout, (de)serialized as integer milliseconds via `duration_ms`.
/// `Default` sets 30 seconds.
#[serde(with = "duration_ms")]
pub timeout: Duration,
/// Wait strategy; `Default` sets `BrowserWait::NetworkIdle`.
pub wait: BrowserWait,
/// Selector string. `CrawlConfig::validate` requires this to be `Some` when
/// `wait` is `BrowserWait::Selector`.
pub wait_selector: Option<String>,
/// Optional extra wait, (de)serialized as integer milliseconds via
/// `option_duration_ms`; `None` when absent.
#[serde(default, with = "option_duration_ms")]
pub extra_wait: Option<Duration>,
}
impl Default for BrowserConfig {
    /// Builds the baseline browser settings: `Auto` mode, a 30-second timeout,
    /// the `NetworkIdle` wait strategy, and every optional field unset.
    fn default() -> Self {
        let timeout = Duration::from_secs(30);

        Self {
            timeout,
            mode: BrowserMode::default(),
            wait: BrowserWait::NetworkIdle,
            endpoint: None,
            wait_selector: None,
            extra_wait: None,
        }
    }
}
/// Top-level crawl configuration.
///
/// The container-level `default` attribute makes every key optional in the
/// input; keys that do not match a field are rejected (`deny_unknown_fields`).
/// Deserialization does not check value ranges — call `validate` for that.
///
/// NOTE(review): most fields are consumed outside this file. Where a comment
/// below describes intent rather than a default or a `validate` rule, it is
/// inferred from the field name and should be confirmed against the consumer.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields, default)]
pub struct CrawlConfig {
/// Optional depth limit; `None` by default.
pub max_depth: Option<usize>,
/// Optional page limit; `validate` rejects `Some(0)`.
pub max_pages: Option<usize>,
/// Optional concurrency limit; `validate` rejects `Some(0)`.
pub max_concurrent: Option<usize>,
/// Presumably toggles robots.txt handling; `false` by default.
pub respect_robots_txt: bool,
/// Optional single user-agent string.
pub user_agent: Option<String>,
/// Presumably restricts the crawl to the starting domain; `false` by default.
pub stay_on_domain: bool,
/// Presumably extends the domain restriction to subdomains; `false` by default.
pub allow_subdomains: bool,
/// Path patterns; `validate` requires each entry to compile as a regex.
#[serde(default)]
pub include_paths: Vec<String>,
/// Path patterns; `validate` requires each entry to compile as a regex.
#[serde(default)]
pub exclude_paths: Vec<String>,
/// Header name → value map; empty by default.
#[serde(default)]
pub custom_headers: HashMap<String, String>,
/// Request timeout, (de)serialized as integer milliseconds via `duration_ms`.
/// Defaults to 30 seconds; `validate` rejects a zero duration.
#[serde(with = "duration_ms")]
pub request_timeout: Duration,
/// Redirect limit. Defaults to 10; `validate` rejects values above 100.
pub max_redirects: usize,
/// Retry count; defaults to 0.
pub retry_count: usize,
/// Status codes associated with retrying; `validate` requires each to be in 100..=599.
#[serde(default)]
pub retry_codes: Vec<u16>,
/// Presumably toggles cookie handling; `false` by default.
pub cookies_enabled: bool,
/// Optional authentication settings.
pub auth: Option<AuthConfig>,
/// Optional body size limit (presumably bytes — confirm).
pub max_body_size: Option<usize>,
/// Presumably limits extraction to the main content of a page; `false` by default.
pub main_content_only: bool,
/// Tag names (presumably stripped from extracted content — confirm); empty by default.
#[serde(default)]
pub remove_tags: Vec<String>,
/// Optional limit for the "map" operation (semantics defined elsewhere).
pub map_limit: Option<usize>,
/// Optional search string for the "map" operation (semantics defined elsewhere).
pub map_search: Option<String>,
/// Presumably toggles asset downloading; `false` by default.
pub download_assets: bool,
/// Asset categories; empty by default.
#[serde(default)]
pub asset_types: Vec<AssetCategory>,
/// Optional asset size limit (presumably bytes — confirm).
pub max_asset_size: Option<usize>,
/// Nested browser settings; see `BrowserConfig`.
#[serde(default)]
pub browser: BrowserConfig,
/// Optional proxy settings.
pub proxy: Option<ProxyConfig>,
/// List of user-agent strings (presumably rotated between — confirm); empty by default.
#[serde(default)]
pub user_agents: Vec<String>,
/// Presumably toggles screenshot capture; `false` by default.
pub capture_screenshot: bool,
/// Presumably toggles document downloading; `true` by default.
pub download_documents: bool,
/// Optional document size limit; defaults to `Some(50 * 1024 * 1024)`.
pub document_max_size: Option<usize>,
/// MIME type strings for documents; empty by default.
#[serde(default)]
pub document_mime_types: Vec<String>,
/// Optional path for WARC output.
pub warc_output: Option<PathBuf>,
/// Optional browser profile identifier.
pub browser_profile: Option<String>,
/// Presumably toggles persisting the browser profile; `false` by default.
pub save_browser_profile: bool,
/// Shared browser pool handle. Only compiled with the `browser` feature and
/// never (de)serialized (`serde(skip)`); `Default` sets it to `None`.
#[cfg(feature = "browser")]
#[serde(skip)]
pub browser_pool: Option<std::sync::Arc<crate::browser_pool::BrowserPool>>,
}
impl Default for CrawlConfig {
    /// Builds the baseline crawl configuration.
    ///
    /// Every limit is unset and every toggle is off, with these exceptions:
    /// a 30-second request timeout, at most 10 redirects, document downloading
    /// enabled, and a 50 MiB document size cap.
    fn default() -> Self {
        const REQUEST_TIMEOUT: Duration = Duration::from_secs(30);
        const REDIRECT_LIMIT: usize = 10;
        const DOCUMENT_SIZE_CAP: usize = 50 * 1024 * 1024;

        Self {
            // Crawl limits.
            max_depth: None,
            max_pages: None,
            max_concurrent: None,
            map_limit: None,
            map_search: None,

            // Scope and politeness.
            respect_robots_txt: false,
            stay_on_domain: false,
            allow_subdomains: false,
            include_paths: Vec::new(),
            exclude_paths: Vec::new(),

            // HTTP behaviour.
            user_agent: None,
            user_agents: Vec::new(),
            custom_headers: HashMap::new(),
            request_timeout: REQUEST_TIMEOUT,
            max_redirects: REDIRECT_LIMIT,
            retry_count: 0,
            retry_codes: Vec::new(),
            cookies_enabled: false,
            auth: None,
            proxy: None,
            max_body_size: None,

            // Content handling.
            main_content_only: false,
            remove_tags: Vec::new(),

            // Assets.
            download_assets: false,
            asset_types: Vec::new(),
            max_asset_size: None,

            // Documents.
            download_documents: true,
            document_max_size: Some(DOCUMENT_SIZE_CAP),
            document_mime_types: Vec::new(),

            // Browser.
            browser: BrowserConfig::default(),
            capture_screenshot: false,
            browser_profile: None,
            save_browser_profile: false,
            #[cfg(feature = "browser")]
            browser_pool: None,

            // Output.
            warc_output: None,
        }
    }
}
impl CrawlConfig {
    /// Checks the configuration for values that cannot be used.
    ///
    /// Checks run in a fixed order and the first failure is returned:
    /// 1. `max_concurrent` must not be `Some(0)`.
    /// 2. `browser.wait_selector` must be set when `browser.wait` is `Selector`.
    /// 3. `max_pages` must not be `Some(0)`.
    /// 4. `max_redirects` must be at most 100.
    /// 5. Every `include_paths` entry, then every `exclude_paths` entry, must
    ///    compile as a regex.
    /// 6. Every `retry_codes` entry must lie in `100..=599`.
    /// 7. `request_timeout` must be non-zero.
    ///
    /// # Errors
    /// Returns `CrawlError::InvalidConfig` describing the first violated rule.
    pub fn validate(&self) -> Result<(), crate::error::CrawlError> {
        use crate::error::CrawlError;

        // Shorthand for the fixed-message failures.
        let reject = |reason: &str| -> Result<(), CrawlError> {
            Err(CrawlError::InvalidConfig(reason.into()))
        };

        if matches!(self.max_concurrent, Some(0)) {
            return reject("max_concurrent must be > 0");
        }

        let browser = &self.browser;
        if browser.wait == BrowserWait::Selector && browser.wait_selector.is_none() {
            return reject("browser.wait_selector required when browser.wait is Selector");
        }

        if matches!(self.max_pages, Some(0)) {
            return reject("max_pages must be > 0");
        }

        if self.max_redirects > 100 {
            return reject("max_redirects must be <= 100");
        }

        // Include patterns are checked before exclude patterns; only the
        // label in the error message differs between the two lists.
        let pattern_lists = [
            ("include_path", &self.include_paths),
            ("exclude_path", &self.exclude_paths),
        ];
        for (label, patterns) in pattern_lists {
            for pattern in patterns {
                if let Err(e) = regex::Regex::new(pattern) {
                    return Err(CrawlError::InvalidConfig(format!(
                        "invalid {label} regex '{pattern}': {e}"
                    )));
                }
            }
        }

        let bad_code = self
            .retry_codes
            .iter()
            .copied()
            .find(|code| !(100..=599).contains(code));
        if let Some(code) = bad_code {
            return Err(CrawlError::InvalidConfig(format!("invalid retry code: {code}")));
        }

        if self.request_timeout.is_zero() {
            return reject("request_timeout must be > 0");
        }

        Ok(())
    }
}