use serde::{Deserialize, Serialize};
/// Configuration options for an extractor.
///
/// The type can be serialized and deserialized through `serde`. The
/// `sanitize`, `redact_pii`, and `detect_injection` fields exist only when the
/// crate is compiled with the `sanitize` feature.
///
/// NOTE(review): how each option is interpreted is decided by the code that
/// consumes this struct, which is not visible from this definition. The field
/// descriptions below follow the field names and should be confirmed against
/// that code.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractorConfig {
    /// Upper length bound (default `100_000`).
    ///
    /// NOTE(review): presumably applies to the extracted text; whether it is
    /// counted in bytes or characters is not visible here — confirm.
    pub max_length: usize,

    /// Timeout expressed in seconds, per the field name (default `30`).
    pub timeout_secs: u64,

    /// Whether text cleaning is enabled (default `true`).
    pub clean_text: bool,

    /// Whether whitespace is preserved (default `false`).
    ///
    /// NOTE(review): interaction with `clean_text` is not visible here — confirm.
    pub preserve_whitespace: bool,

    /// User-agent string.
    ///
    /// Defaults to `HanzoExtract/<crate version> (https://hanzo.ai)`.
    pub user_agent: String,

    /// Whether redirects are followed (default `true`).
    pub follow_redirects: bool,

    /// Upper bound on the number of redirects (default `5`).
    pub max_redirects: usize,

    /// Whether sanitization is enabled (default `true`).
    #[cfg(feature = "sanitize")]
    pub sanitize: bool,

    /// Whether PII redaction is enabled (default `true`).
    #[cfg(feature = "sanitize")]
    pub redact_pii: bool,

    /// Whether injection detection is enabled (default `true`).
    #[cfg(feature = "sanitize")]
    pub detect_injection: bool,
}
impl Default for ExtractorConfig {
    /// Returns the baseline configuration.
    ///
    /// Defaults: `max_length = 100_000`, `timeout_secs = 30`,
    /// `clean_text = true`, `preserve_whitespace = false`,
    /// `follow_redirects = true`, `max_redirects = 5`, and a user agent of the
    /// form `HanzoExtract/<crate version> (https://hanzo.ai)`. When the
    /// `sanitize` feature is enabled, `sanitize`, `redact_pii`, and
    /// `detect_injection` all default to `true`.
    fn default() -> Self {
        // `CARGO_PKG_VERSION` is resolved at compile time, so the version in
        // the user agent always matches the crate that was built.
        let user_agent = format!(
            "HanzoExtract/{} (https://hanzo.ai)",
            env!("CARGO_PKG_VERSION")
        );

        Self {
            max_length: 100_000,
            timeout_secs: 30,
            clean_text: true,
            preserve_whitespace: false,
            user_agent,
            follow_redirects: true,
            max_redirects: 5,
            #[cfg(feature = "sanitize")]
            sanitize: true,
            #[cfg(feature = "sanitize")]
            redact_pii: true,
            #[cfg(feature = "sanitize")]
            detect_injection: true,
        }
    }
}
/// Chainable, consuming setters for [`ExtractorConfig`].
///
/// Each method takes the configuration by value, overwrites exactly one field,
/// and hands the configuration back. Because the receiver is consumed, a call
/// whose result is ignored throws the whole configuration away; every method
/// is therefore marked `#[must_use]` so the compiler warns about that mistake.
impl ExtractorConfig {
    /// Sets `max_length` and returns the updated configuration.
    #[must_use]
    pub fn with_max_length(mut self, max_length: usize) -> Self {
        self.max_length = max_length;
        self
    }

    /// Sets `timeout_secs` and returns the updated configuration.
    #[must_use]
    pub fn with_timeout(mut self, timeout_secs: u64) -> Self {
        self.timeout_secs = timeout_secs;
        self
    }

    /// Sets `clean_text` and returns the updated configuration.
    #[must_use]
    pub fn with_clean_text(mut self, clean: bool) -> Self {
        self.clean_text = clean;
        self
    }

    /// Sets `sanitize` and returns the updated configuration.
    ///
    /// Only available when the `sanitize` feature is enabled.
    #[cfg(feature = "sanitize")]
    #[must_use]
    pub fn with_sanitize(mut self, sanitize: bool) -> Self {
        self.sanitize = sanitize;
        self
    }

    /// Sets `redact_pii` and returns the updated configuration.
    ///
    /// Only available when the `sanitize` feature is enabled.
    #[cfg(feature = "sanitize")]
    #[must_use]
    pub fn with_redact_pii(mut self, redact: bool) -> Self {
        self.redact_pii = redact;
        self
    }

    /// Sets `detect_injection` and returns the updated configuration.
    ///
    /// Only available when the `sanitize` feature is enabled.
    #[cfg(feature = "sanitize")]
    #[must_use]
    pub fn with_detect_injection(mut self, detect: bool) -> Self {
        self.detect_injection = detect;
        self
    }
}