// hanzo_extract/config.rs

//! Extractor configuration
2
3use serde::{Deserialize, Serialize};
4
5/// Configuration for content extraction
6#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct ExtractorConfig {
8    /// Maximum content length to extract (in characters)
9    pub max_length: usize,
10
11    /// Request timeout in seconds
12    pub timeout_secs: u64,
13
14    /// Whether to extract clean text (remove HTML tags, scripts, etc.)
15    pub clean_text: bool,
16
17    /// Whether to preserve whitespace formatting
18    pub preserve_whitespace: bool,
19
20    /// User agent for web requests
21    pub user_agent: String,
22
23    /// Whether to follow redirects
24    pub follow_redirects: bool,
25
26    /// Maximum redirects to follow
27    pub max_redirects: usize,
28
29    /// Whether to sanitize output via hanzo-guard
30    #[cfg(feature = "sanitize")]
31    pub sanitize: bool,
32
33    /// Whether to redact PII
34    #[cfg(feature = "sanitize")]
35    pub redact_pii: bool,
36
37    /// Whether to detect injection attempts
38    #[cfg(feature = "sanitize")]
39    pub detect_injection: bool,
40}
41
42impl Default for ExtractorConfig {
43    fn default() -> Self {
44        Self {
45            max_length: 100_000,
46            timeout_secs: 30,
47            clean_text: true,
48            preserve_whitespace: false,
49            user_agent: format!(
50                "HanzoExtract/{} (https://hanzo.ai)",
51                env!("CARGO_PKG_VERSION")
52            ),
53            follow_redirects: true,
54            max_redirects: 5,
55            #[cfg(feature = "sanitize")]
56            sanitize: true,
57            #[cfg(feature = "sanitize")]
58            redact_pii: true,
59            #[cfg(feature = "sanitize")]
60            detect_injection: true,
61        }
62    }
63}
64
65impl ExtractorConfig {
66    /// Create a new config with custom max length
67    pub fn with_max_length(mut self, max_length: usize) -> Self {
68        self.max_length = max_length;
69        self
70    }
71
72    /// Create a new config with custom timeout
73    pub fn with_timeout(mut self, timeout_secs: u64) -> Self {
74        self.timeout_secs = timeout_secs;
75        self
76    }
77
78    /// Enable or disable text cleaning
79    pub fn with_clean_text(mut self, clean: bool) -> Self {
80        self.clean_text = clean;
81        self
82    }
83
84    #[cfg(feature = "sanitize")]
85    /// Enable or disable sanitization
86    pub fn with_sanitize(mut self, sanitize: bool) -> Self {
87        self.sanitize = sanitize;
88        self
89    }
90
91    #[cfg(feature = "sanitize")]
92    /// Enable or disable PII redaction
93    pub fn with_redact_pii(mut self, redact: bool) -> Self {
94        self.redact_pii = redact;
95        self
96    }
97
98    #[cfg(feature = "sanitize")]
99    /// Enable or disable injection detection
100    pub fn with_detect_injection(mut self, detect: bool) -> Self {
101        self.detect_injection = detect;
102        self
103    }
104}