1use serde::{Deserialize, Serialize};
4
/// Configuration options for an extractor.
///
/// The type derives `Serialize` and `Deserialize`, so it can be written to and
/// read from any serde-supported format. The `sanitize`, `redact_pii`, and
/// `detect_injection` fields only exist when the crate is compiled with the
/// `sanitize` feature.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractorConfig {
    /// Length limit; the `Default` impl sets this to `100_000`.
    ///
    /// NOTE(review): presumably caps the size of extracted text — the unit
    /// (bytes vs. characters) is not visible here; confirm at the use site.
    pub max_length: usize,

    /// Timeout, in seconds per the field name; the `Default` impl sets `30`.
    ///
    /// NOTE(review): which operation this bounds is decided by the consumer
    /// of this config — confirm.
    pub timeout_secs: u64,

    /// Toggle for text cleaning; enabled by the `Default` impl.
    ///
    /// NOTE(review): the exact cleaning steps are defined elsewhere.
    pub clean_text: bool,

    /// Toggle for preserving whitespace; disabled by the `Default` impl.
    pub preserve_whitespace: bool,

    /// User-agent string.
    ///
    /// The `Default` impl builds `HanzoExtract/<crate version> (https://hanzo.ai)`.
    /// NOTE(review): presumably sent with outgoing HTTP requests — confirm.
    pub user_agent: String,

    /// Toggle for following redirects; enabled by the `Default` impl.
    pub follow_redirects: bool,

    /// Redirect limit; the `Default` impl sets `5`.
    ///
    /// NOTE(review): presumably only consulted when `follow_redirects` is
    /// `true` — verify against the code that reads it.
    pub max_redirects: usize,

    /// Toggle for sanitization; enabled by the `Default` impl.
    ///
    /// Only present when the `sanitize` feature is enabled.
    #[cfg(feature = "sanitize")]
    pub sanitize: bool,

    /// Toggle for PII redaction; enabled by the `Default` impl.
    ///
    /// Only present when the `sanitize` feature is enabled.
    #[cfg(feature = "sanitize")]
    pub redact_pii: bool,

    /// Toggle for injection detection; enabled by the `Default` impl.
    ///
    /// Only present when the `sanitize` feature is enabled.
    #[cfg(feature = "sanitize")]
    pub detect_injection: bool,
}
41
42impl Default for ExtractorConfig {
43 fn default() -> Self {
44 Self {
45 max_length: 100_000,
46 timeout_secs: 30,
47 clean_text: true,
48 preserve_whitespace: false,
49 user_agent: format!(
50 "HanzoExtract/{} (https://hanzo.ai)",
51 env!("CARGO_PKG_VERSION")
52 ),
53 follow_redirects: true,
54 max_redirects: 5,
55 #[cfg(feature = "sanitize")]
56 sanitize: true,
57 #[cfg(feature = "sanitize")]
58 redact_pii: true,
59 #[cfg(feature = "sanitize")]
60 detect_injection: true,
61 }
62 }
63}
64
65impl ExtractorConfig {
66 pub fn with_max_length(mut self, max_length: usize) -> Self {
68 self.max_length = max_length;
69 self
70 }
71
72 pub fn with_timeout(mut self, timeout_secs: u64) -> Self {
74 self.timeout_secs = timeout_secs;
75 self
76 }
77
78 pub fn with_clean_text(mut self, clean: bool) -> Self {
80 self.clean_text = clean;
81 self
82 }
83
84 #[cfg(feature = "sanitize")]
85 pub fn with_sanitize(mut self, sanitize: bool) -> Self {
87 self.sanitize = sanitize;
88 self
89 }
90
91 #[cfg(feature = "sanitize")]
92 pub fn with_redact_pii(mut self, redact: bool) -> Self {
94 self.redact_pii = redact;
95 self
96 }
97
98 #[cfg(feature = "sanitize")]
99 pub fn with_detect_injection(mut self, detect: bool) -> Self {
101 self.detect_injection = detect;
102 self
103 }
104}