1use crate::masker::deduplicate;
4use crate::{
5 apply_mask, CloakError, Confidence, EntityType, Locale, MaskStrategy, PiiEntity, Recognizer,
6 RecognizerRegistry, Result,
7};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::time::Instant;
11
/// Collects the configuration for a [`Scanner`] and produces one via
/// [`ScannerBuilder::build`].
pub struct ScannerBuilder {
    /// Recognizers registered so far; `build` fails while this is empty.
    registry: RecognizerRegistry,
    /// Locale handed to the registry when scanning.
    locale: Locale,
    /// Mask strategy applied to scanned text; `None` disables masking.
    strategy: Option<MaskStrategy>,
    /// Findings with a confidence below this value are dropped during a scan.
    min_confidence: Confidence,
}
28
29impl Default for ScannerBuilder {
30 fn default() -> Self {
31 Self {
32 registry: RecognizerRegistry::new(),
33 locale: Locale::Universal,
34 strategy: Some(MaskStrategy::default()),
35 min_confidence: Confidence::ZERO,
36 }
37 }
38}
39
40impl ScannerBuilder {
41 #[must_use]
43 pub fn new() -> Self {
44 Self::default()
45 }
46
47 #[must_use]
49 pub fn from_registry(registry: RecognizerRegistry) -> Self {
50 Self {
51 registry,
52 ..Self::default()
53 }
54 }
55
56 #[must_use]
58 pub fn locale(mut self, locale: Locale) -> Self {
59 self.locale = locale;
60 self
61 }
62
63 #[must_use]
65 pub fn strategy(mut self, strategy: MaskStrategy) -> Self {
66 self.strategy = Some(strategy);
67 self
68 }
69
70 #[must_use]
72 pub fn without_masking(mut self) -> Self {
73 self.strategy = None;
74 self
75 }
76
77 #[must_use]
79 pub fn recognizer<R>(mut self, recognizer: R) -> Self
80 where
81 R: Recognizer + 'static,
82 {
83 self.registry.register(recognizer);
84 self
85 }
86
87 #[must_use]
89 pub fn boxed_recognizer(mut self, recognizer: Box<dyn Recognizer>) -> Self {
90 self.registry.register_boxed(recognizer);
91 self
92 }
93
94 pub fn min_confidence(mut self, confidence: f64) -> Result<Self> {
96 self.min_confidence = Confidence::new(confidence)?;
97 Ok(self)
98 }
99
100 pub fn build(self) -> Result<Scanner> {
102 if self.registry.is_empty() {
103 return Err(CloakError::NoRecognizers);
104 }
105
106 Ok(Scanner {
107 registry: self.registry,
108 locale: self.locale,
109 strategy: self.strategy,
110 min_confidence: self.min_confidence,
111 })
112 }
113}
114
/// Configured PII scanner, produced by [`ScannerBuilder::build`].
pub struct Scanner {
    /// Recognizers consulted on every scan.
    registry: RecognizerRegistry,
    /// Locale handed to the registry when scanning.
    locale: Locale,
    /// Mask strategy applied to scanned text; `None` means no masked text is produced.
    strategy: Option<MaskStrategy>,
    /// Findings with a confidence below this value are dropped.
    min_confidence: Confidence,
}
122
123impl Scanner {
124 #[must_use]
126 pub fn builder() -> ScannerBuilder {
127 ScannerBuilder::new()
128 }
129
130 pub fn scan(&self, text: &str) -> Result<ScanResult> {
132 let started = Instant::now();
133 let mut findings = self.registry.scan_locale(text, &self.locale);
134 findings.retain(|finding| finding.confidence >= self.min_confidence);
135 findings = deduplicate_for_reporting(&findings);
136 findings.sort_by_key(|finding| finding.span.start);
137
138 let masked_text = self
139 .strategy
140 .as_ref()
141 .map(|strategy| apply_mask(text, &findings, strategy))
142 .transpose()?;
143
144 let stats = ScanStats::from_findings(&findings, started.elapsed().as_millis(), text.len());
145
146 Ok(ScanResult {
147 findings,
148 masked_text,
149 stats,
150 })
151 }
152}
153
154fn deduplicate_for_reporting(findings: &[PiiEntity]) -> Vec<PiiEntity> {
155 let mut sorted = findings.to_vec();
156 sorted.sort_by_key(|finding| (finding.span.start, std::cmp::Reverse(finding.span.end)));
157
158 let mut keep: Vec<PiiEntity> = Vec::with_capacity(sorted.len());
159 for finding in sorted {
160 if keep
161 .iter()
162 .any(|kept| should_preserve_nested_url_query(kept, &finding))
163 {
164 keep.push(finding);
165 continue;
166 }
167
168 if let Some(overlap_index) = keep
169 .iter()
170 .rposition(|kept| finding.span.overlaps(kept.span))
171 {
172 if should_keep_existing_url_query(&keep[overlap_index], &finding) {
173 continue;
174 }
175 if should_replace_with_url_query(&keep[overlap_index], &finding) {
176 keep[overlap_index] = finding;
177 continue;
178 }
179
180 let merged = deduplicate(&[keep[overlap_index].clone(), finding])
181 .into_iter()
182 .next()
183 .unwrap_or_else(|| keep[overlap_index].clone());
184 keep[overlap_index] = merged;
185 continue;
186 }
187 keep.push(finding);
188 }
189
190 keep
191}
192
193fn should_preserve_nested_url_query(outer: &PiiEntity, inner: &PiiEntity) -> bool {
194 outer.entity_type == EntityType::Url
195 && inner.recognizer_id.starts_with("url_query_")
196 && inner.span.start >= outer.span.start
197 && inner.span.end <= outer.span.end
198}
199
200fn should_keep_existing_url_query(existing: &PiiEntity, incoming: &PiiEntity) -> bool {
201 existing.span == incoming.span
202 && existing.entity_type == incoming.entity_type
203 && existing.recognizer_id.starts_with("url_query_")
204 && !incoming.recognizer_id.starts_with("url_query_")
205}
206
207fn should_replace_with_url_query(existing: &PiiEntity, incoming: &PiiEntity) -> bool {
208 existing.span == incoming.span
209 && existing.entity_type == incoming.entity_type
210 && !existing.recognizer_id.starts_with("url_query_")
211 && incoming.recognizer_id.starts_with("url_query_")
212}
213
/// Outcome of a single [`Scanner::scan`] call.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ScanResult {
    /// Findings that passed the confidence filter and de-duplication,
    /// ordered by span start.
    pub findings: Vec<PiiEntity>,
    /// Masked copy of the input; `None` when the scanner has no mask strategy.
    pub masked_text: Option<String>,
    /// Aggregate statistics for this scan.
    pub stats: ScanStats,
}
224
/// Aggregate statistics describing one scan.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ScanStats {
    /// Number of findings reported.
    pub total_findings: usize,
    /// Number of reported findings per entity type.
    pub findings_by_type: HashMap<EntityType, usize>,
    /// Scan duration in milliseconds, capped at `u64::MAX`.
    pub scan_duration_ms: u64,
    /// Length of the scanned input in bytes.
    pub bytes_scanned: usize,
}
237
238impl ScanStats {
239 fn from_findings(findings: &[PiiEntity], duration_ms: u128, bytes_scanned: usize) -> Self {
240 let mut findings_by_type = HashMap::new();
241 for finding in findings {
242 *findings_by_type
243 .entry(finding.entity_type.clone())
244 .or_insert(0) += 1;
245 }
246
247 Self {
248 total_findings: findings.len(),
249 findings_by_type,
250 scan_duration_ms: duration_ms.try_into().unwrap_or(u64::MAX),
251 bytes_scanned,
252 }
253 }
254}
255
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Span;

    /// Input shared by the scanning tests.
    const SAMPLE: &str = "Contact user@example.com";

    /// Minimal recognizer that reports the space-delimited token around the
    /// first `@` in the text.
    struct EmailRecognizer;

    impl Recognizer for EmailRecognizer {
        fn id(&self) -> &str {
            "email_test_v1"
        }

        fn entity_type(&self) -> EntityType {
            EntityType::Email
        }

        fn supported_locales(&self) -> &[Locale] {
            &[]
        }

        fn scan(&self, text: &str) -> Vec<PiiEntity> {
            let Some(at) = text.find('@') else {
                return Vec::new();
            };
            // The token starts right after the last space before the `@`, or
            // at the beginning of the text when there is none.
            let start = text[..at].rfind(' ').map_or(0, |space| space + 1);
            let end = match text[start..].find(' ') {
                Some(offset) => start + offset,
                None => text.len(),
            };

            let entity = PiiEntity {
                entity_type: EntityType::Email,
                span: Span::new(start, end),
                text: text[start..end].to_string(),
                confidence: Confidence::new(0.95).unwrap(),
                recognizer_id: self.id().to_string(),
            };
            vec![entity]
        }
    }

    #[test]
    fn test_scanner_builder_without_recognizers_errors() {
        let outcome = Scanner::builder().build();
        assert!(outcome.is_err());
    }

    #[test]
    fn test_scanner_scan_returns_findings_and_masked_text() {
        let builder = Scanner::builder().recognizer(EmailRecognizer);
        let scanner = builder.build().unwrap();

        let report = scanner.scan(SAMPLE).unwrap();

        assert_eq!(report.findings.len(), 1);
        assert_eq!(report.masked_text.as_deref(), Some("Contact [EMAIL]"));
    }

    #[test]
    fn test_scanner_without_masking_returns_no_masked_text() {
        let builder = Scanner::builder()
            .recognizer(EmailRecognizer)
            .without_masking();
        let scanner = builder.build().unwrap();

        let report = scanner.scan(SAMPLE).unwrap();

        assert!(report.masked_text.is_none());
    }

    #[test]
    fn test_scanner_min_confidence_filters_low_confidence_findings() {
        let builder = Scanner::builder()
            .recognizer(EmailRecognizer)
            .min_confidence(1.0)
            .unwrap();
        let scanner = builder.build().unwrap();

        let report = scanner.scan(SAMPLE).unwrap();

        assert!(report.findings.is_empty());
    }
}