Skip to main content

cloakrs_core/
scanner.rs

1//! Scanner orchestration.
2
3use crate::masker::deduplicate;
4use crate::{
5    apply_mask, CloakError, Confidence, EntityType, Locale, MaskStrategy, PiiEntity, Recognizer,
6    RecognizerRegistry, Result,
7};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::time::Instant;
11
12/// Builder for configuring a [`Scanner`].
13///
14/// # Examples
15///
16/// ```
17/// use cloakrs_core::{Locale, Scanner};
18///
19/// let scanner = Scanner::builder().locale(Locale::US).build();
20/// assert!(scanner.is_err());
21/// ```
22pub struct ScannerBuilder {
23    registry: RecognizerRegistry,
24    locale: Locale,
25    strategy: Option<MaskStrategy>,
26    min_confidence: Confidence,
27}
28
29impl Default for ScannerBuilder {
30    fn default() -> Self {
31        Self {
32            registry: RecognizerRegistry::new(),
33            locale: Locale::Universal,
34            strategy: Some(MaskStrategy::default()),
35            min_confidence: Confidence::ZERO,
36        }
37    }
38}
39
40impl ScannerBuilder {
41    /// Creates a scanner builder.
42    #[must_use]
43    pub fn new() -> Self {
44        Self::default()
45    }
46
47    /// Creates a scanner builder from an existing recognizer registry.
48    #[must_use]
49    pub fn from_registry(registry: RecognizerRegistry) -> Self {
50        Self {
51            registry,
52            ..Self::default()
53        }
54    }
55
56    /// Sets the scanner locale.
57    #[must_use]
58    pub fn locale(mut self, locale: Locale) -> Self {
59        self.locale = locale;
60        self
61    }
62
63    /// Sets the masking strategy.
64    #[must_use]
65    pub fn strategy(mut self, strategy: MaskStrategy) -> Self {
66        self.strategy = Some(strategy);
67        self
68    }
69
70    /// Disables masked text generation.
71    #[must_use]
72    pub fn without_masking(mut self) -> Self {
73        self.strategy = None;
74        self
75    }
76
77    /// Adds a recognizer.
78    #[must_use]
79    pub fn recognizer<R>(mut self, recognizer: R) -> Self
80    where
81        R: Recognizer + 'static,
82    {
83        self.registry.register(recognizer);
84        self
85    }
86
87    /// Adds a boxed recognizer.
88    #[must_use]
89    pub fn boxed_recognizer(mut self, recognizer: Box<dyn Recognizer>) -> Self {
90        self.registry.register_boxed(recognizer);
91        self
92    }
93
94    /// Sets the minimum confidence threshold.
95    pub fn min_confidence(mut self, confidence: f64) -> Result<Self> {
96        self.min_confidence = Confidence::new(confidence)?;
97        Ok(self)
98    }
99
100    /// Builds a scanner.
101    pub fn build(self) -> Result<Scanner> {
102        if self.registry.is_empty() {
103            return Err(CloakError::NoRecognizers);
104        }
105
106        Ok(Scanner {
107            registry: self.registry,
108            locale: self.locale,
109            strategy: self.strategy,
110            min_confidence: self.min_confidence,
111        })
112    }
113}
114
115/// The main scanner that ties detection and masking together.
116pub struct Scanner {
117    registry: RecognizerRegistry,
118    locale: Locale,
119    strategy: Option<MaskStrategy>,
120    min_confidence: Confidence,
121}
122
123impl Scanner {
124    /// Creates a scanner builder.
125    #[must_use]
126    pub fn builder() -> ScannerBuilder {
127        ScannerBuilder::new()
128    }
129
130    /// Scans text and returns findings, optional masked text, and stats.
131    pub fn scan(&self, text: &str) -> Result<ScanResult> {
132        let started = Instant::now();
133        let mut findings = self.registry.scan_locale(text, &self.locale);
134        findings.retain(|finding| finding.confidence >= self.min_confidence);
135        findings = deduplicate_for_reporting(&findings);
136        findings.sort_by_key(|finding| finding.span.start);
137
138        let masked_text = self
139            .strategy
140            .as_ref()
141            .map(|strategy| apply_mask(text, &findings, strategy))
142            .transpose()?;
143
144        let stats = ScanStats::from_findings(&findings, started.elapsed().as_millis(), text.len());
145
146        Ok(ScanResult {
147            findings,
148            masked_text,
149            stats,
150        })
151    }
152}
153
154fn deduplicate_for_reporting(findings: &[PiiEntity]) -> Vec<PiiEntity> {
155    let mut sorted = findings.to_vec();
156    sorted.sort_by_key(|finding| (finding.span.start, std::cmp::Reverse(finding.span.end)));
157
158    let mut keep: Vec<PiiEntity> = Vec::with_capacity(sorted.len());
159    for finding in sorted {
160        if keep
161            .iter()
162            .any(|kept| should_preserve_nested_url_query(kept, &finding))
163        {
164            keep.push(finding);
165            continue;
166        }
167
168        if let Some(overlap_index) = keep
169            .iter()
170            .rposition(|kept| finding.span.overlaps(kept.span))
171        {
172            if should_keep_existing_url_query(&keep[overlap_index], &finding) {
173                continue;
174            }
175            if should_replace_with_url_query(&keep[overlap_index], &finding) {
176                keep[overlap_index] = finding;
177                continue;
178            }
179
180            let merged = deduplicate(&[keep[overlap_index].clone(), finding])
181                .into_iter()
182                .next()
183                .unwrap_or_else(|| keep[overlap_index].clone());
184            keep[overlap_index] = merged;
185            continue;
186        }
187        keep.push(finding);
188    }
189
190    keep
191}
192
193fn should_preserve_nested_url_query(outer: &PiiEntity, inner: &PiiEntity) -> bool {
194    outer.entity_type == EntityType::Url
195        && inner.recognizer_id.starts_with("url_query_")
196        && inner.span.start >= outer.span.start
197        && inner.span.end <= outer.span.end
198}
199
200fn should_keep_existing_url_query(existing: &PiiEntity, incoming: &PiiEntity) -> bool {
201    existing.span == incoming.span
202        && existing.entity_type == incoming.entity_type
203        && existing.recognizer_id.starts_with("url_query_")
204        && !incoming.recognizer_id.starts_with("url_query_")
205}
206
207fn should_replace_with_url_query(existing: &PiiEntity, incoming: &PiiEntity) -> bool {
208    existing.span == incoming.span
209        && existing.entity_type == incoming.entity_type
210        && !existing.recognizer_id.starts_with("url_query_")
211        && incoming.recognizer_id.starts_with("url_query_")
212}
213
214/// Result of scanning text.
215#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
216pub struct ScanResult {
217    /// All PII entities found.
218    pub findings: Vec<PiiEntity>,
219    /// Text with PII masked, when masking is enabled.
220    pub masked_text: Option<String>,
221    /// Statistics about the scan.
222    pub stats: ScanStats,
223}
224
225/// Statistics about a scan.
226#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
227pub struct ScanStats {
228    /// Total number of findings.
229    pub total_findings: usize,
230    /// Count of findings per entity type.
231    pub findings_by_type: HashMap<EntityType, usize>,
232    /// Wall-clock scan duration in milliseconds.
233    pub scan_duration_ms: u64,
234    /// Number of input bytes scanned.
235    pub bytes_scanned: usize,
236}
237
238impl ScanStats {
239    fn from_findings(findings: &[PiiEntity], duration_ms: u128, bytes_scanned: usize) -> Self {
240        let mut findings_by_type = HashMap::new();
241        for finding in findings {
242            *findings_by_type
243                .entry(finding.entity_type.clone())
244                .or_insert(0) += 1;
245        }
246
247        Self {
248            total_findings: findings.len(),
249            findings_by_type,
250            scan_duration_ms: duration_ms.try_into().unwrap_or(u64::MAX),
251            bytes_scanned,
252        }
253    }
254}
255
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Span;

    /// Input shared by the scanning tests.
    const SAMPLE: &str = "Contact user@example.com";

    /// Minimal recognizer: reports the space-delimited token around the first `@`.
    struct EmailRecognizer;

    impl Recognizer for EmailRecognizer {
        fn id(&self) -> &str {
            "email_test_v1"
        }

        fn entity_type(&self) -> EntityType {
            EntityType::Email
        }

        fn supported_locales(&self) -> &[Locale] {
            &[]
        }

        fn scan(&self, text: &str) -> Vec<PiiEntity> {
            let Some(at) = text.find('@') else {
                return Vec::new();
            };
            // The token starts after the last space before `@`, or at 0.
            let start = text[..at].rfind(' ').map_or(0, |space| space + 1);
            // The token ends at the next space, or at the end of the text.
            let end = text[start..]
                .find(' ')
                .map_or(text.len(), |offset| start + offset);

            let entity = PiiEntity {
                entity_type: EntityType::Email,
                span: Span::new(start, end),
                text: text[start..end].to_string(),
                confidence: Confidence::new(0.95).unwrap(),
                recognizer_id: self.id().to_string(),
            };
            vec![entity]
        }
    }

    /// Builder preloaded with the test email recognizer.
    fn email_builder() -> ScannerBuilder {
        Scanner::builder().recognizer(EmailRecognizer)
    }

    #[test]
    fn test_scanner_builder_without_recognizers_errors() {
        let outcome = Scanner::builder().build();
        assert!(outcome.is_err());
    }

    #[test]
    fn test_scanner_scan_returns_findings_and_masked_text() {
        let scanner = email_builder().build().unwrap();
        let result = scanner.scan(SAMPLE).unwrap();
        assert_eq!(result.findings.len(), 1);
        assert_eq!(result.masked_text.as_deref(), Some("Contact [EMAIL]"));
    }

    #[test]
    fn test_scanner_without_masking_returns_no_masked_text() {
        let scanner = email_builder().without_masking().build().unwrap();
        let result = scanner.scan(SAMPLE).unwrap();
        assert!(result.masked_text.is_none());
    }

    #[test]
    fn test_scanner_min_confidence_filters_low_confidence_findings() {
        let scanner = email_builder()
            .min_confidence(1.0)
            .unwrap()
            .build()
            .unwrap();
        let result = scanner.scan(SAMPLE).unwrap();
        assert!(result.findings.is_empty());
    }
}