Skip to main content

cloakrs_adapters/
plaintext.rs

1//! Plaintext scanning adapter.
2
3use cloakrs_core::{PiiEntity, Result, Scanner};
4use serde::{Deserialize, Serialize};
5use std::io::BufRead;
6
7/// PII findings for one line of plaintext.
8///
9/// Spans are byte offsets relative to the line content.
10#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
11pub struct LineScanResult {
12    /// One-based line number.
13    pub line_number: usize,
14    /// Findings detected on this line.
15    pub findings: Vec<PiiEntity>,
16    /// Masked line when the scanner has masking enabled.
17    pub masked_line: Option<String>,
18}
19
20/// Scans a string as plaintext, line by line.
21///
22/// # Examples
23///
24/// ```
25/// use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Scanner, Span};
26/// use cloakrs_adapters::scan_text;
27///
28/// struct Email;
29/// impl Recognizer for Email {
30///     fn id(&self) -> &str { "email_test" }
31///     fn entity_type(&self) -> EntityType { EntityType::Email }
32///     fn supported_locales(&self) -> &[Locale] { &[] }
33///     fn scan(&self, text: &str) -> Vec<PiiEntity> {
34///         text.find('@').map(|_| PiiEntity {
35///             entity_type: EntityType::Email,
36///             span: Span::new(0, text.len()),
37///             text: text.to_string(),
38///             confidence: Confidence::new(0.9).unwrap(),
39///             recognizer_id: self.id().to_string(),
40///         }).into_iter().collect()
41///     }
42/// }
43///
44/// let scanner = Scanner::builder().recognizer(Email).build().unwrap();
45/// let results = scan_text("a@b.test\nplain", &scanner).unwrap();
46/// assert_eq!(results.len(), 2);
47/// ```
48pub fn scan_text(text: &str, scanner: &Scanner) -> Result<Vec<LineScanResult>> {
49    scan_lines(std::io::Cursor::new(text), scanner)
50}
51
52/// Scans a buffered reader as plaintext without loading the whole input.
53pub fn scan_lines<R>(reader: R, scanner: &Scanner) -> Result<Vec<LineScanResult>>
54where
55    R: BufRead,
56{
57    let mut results = Vec::new();
58    for (index, line) in reader.lines().enumerate() {
59        let line = line?;
60        let scan = scanner.scan(&line)?;
61        results.push(LineScanResult {
62            line_number: index + 1,
63            findings: scan.findings,
64            masked_line: scan.masked_text,
65        });
66    }
67    Ok(results)
68}
69
#[cfg(test)]
mod tests {
    use super::*;
    use cloakrs_core::Locale;
    use cloakrs_patterns::default_registry;

    /// Builds a scanner from the default registry with the US locale selected.
    fn scanner() -> Scanner {
        let builder = default_registry()
            .into_scanner_builder()
            .locale(Locale::US);
        builder.build().unwrap()
    }

    /// Line numbers are one-based and the first and third lines each report one finding.
    #[test]
    fn test_scan_text_multiline_reports_line_numbers() {
        let input = "email jane@example.com\nplain\nssn 123-45-6789";
        let scanned = scan_text(input, &scanner()).unwrap();

        for (position, expected) in [1usize, 2, 3].iter().enumerate() {
            assert_eq!(scanned[position].line_number, *expected);
        }
        assert_eq!(scanned[0].findings.len(), 1);
        assert_eq!(scanned[2].findings.len(), 1);
    }

    /// A leading empty line still yields its own result with no findings.
    #[test]
    fn test_scan_text_empty_lines_are_preserved() {
        let input = "\nemail jane@example.com";
        let scanned = scan_text(input, &scanner()).unwrap();

        assert_eq!(scanned.len(), 2);
        assert!(scanned[0].findings.is_empty());
    }

    /// A finding placed after 12,000 filler characters is still reported.
    #[test]
    fn test_scan_text_long_line_detects_finding() {
        let input = format!("{} email jane@example.com", "a".repeat(12_000));
        let scanned = scan_text(&input, &scanner()).unwrap();

        assert_eq!(scanned.len(), 1);
        assert_eq!(scanned[0].findings.len(), 1);
    }
}