Skip to main content

cloakrs_adapters/
csv.rs

1//! CSV adapter for column-aware scanning.
2
3use cloakrs_core::{PiiEntity, Result, Scanner};
4use serde::{Deserialize, Serialize};
5use std::collections::HashSet;
6use std::io::{Read, Write};
7
/// Options controlling which CSV columns are scanned.
///
/// If `columns` and `column_indexes` together resolve to no column at all,
/// the selection is treated as empty and every column is scanned.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct CsvScanOptions {
    /// Whether the first record contains headers.
    ///
    /// When `true`, the header record is copied to the output unchanged and
    /// is used to resolve `columns`.
    pub has_headers: bool,
    /// Column names to scan when headers are present.
    ///
    /// Names are compared to header cells by exact string equality; a name
    /// with no matching header is skipped. Ignored when there is no header
    /// record.
    pub columns: Vec<String>,
    /// Zero-based column indexes to scan.
    pub column_indexes: Vec<usize>,
    /// CSV delimiter byte.
    ///
    /// A value of `0` (what `Default` produces) is treated as a comma.
    pub delimiter: u8,
}
20
21impl CsvScanOptions {
22    fn delimiter(&self) -> u8 {
23        if self.delimiter == 0 {
24            b','
25        } else {
26            self.delimiter
27        }
28    }
29}
30
/// PII findings for one CSV cell.
///
/// A value of this type is only produced for a cell in which the scanner
/// reported at least one finding.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct CsvCellScanResult {
    /// One-based data row number. Header row is not counted.
    pub row_number: usize,
    /// Zero-based column index.
    pub column_index: usize,
    /// Header name when available.
    ///
    /// `None` when the input has no header record or the header record has
    /// no cell at `column_index`.
    pub column_name: Option<String>,
    /// Findings detected in this cell.
    pub findings: Vec<PiiEntity>,
    /// Masked cell value when masking is enabled.
    ///
    /// Copied from the scanner's masked text for this cell; when present it
    /// is also the value written to the masked CSV output.
    pub masked_value: Option<String>,
}
45
/// Result of scanning CSV data.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct CsvScanResult {
    /// Findings grouped by CSV cell.
    ///
    /// Entries appear in row order, and in column order within a row.
    pub cells: Vec<CsvCellScanResult>,
    /// Masked CSV data.
    ///
    /// Contains every input record (including the header record when headers
    /// are enabled), with masked text substituted into the affected cells.
    pub masked_csv: String,
}
54
55/// Scans CSV text and returns column-aware findings.
56pub fn scan_csv_str(
57    input: &str,
58    scanner: &Scanner,
59    options: &CsvScanOptions,
60) -> Result<CsvScanResult> {
61    let mut output = Vec::new();
62    let cells = mask_csv_reader(input.as_bytes(), &mut output, scanner, options)?;
63    let masked_csv = String::from_utf8(output)
64        .map_err(|error| cloakrs_core::CloakError::ConfigError(error.to_string()))?;
65    Ok(CsvScanResult { cells, masked_csv })
66}
67
68/// Streams CSV from a reader to a writer while masking selected cells.
69pub fn mask_csv_reader<R, W>(
70    reader: R,
71    writer: W,
72    scanner: &Scanner,
73    options: &CsvScanOptions,
74) -> Result<Vec<CsvCellScanResult>>
75where
76    R: Read,
77    W: Write,
78{
79    let mut csv_reader = csv::ReaderBuilder::new()
80        .has_headers(options.has_headers)
81        .delimiter(options.delimiter())
82        .from_reader(reader);
83    let mut csv_writer = csv::WriterBuilder::new()
84        .has_headers(false)
85        .delimiter(options.delimiter())
86        .from_writer(writer);
87
88    let headers = if options.has_headers {
89        let headers = csv_reader.headers()?.clone();
90        csv_writer.write_record(&headers)?;
91        Some(headers)
92    } else {
93        None
94    };
95    let selected = selected_indexes(headers.as_ref(), options);
96    let mut cells = Vec::new();
97
98    for (row_index, record) in csv_reader.records().enumerate() {
99        let record = record?;
100        let mut masked_record: Vec<String> = record.iter().map(str::to_string).collect();
101        for (column_index, value) in record.iter().enumerate() {
102            if !selected.is_empty() && !selected.contains(&column_index) {
103                continue;
104            }
105            let scan = scanner.scan(value)?;
106            if scan.findings.is_empty() {
107                continue;
108            }
109            let masked_value = scan.masked_text.clone();
110            if let Some(masked_value) = &masked_value {
111                masked_record[column_index] = masked_value.clone();
112            }
113            cells.push(CsvCellScanResult {
114                row_number: row_index + 1,
115                column_index,
116                column_name: headers
117                    .as_ref()
118                    .and_then(|headers| headers.get(column_index))
119                    .map(str::to_string),
120                findings: scan.findings,
121                masked_value,
122            });
123        }
124        csv_writer.write_record(masked_record)?;
125    }
126    csv_writer.flush()?;
127    Ok(cells)
128}
129
130fn selected_indexes(
131    headers: Option<&csv::StringRecord>,
132    options: &CsvScanOptions,
133) -> HashSet<usize> {
134    let mut selected: HashSet<usize> = options.column_indexes.iter().copied().collect();
135    if let Some(headers) = headers {
136        for column in &options.columns {
137            if let Some(index) = headers.iter().position(|header| header == column) {
138                selected.insert(index);
139            }
140        }
141    }
142    selected
143}
144
#[cfg(test)]
mod tests {
    use super::*;
    use cloakrs_core::Locale;
    use cloakrs_patterns::default_registry;

    /// Builds a scanner from the default pattern registry with the US locale.
    fn scanner() -> Scanner {
        let builder = default_registry().into_scanner_builder().locale(Locale::US);
        builder.build().unwrap()
    }

    /// Assembles scan options for a test case from borrowed slices.
    fn options(
        has_headers: bool,
        columns: &[&str],
        column_indexes: &[usize],
        delimiter: u8,
    ) -> CsvScanOptions {
        CsvScanOptions {
            has_headers,
            columns: columns.iter().map(|name| (*name).to_string()).collect(),
            column_indexes: column_indexes.to_vec(),
            delimiter,
        }
    }

    #[test]
    fn test_scan_csv_str_with_headers_scans_named_column() {
        let input = "name,email\nJane,jane@example.com\n";
        let options = options(true, &["email"], &[], b',');

        let result = scan_csv_str(input, &scanner(), &options).unwrap();

        assert_eq!(result.cells.len(), 1);
        assert_eq!(result.cells[0].column_name.as_deref(), Some("email"));
        assert!(result.masked_csv.contains("[EMAIL]"));
    }

    #[test]
    fn test_scan_csv_str_without_headers_scans_index() {
        let input = "Jane,jane@example.com\n";
        let options = options(false, &[], &[1], b',');

        let result = scan_csv_str(input, &scanner(), &options).unwrap();

        assert_eq!(result.cells[0].row_number, 1);
        assert_eq!(result.cells[0].column_index, 1);
    }

    #[test]
    fn test_scan_csv_str_empty_selection_scans_all_columns() {
        let input = "name,email,phone\nJane,jane@example.com,+1 (555) 123-4567\n";
        // No names and no indexes: every column is eligible for scanning.
        let options = options(true, &[], &[], b',');

        let result = scan_csv_str(input, &scanner(), &options).unwrap();

        assert_eq!(result.cells.len(), 2);
    }

    #[test]
    fn test_scan_csv_str_semicolon_delimiter() {
        let input = "name;email\nJane;jane@example.com\n";
        let options = options(true, &["email"], &[], b';');

        let result = scan_csv_str(input, &scanner(), &options).unwrap();

        assert!(result.masked_csv.contains("[EMAIL]"));
    }

    #[test]
    fn test_scan_csv_str_quoted_multiline_field() {
        // The quoted field spans two physical lines but is a single CSV cell.
        let input = "notes\n\"hello\nemail jane@example.com\"\n";
        let options = options(true, &["notes"], &[], b',');

        let result = scan_csv_str(input, &scanner(), &options).unwrap();

        assert_eq!(result.cells.len(), 1);
    }
}