Skip to main content

sanitize_engine/processor/
csv_proc.rs

1//! CSV structured processor.
2//!
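//! Parses CSV (or TSV) input, replaces values in specified columns,
//! and writes the result back using the same delimiter. Output fields are
//! quoted only where required, so quoting present in the input is not
//! necessarily preserved.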
5//!
6//! # Column Matching
7//!
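//! Field rules match by **header name**. If the first row is a header
//! (default assumption), column names are extracted from it and matched
//! against the profile's field rules. When `has_header` is `"false"`,
//! rules are instead matched against the zero-based column index written
//! as a decimal string (e.g. `"1"` for the second column).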
11//!
12//! # Profile Options
13//!
14//! | Key          | Default | Description                            |
15//! |--------------|---------|----------------------------------------|
16//! | `delimiter`  | `","`   | Field delimiter (single ASCII char).   |
17//! | `has_header` | `"true"`| Whether the first row is a header row. |
18
19use crate::error::{Result, SanitizeError};
20use crate::processor::limits::DEFAULT_INPUT_SIZE;
21use crate::processor::{
22    find_matching_rule, pattern_matches, replace_value, FileTypeProfile, Processor,
23};
24use crate::store::MappingStore;
25
26/// Structured processor for CSV/TSV files.
27pub struct CsvProcessor;
28
29impl Processor for CsvProcessor {
30    fn name(&self) -> &'static str {
31        "csv"
32    }
33
34    fn can_handle(&self, _content: &[u8], profile: &FileTypeProfile) -> bool {
35        profile.processor == "csv"
36    }
37
38    fn process(
39        &self,
40        content: &[u8],
41        profile: &FileTypeProfile,
42        store: &MappingStore,
43    ) -> Result<Vec<u8>> {
44        // F-04 fix: enforce input size limit.
45        if content.len() > DEFAULT_INPUT_SIZE {
46            return Err(SanitizeError::InputTooLarge {
47                size: content.len(),
48                limit: DEFAULT_INPUT_SIZE,
49            });
50        }
51
52        let delimiter = profile
53            .options
54            .get("delimiter")
55            .and_then(|s| s.as_bytes().first().copied())
56            .unwrap_or(b',');
57
58        let has_header = profile
59            .options
60            .get("has_header")
61            .map_or(true, |v| v != "false");
62
63        let mut reader = csv::ReaderBuilder::new()
64            .delimiter(delimiter)
65            .has_headers(has_header)
66            .flexible(true)
67            .from_reader(content);
68
69        let mut output = Vec::new();
70        let mut wtr = csv::WriterBuilder::new()
71            .delimiter(delimiter)
72            .from_writer(&mut output);
73
74        // Determine which column indices need replacement.
75        let column_rules: Vec<Option<usize>> = if has_header {
76            let headers = reader
77                .headers()
78                .map_err(|e| SanitizeError::ParseError {
79                    format: "CSV".into(),
80                    message: format!("CSV header error: {}", e),
81                })?
82                .clone();
83
84            // Write header row.
85            wtr.write_record(headers.iter()).map_err(|e| {
86                SanitizeError::IoError(std::io::Error::other(format!("CSV write error: {e}")))
87            })?;
88
89            // Map each column index to the index of its first matching rule (if any).
90            // Uses pattern_matches directly to avoid allocating a temporary
91            // FileTypeProfile for every (header, rule) pair.
92            headers
93                .iter()
94                .map(|h| {
95                    profile
96                        .fields
97                        .iter()
98                        .position(|r| pattern_matches(&r.pattern, h))
99                })
100                .collect()
101        } else {
102            Vec::new()
103        };
104
105        for result in reader.records() {
106            let record = result.map_err(|e| SanitizeError::ParseError {
107                format: "CSV".into(),
108                message: format!("CSV read error: {}", e),
109            })?;
110
111            let mut row: Vec<String> = Vec::with_capacity(record.len());
112            for (idx, field) in record.iter().enumerate() {
113                if has_header {
114                    if let Some(Some(rule_idx)) = column_rules.get(idx) {
115                        let rule = &profile.fields[*rule_idx];
116                        let replaced = replace_value(field, rule, store)?;
117                        row.push(replaced);
118                    } else {
119                        row.push(field.to_string());
120                    }
121                } else {
122                    // Without headers, match by column index as string.
123                    let col_key = idx.to_string();
124                    if let Some(rule) = find_matching_rule(&col_key, profile) {
125                        let replaced = replace_value(field, rule, store)?;
126                        row.push(replaced);
127                    } else {
128                        row.push(field.to_string());
129                    }
130                }
131            }
132
133            wtr.write_record(&row).map_err(|e| {
134                SanitizeError::IoError(std::io::Error::other(format!("CSV write error: {e}")))
135            })?;
136        }
137
138        wtr.flush().map_err(|e| {
139            SanitizeError::IoError(std::io::Error::other(format!("CSV flush error: {e}")))
140        })?;
141        drop(wtr);
142
143        Ok(output)
144    }
145}
146
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Builds a mapping store backed by an HMAC generator with a fixed key.
    fn make_store() -> MappingStore {
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(generator, None)
    }

    /// Runs the processor over `input` with a fresh store and returns the
    /// output decoded as UTF-8.
    fn run(input: &[u8], profile: &FileTypeProfile) -> String {
        let store = make_store();
        let bytes = CsvProcessor.process(input, profile, &store).unwrap();
        String::from_utf8(bytes).unwrap()
    }

    /// Profile with a single rule: the `email` column, categorised as email.
    fn email_profile() -> FileTypeProfile {
        FileTypeProfile::new(
            "csv",
            vec![FieldRule::new("email").with_category(Category::Email)],
        )
    }

    #[test]
    fn basic_csv_replacement() {
        let rules = vec![
            FieldRule::new("name").with_category(Category::Name),
            FieldRule::new("email").with_category(Category::Email),
        ];
        let profile = FileTypeProfile::new("csv", rules);

        let out = run(
            b"name,email,department\nAlice,alice@corp.com,Engineering\nBob,bob@corp.com,Sales\n",
            &profile,
        );

        // Values in the matched columns are gone.
        for secret in ["Alice", "alice@corp.com", "Bob", "bob@corp.com"] {
            assert!(!out.contains(secret));
        }
        // Department column preserved.
        for kept in ["Engineering", "Sales"] {
            assert!(out.contains(kept));
        }
        // Header preserved.
        assert!(out.starts_with("name,email,department"));
    }

    #[test]
    fn can_handle_requires_csv_profile() {
        let sample = b"a,b,c\n1,2,3\n";
        let csv_profile = FileTypeProfile::new("csv", vec![]).with_extension(".csv");
        let json_profile = FileTypeProfile::new("json", vec![]).with_extension(".json");

        assert!(CsvProcessor.can_handle(sample, &csv_profile));
        assert!(!CsvProcessor.can_handle(sample, &json_profile));
    }

    #[test]
    fn tsv_delimiter() {
        let mut profile = email_profile();
        profile.options.insert("delimiter".into(), "\t".into());

        let out = run(b"name\temail\nAlice\talice@corp.com\n", &profile);

        assert!(!out.contains("alice@corp.com"));
        assert!(out.contains("Alice"));
    }

    #[test]
    fn no_header_mode_matches_by_column_index() {
        // Column 1 (0-indexed) should be replaced.
        let mut profile = FileTypeProfile::new(
            "csv",
            vec![FieldRule::new("1").with_category(Category::Email)],
        );
        profile.options.insert("has_header".into(), "false".into());

        let out = run(b"Alice,alice@corp.com,Engineering\n", &profile);

        assert!(!out.contains("alice@corp.com"));
        assert!(out.contains("Alice"));
        assert!(out.contains("Engineering"));
    }

    #[test]
    fn header_only_no_data_rows() {
        let out = run(b"name,email,department\n", &email_profile());
        assert!(out.contains("name,email,department"));
    }

    #[test]
    fn empty_field_passes_through() {
        let out = run(b"email\n\nalice@corp.com\n", &email_profile());
        assert!(!out.contains("alice@corp.com"));
    }

    #[test]
    fn unmatched_columns_pass_through_unchanged() {
        let out = run(b"id,email\n42,alice@corp.com\n", &email_profile());
        assert!(out.contains("42"), "id column must be preserved");
        assert!(!out.contains("alice@corp.com"));
    }

    #[test]
    fn csv_deterministic_replacement() {
        let out = run(b"email\ntest@x.com\ntest@x.com\n", &email_profile());
        let rows: Vec<&str> = out.lines().collect();

        // Same input → same replacement.
        assert_eq!(rows[1], rows[2]);
        assert_ne!(rows[1], "test@x.com");
    }
}