Skip to main content

sanitize_engine/processor/
csv_proc.rs

1//! CSV structured processor.
2//!
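//! Parses CSV (or TSV) input, replaces values in the columns selected by the
//! profile's field rules, and writes the rows back using the same delimiter.
//! Rows are re-serialized by the `csv` writer with its default quoting, so the
//! original quoting of individual fields is not necessarily preserved verbatim.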
5//!
6//! # Column Matching
7//!
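//! Field rules match by **header name**. If the first row is a header
//! (default assumption), column names are extracted from it and matched
//! against the profile's field rules. When `has_header` is `"false"`, rules
//! are matched against the zero-based column index rendered as a string
//! (for example `"0"` for the first column).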
11//!
12//! # Profile Options
13//!
14//! | Key          | Default | Description                            |
15//! |--------------|---------|----------------------------------------|
16//! | `delimiter`  | `","`   | Field delimiter (single ASCII char).   |
17//! | `has_header` | `"true"`| Whether the first row is a header row. |
18
19use crate::error::{Result, SanitizeError};
20use crate::processor::{find_matching_rule, pattern_matches, replace_value, FileTypeProfile, Processor};
21use crate::store::MappingStore;
22
/// Upper bound, in bytes, on the input accepted for CSV processing (F-04 fix).
///
/// Inputs longer than this are rejected before any parsing takes place.
const MAX_CSV_INPUT_SIZE: usize = 256 << 20; // 256 MiB
25
/// Structured processor for CSV/TSV files.
///
/// The type carries no state; all configuration (delimiter, header handling,
/// field rules) is taken from the profile passed to each call.
#[derive(Debug, Clone, Copy, Default)]
pub struct CsvProcessor;
28
29impl Processor for CsvProcessor {
30    fn name(&self) -> &'static str {
31        "csv"
32    }
33
34    fn can_handle(&self, _content: &[u8], profile: &FileTypeProfile) -> bool {
35        profile.processor == "csv"
36    }
37
38    fn process(
39        &self,
40        content: &[u8],
41        profile: &FileTypeProfile,
42        store: &MappingStore,
43    ) -> Result<Vec<u8>> {
44        // F-04 fix: enforce input size limit.
45        if content.len() > MAX_CSV_INPUT_SIZE {
46            return Err(SanitizeError::InputTooLarge {
47                size: content.len(),
48                limit: MAX_CSV_INPUT_SIZE,
49            });
50        }
51
52        let delimiter = profile
53            .options
54            .get("delimiter")
55            .and_then(|s| s.as_bytes().first().copied())
56            .unwrap_or(b',');
57
58        let has_header = profile
59            .options
60            .get("has_header")
61            .map_or(true, |v| v != "false");
62
63        let mut reader = csv::ReaderBuilder::new()
64            .delimiter(delimiter)
65            .has_headers(has_header)
66            .flexible(true)
67            .from_reader(content);
68
69        let mut output = Vec::new();
70        let mut wtr = csv::WriterBuilder::new()
71            .delimiter(delimiter)
72            .from_writer(&mut output);
73
74        // Determine which column indices need replacement.
75        let column_rules: Vec<Option<usize>> = if has_header {
76            let headers = reader
77                .headers()
78                .map_err(|e| SanitizeError::ParseError {
79                    format: "CSV".into(),
80                    message: format!("CSV header error: {}", e),
81                })?
82                .clone();
83
84            // Write header row.
85            wtr.write_record(headers.iter())
86                .map_err(|e| SanitizeError::IoError(format!("CSV write error: {}", e)))?;
87
88            // Map each column index to the index of its first matching rule (if any).
89            // Uses pattern_matches directly to avoid allocating a temporary
90            // FileTypeProfile for every (header, rule) pair.
91            headers
92                .iter()
93                .map(|h| {
94                    profile
95                        .fields
96                        .iter()
97                        .position(|r| pattern_matches(&r.pattern, h))
98                })
99                .collect()
100        } else {
101            Vec::new()
102        };
103
104        for result in reader.records() {
105            let record = result.map_err(|e| SanitizeError::ParseError {
106                format: "CSV".into(),
107                message: format!("CSV read error: {}", e),
108            })?;
109
110            let mut row: Vec<String> = Vec::with_capacity(record.len());
111            for (idx, field) in record.iter().enumerate() {
112                if has_header {
113                    if let Some(Some(rule_idx)) = column_rules.get(idx) {
114                        let rule = &profile.fields[*rule_idx];
115                        let replaced = replace_value(field, rule, store)?;
116                        row.push(replaced);
117                    } else {
118                        row.push(field.to_string());
119                    }
120                } else {
121                    // Without headers, match by column index as string.
122                    let col_key = idx.to_string();
123                    if let Some(rule) = find_matching_rule(&col_key, profile) {
124                        let replaced = replace_value(field, rule, store)?;
125                        row.push(replaced);
126                    } else {
127                        row.push(field.to_string());
128                    }
129                }
130            }
131
132            wtr.write_record(&row)
133                .map_err(|e| SanitizeError::IoError(format!("CSV write error: {}", e)))?;
134        }
135
136        wtr.flush()
137            .map_err(|e| SanitizeError::IoError(format!("CSV flush error: {}", e)))?;
138        drop(wtr);
139
140        Ok(output)
141    }
142}
143
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Builds a mapping store backed by an HMAC generator with a fixed key.
    fn make_store() -> MappingStore {
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(generator, None)
    }

    /// Runs `CsvProcessor` over `content` with a `"csv"` profile built from
    /// `rules` and returns the output decoded as UTF-8.
    fn sanitize(content: &[u8], rules: Vec<FieldRule>) -> String {
        let store = make_store();
        let profile = FileTypeProfile::new("csv", rules);
        let bytes = CsvProcessor.process(content, &profile, &store).unwrap();
        String::from_utf8(bytes).unwrap()
    }

    #[test]
    fn basic_csv_replacement() {
        let out = sanitize(
            b"name,email,department\nAlice,alice@corp.com,Engineering\nBob,bob@corp.com,Sales\n",
            vec![
                FieldRule::new("name").with_category(Category::Name),
                FieldRule::new("email").with_category(Category::Email),
            ],
        );

        // Values in the matched columns must be gone.
        for secret in ["Alice", "alice@corp.com", "Bob", "bob@corp.com"] {
            assert!(!out.contains(secret));
        }
        // The unmatched department column is left alone.
        for kept in ["Engineering", "Sales"] {
            assert!(out.contains(kept));
        }
        // The header row is passed through.
        assert!(out.starts_with("name,email,department"));
    }

    #[test]
    fn csv_deterministic_replacement() {
        let out = sanitize(
            b"email\ntest@x.com\ntest@x.com\n",
            vec![FieldRule::new("email").with_category(Category::Email)],
        );
        let rows: Vec<&str> = out.lines().collect();

        // Identical inputs map to identical replacements, and the
        // replacement differs from the original value.
        assert_eq!(rows[1], rows[2]);
        assert_ne!(rows[1], "test@x.com");
    }
}