Skip to main content

sanitize_engine/processor/
csv_proc.rs

1//! CSV structured processor.
2//!
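//! Parses CSV (or TSV) input, replaces values in specified columns,
//! and writes the result back using the same delimiter. The output is
//! re-encoded by the CSV writer, so fields are quoted only where needed
//! rather than exactly as they were quoted in the input.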
5//!
6//! # Column Matching
7//!
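//! Field rules match by **header name**. If the first row is a header
//! (default assumption), column names are extracted from it and matched
//! against the profile's field rules. When `has_header` is `"false"`,
//! rules are instead matched against the zero-based column index rendered
//! as a string (`"0"`, `"1"`, ...).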
11//!
12//! # Profile Options
13//!
14//! | Key          | Default | Description                            |
15//! |--------------|---------|----------------------------------------|
16//! | `delimiter`  | `","`   | Field delimiter (single ASCII char).   |
17//! | `has_header` | `"true"`| Whether the first row is a header row. |
18
19use crate::error::{Result, SanitizeError};
20use crate::processor::{find_matching_rule, replace_value, FileTypeProfile, Processor};
21use crate::store::MappingStore;
22
/// Upper bound, in bytes, on the input accepted by the CSV processor (F-04 fix).
///
/// Inputs longer than this are rejected before any parsing takes place.
const MAX_CSV_INPUT_SIZE: usize = 256 << 20; // 256 MiB
25
/// Structured processor for CSV/TSV files.
///
/// This is a stateless unit type: all configuration (delimiter, header
/// handling, field rules) is read from the profile passed to each call.
#[derive(Debug, Clone, Copy, Default)]
pub struct CsvProcessor;
28
29impl Processor for CsvProcessor {
30    fn name(&self) -> &'static str {
31        "csv"
32    }
33
34    fn can_handle(&self, _content: &[u8], profile: &FileTypeProfile) -> bool {
35        profile.processor == "csv"
36    }
37
38    fn process(
39        &self,
40        content: &[u8],
41        profile: &FileTypeProfile,
42        store: &MappingStore,
43    ) -> Result<Vec<u8>> {
44        // F-04 fix: enforce input size limit.
45        if content.len() > MAX_CSV_INPUT_SIZE {
46            return Err(SanitizeError::InputTooLarge {
47                size: content.len(),
48                limit: MAX_CSV_INPUT_SIZE,
49            });
50        }
51
52        let delimiter = profile
53            .options
54            .get("delimiter")
55            .and_then(|s| s.as_bytes().first().copied())
56            .unwrap_or(b',');
57
58        let has_header = profile
59            .options
60            .get("has_header")
61            .map_or(true, |v| v != "false");
62
63        let mut reader = csv::ReaderBuilder::new()
64            .delimiter(delimiter)
65            .has_headers(has_header)
66            .flexible(true)
67            .from_reader(content);
68
69        let mut output = Vec::new();
70        let mut wtr = csv::WriterBuilder::new()
71            .delimiter(delimiter)
72            .from_writer(&mut output);
73
74        // Determine which column indices need replacement.
75        let column_rules: Vec<Option<usize>> = if has_header {
76            let headers = reader
77                .headers()
78                .map_err(|e| SanitizeError::ParseError {
79                    format: "CSV".into(),
80                    message: format!("CSV header error: {}", e),
81                })?
82                .clone();
83
84            // Write header row.
85            wtr.write_record(headers.iter())
86                .map_err(|e| SanitizeError::IoError(format!("CSV write error: {}", e)))?;
87
88            // Map each column index to a matching rule index (if any).
89            headers
90                .iter()
91                .map(|h| {
92                    profile.fields.iter().position(|r| {
93                        find_matching_rule(h, &FileTypeProfile::new("csv", vec![r.clone()]))
94                            .is_some()
95                    })
96                })
97                .collect()
98        } else {
99            Vec::new()
100        };
101
102        for result in reader.records() {
103            let record = result.map_err(|e| SanitizeError::ParseError {
104                format: "CSV".into(),
105                message: format!("CSV read error: {}", e),
106            })?;
107
108            let mut row: Vec<String> = Vec::with_capacity(record.len());
109            for (idx, field) in record.iter().enumerate() {
110                if has_header {
111                    if let Some(Some(rule_idx)) = column_rules.get(idx) {
112                        let rule = &profile.fields[*rule_idx];
113                        let replaced = replace_value(field, rule, store)?;
114                        row.push(replaced);
115                    } else {
116                        row.push(field.to_string());
117                    }
118                } else {
119                    // Without headers, match by column index as string.
120                    let col_key = idx.to_string();
121                    if let Some(rule) = find_matching_rule(&col_key, profile) {
122                        let replaced = replace_value(field, rule, store)?;
123                        row.push(replaced);
124                    } else {
125                        row.push(field.to_string());
126                    }
127                }
128            }
129
130            wtr.write_record(&row)
131                .map_err(|e| SanitizeError::IoError(format!("CSV write error: {}", e)))?;
132        }
133
134        wtr.flush()
135            .map_err(|e| SanitizeError::IoError(format!("CSV flush error: {}", e)))?;
136        drop(wtr);
137
138        Ok(output)
139    }
140}
141
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Builds a mapping store around an HMAC generator constructed from a
    /// constant 32-byte array.
    fn make_store() -> MappingStore {
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(generator, None)
    }

    /// Values in rule-matched columns are replaced; the header row and
    /// unmatched columns pass through untouched.
    #[test]
    fn basic_csv_replacement() {
        let input: &[u8] =
            b"name,email,department\nAlice,alice@corp.com,Engineering\nBob,bob@corp.com,Sales\n";
        let rules = vec![
            FieldRule::new("name").with_category(Category::Name),
            FieldRule::new("email").with_category(Category::Email),
        ];
        let profile = FileTypeProfile::new("csv", rules);
        let store = make_store();

        let bytes = CsvProcessor.process(input, &profile, &store).unwrap();
        let text = String::from_utf8(bytes).unwrap();

        // Every original value from a matched column must be gone.
        for original in ["Alice", "alice@corp.com", "Bob", "bob@corp.com"] {
            assert!(!text.contains(original));
        }
        // Department column preserved.
        for kept in ["Engineering", "Sales"] {
            assert!(text.contains(kept));
        }
        // Header preserved.
        assert!(text.starts_with("name,email,department"));
    }

    /// Identical input values yield identical replacements within one run.
    #[test]
    fn csv_deterministic_replacement() {
        let profile = FileTypeProfile::new(
            "csv",
            vec![FieldRule::new("email").with_category(Category::Email)],
        );
        let store = make_store();
        let input = b"email\ntest@x.com\ntest@x.com\n";

        let bytes = CsvProcessor.process(input, &profile, &store).unwrap();
        let text = String::from_utf8(bytes).unwrap();
        let rows: Vec<&str> = text.lines().collect();
        let (first, second) = (rows[1], rows[2]);

        // Same input → same replacement.
        assert_eq!(first, second);
        assert_ne!(first, "test@x.com");
    }
}