sanitize_engine/processor/
csv_proc.rs1use crate::error::{Result, SanitizeError};
20use crate::processor::limits::DEFAULT_INPUT_SIZE;
21use crate::processor::{
22 find_matching_rule, pattern_matches, replace_value, FileTypeProfile, Processor,
23};
24use crate::store::MappingStore;
25
26pub struct CsvProcessor;
28
29impl Processor for CsvProcessor {
30 fn name(&self) -> &'static str {
31 "csv"
32 }
33
34 fn can_handle(&self, _content: &[u8], profile: &FileTypeProfile) -> bool {
35 profile.processor == "csv"
36 }
37
38 fn process(
39 &self,
40 content: &[u8],
41 profile: &FileTypeProfile,
42 store: &MappingStore,
43 ) -> Result<Vec<u8>> {
44 if content.len() > DEFAULT_INPUT_SIZE {
46 return Err(SanitizeError::InputTooLarge {
47 size: content.len(),
48 limit: DEFAULT_INPUT_SIZE,
49 });
50 }
51
52 let delimiter = profile
53 .options
54 .get("delimiter")
55 .and_then(|s| s.as_bytes().first().copied())
56 .unwrap_or(b',');
57
58 let has_header = profile
59 .options
60 .get("has_header")
61 .map_or(true, |v| v != "false");
62
63 let mut reader = csv::ReaderBuilder::new()
64 .delimiter(delimiter)
65 .has_headers(has_header)
66 .flexible(true)
67 .from_reader(content);
68
69 let mut output = Vec::new();
70 let mut wtr = csv::WriterBuilder::new()
71 .delimiter(delimiter)
72 .from_writer(&mut output);
73
74 let column_rules: Vec<Option<usize>> = if has_header {
76 let headers = reader
77 .headers()
78 .map_err(|e| SanitizeError::ParseError {
79 format: "CSV".into(),
80 message: format!("CSV header error: {}", e),
81 })?
82 .clone();
83
84 wtr.write_record(headers.iter()).map_err(|e| {
86 SanitizeError::IoError(std::io::Error::other(format!("CSV write error: {e}")))
87 })?;
88
89 headers
93 .iter()
94 .map(|h| {
95 profile
96 .fields
97 .iter()
98 .position(|r| pattern_matches(&r.pattern, h))
99 })
100 .collect()
101 } else {
102 Vec::new()
103 };
104
105 for result in reader.records() {
106 let record = result.map_err(|e| SanitizeError::ParseError {
107 format: "CSV".into(),
108 message: format!("CSV read error: {}", e),
109 })?;
110
111 let mut row: Vec<String> = Vec::with_capacity(record.len());
112 for (idx, field) in record.iter().enumerate() {
113 if has_header {
114 if let Some(Some(rule_idx)) = column_rules.get(idx) {
115 let rule = &profile.fields[*rule_idx];
116 let replaced = replace_value(field, rule, store)?;
117 row.push(replaced);
118 } else {
119 row.push(field.to_string());
120 }
121 } else {
122 let col_key = idx.to_string();
124 if let Some(rule) = find_matching_rule(&col_key, profile) {
125 let replaced = replace_value(field, rule, store)?;
126 row.push(replaced);
127 } else {
128 row.push(field.to_string());
129 }
130 }
131 }
132
133 wtr.write_record(&row).map_err(|e| {
134 SanitizeError::IoError(std::io::Error::other(format!("CSV write error: {e}")))
135 })?;
136 }
137
138 wtr.flush().map_err(|e| {
139 SanitizeError::IoError(std::io::Error::other(format!("CSV flush error: {e}")))
140 })?;
141 drop(wtr);
142
143 Ok(output)
144 }
145}
146
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Builds a mapping store around an HMAC generator with a fixed key.
    fn make_store() -> MappingStore {
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(generator, None)
    }

    /// Runs the CSV processor over `input` with a fresh store and returns the
    /// output as UTF-8 text, panicking if processing or decoding fails.
    fn sanitize(input: &[u8], profile: &FileTypeProfile) -> String {
        let store = make_store();
        let bytes = CsvProcessor.process(input, profile, &store).unwrap();
        String::from_utf8(bytes).unwrap()
    }

    /// Profile for the `csv` processor with a single rule on the `email` field.
    fn email_only_profile() -> FileTypeProfile {
        FileTypeProfile::new(
            "csv",
            vec![FieldRule::new("email").with_category(Category::Email)],
        )
    }

    /// Matched columns are replaced; the header and unmatched columns survive.
    #[test]
    fn basic_csv_replacement() {
        let profile = FileTypeProfile::new(
            "csv",
            vec![
                FieldRule::new("name").with_category(Category::Name),
                FieldRule::new("email").with_category(Category::Email),
            ],
        );

        let out = sanitize(
            b"name,email,department\nAlice,alice@corp.com,Engineering\nBob,bob@corp.com,Sales\n",
            &profile,
        );

        for secret in ["Alice", "alice@corp.com", "Bob", "bob@corp.com"] {
            assert!(!out.contains(secret));
        }
        for kept in ["Engineering", "Sales"] {
            assert!(out.contains(kept));
        }
        assert!(out.starts_with("name,email,department"));
    }

    /// `can_handle` depends on the profile's processor name only.
    #[test]
    fn can_handle_requires_csv_profile() {
        let sample: &[u8] = b"a,b,c\n1,2,3\n";
        let csv_profile = FileTypeProfile::new("csv", vec![]).with_extension(".csv");
        let json_profile = FileTypeProfile::new("json", vec![]).with_extension(".json");

        assert!(CsvProcessor.can_handle(sample, &csv_profile));
        assert!(!CsvProcessor.can_handle(sample, &json_profile));
    }

    /// A tab delimiter supplied through the profile options is honoured.
    #[test]
    fn tsv_delimiter() {
        let mut profile = email_only_profile();
        profile.options.insert("delimiter".into(), "\t".into());

        let out = sanitize(b"name\temail\nAlice\talice@corp.com\n", &profile);

        assert!(!out.contains("alice@corp.com"));
        assert!(out.contains("Alice"));
    }

    /// Without a header row, a rule named after the column index selects it.
    #[test]
    fn no_header_mode_matches_by_column_index() {
        let mut profile = FileTypeProfile::new(
            "csv",
            vec![FieldRule::new("1").with_category(Category::Email)],
        );
        profile.options.insert("has_header".into(), "false".into());

        let out = sanitize(b"Alice,alice@corp.com,Engineering\n", &profile);

        assert!(!out.contains("alice@corp.com"));
        assert!(out.contains("Alice"));
        assert!(out.contains("Engineering"));
    }

    /// Input consisting of just a header row is echoed back.
    #[test]
    fn header_only_no_data_rows() {
        let out = sanitize(b"name,email,department\n", &email_only_profile());

        assert!(out.contains("name,email,department"));
    }

    /// A blank line before the data row does not stop the data row from being
    /// replaced.
    // NOTE(review): the blank line is presumably skipped by the CSV reader
    // rather than parsed as an empty field — confirm this is the intended case.
    #[test]
    fn empty_field_passes_through() {
        let out = sanitize(b"email\n\nalice@corp.com\n", &email_only_profile());

        assert!(!out.contains("alice@corp.com"));
    }

    /// Columns without a matching rule are copied to the output verbatim.
    #[test]
    fn unmatched_columns_pass_through_unchanged() {
        let out = sanitize(b"id,email\n42,alice@corp.com\n", &email_only_profile());

        assert!(out.contains("42"), "id column must be preserved");
        assert!(!out.contains("alice@corp.com"));
    }

    /// Equal inputs within one run map to equal replacements that differ from
    /// the original value.
    #[test]
    fn csv_deterministic_replacement() {
        let out = sanitize(b"email\ntest@x.com\ntest@x.com\n", &email_only_profile());

        // Skip the header line, then compare the two data rows.
        let mut data_rows = out.lines().skip(1);
        let first = data_rows.next().unwrap();
        let second = data_rows.next().unwrap();

        assert_eq!(first, second);
        assert_ne!(first, "test@x.com");
    }
}