use crate::error::{Result, SanitizeError};
use crate::processor::{find_matching_rule, pattern_matches, replace_value, FileTypeProfile, Processor};
use crate::store::MappingStore;
/// Upper bound, in bytes, on the CSV content accepted by
/// `CsvProcessor::process` (256 * 1024 * 1024 = 256 MiB). Larger inputs are
/// rejected with `SanitizeError::InputTooLarge` before any parsing happens.
const MAX_CSV_INPUT_SIZE: usize = 256 * 1024 * 1024;
/// Processor for delimiter-separated (CSV) content.
///
/// A field-less unit struct: all configuration is read from the
/// `FileTypeProfile` passed to each call. Its `Processor` implementation
/// parses the input with the `csv` crate, replaces the values of fields that
/// match a profile rule, and re-serializes the records.
pub struct CsvProcessor;
impl Processor for CsvProcessor {
fn name(&self) -> &'static str {
"csv"
}
fn can_handle(&self, _content: &[u8], profile: &FileTypeProfile) -> bool {
profile.processor == "csv"
}
fn process(
&self,
content: &[u8],
profile: &FileTypeProfile,
store: &MappingStore,
) -> Result<Vec<u8>> {
if content.len() > MAX_CSV_INPUT_SIZE {
return Err(SanitizeError::InputTooLarge {
size: content.len(),
limit: MAX_CSV_INPUT_SIZE,
});
}
let delimiter = profile
.options
.get("delimiter")
.and_then(|s| s.as_bytes().first().copied())
.unwrap_or(b',');
let has_header = profile
.options
.get("has_header")
.map_or(true, |v| v != "false");
let mut reader = csv::ReaderBuilder::new()
.delimiter(delimiter)
.has_headers(has_header)
.flexible(true)
.from_reader(content);
let mut output = Vec::new();
let mut wtr = csv::WriterBuilder::new()
.delimiter(delimiter)
.from_writer(&mut output);
let column_rules: Vec<Option<usize>> = if has_header {
let headers = reader
.headers()
.map_err(|e| SanitizeError::ParseError {
format: "CSV".into(),
message: format!("CSV header error: {}", e),
})?
.clone();
wtr.write_record(headers.iter())
.map_err(|e| SanitizeError::IoError(format!("CSV write error: {}", e)))?;
headers
.iter()
.map(|h| {
profile
.fields
.iter()
.position(|r| pattern_matches(&r.pattern, h))
})
.collect()
} else {
Vec::new()
};
for result in reader.records() {
let record = result.map_err(|e| SanitizeError::ParseError {
format: "CSV".into(),
message: format!("CSV read error: {}", e),
})?;
let mut row: Vec<String> = Vec::with_capacity(record.len());
for (idx, field) in record.iter().enumerate() {
if has_header {
if let Some(Some(rule_idx)) = column_rules.get(idx) {
let rule = &profile.fields[*rule_idx];
let replaced = replace_value(field, rule, store)?;
row.push(replaced);
} else {
row.push(field.to_string());
}
} else {
let col_key = idx.to_string();
if let Some(rule) = find_matching_rule(&col_key, profile) {
let replaced = replace_value(field, rule, store)?;
row.push(replaced);
} else {
row.push(field.to_string());
}
}
}
wtr.write_record(&row)
.map_err(|e| SanitizeError::IoError(format!("CSV write error: {}", e)))?;
}
wtr.flush()
.map_err(|e| SanitizeError::IoError(format!("CSV flush error: {}", e)))?;
drop(wtr);
Ok(output)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::FieldRule;
    use std::sync::Arc;

    /// Builds a `MappingStore` around an `HmacGenerator` constructed from a
    /// fixed 32-byte array, so every test starts from the same configuration.
    fn make_store() -> MappingStore {
        // Named `generator` rather than `gen`, which is a reserved keyword in
        // the Rust 2024 edition.
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        MappingStore::new(generator, None)
    }

    /// Runs `CsvProcessor` over `input` with the given rules and returns the
    /// output decoded as UTF-8 text.
    fn sanitize(input: &[u8], rules: Vec<FieldRule>) -> String {
        let store = make_store();
        let profile = FileTypeProfile::new("csv", rules);
        let bytes = CsvProcessor.process(input, &profile, &store).unwrap();
        String::from_utf8(bytes).unwrap()
    }

    /// Values in rule-matched columns are replaced; the header row and
    /// unmatched columns are passed through.
    #[test]
    fn basic_csv_replacement() {
        let text = sanitize(
            b"name,email,department\nAlice,alice@corp.com,Engineering\nBob,bob@corp.com,Sales\n",
            vec![
                FieldRule::new("name").with_category(Category::Name),
                FieldRule::new("email").with_category(Category::Email),
            ],
        );

        for sensitive in &["Alice", "alice@corp.com", "Bob", "bob@corp.com"] {
            assert!(!text.contains(*sensitive));
        }
        for untouched in &["Engineering", "Sales"] {
            assert!(text.contains(*untouched));
        }
        assert!(text.starts_with("name,email,department"));
    }

    /// The same input value maps to the same replacement within one run.
    #[test]
    fn csv_deterministic_replacement() {
        let text = sanitize(
            b"email\ntest@x.com\ntest@x.com\n",
            vec![FieldRule::new("email").with_category(Category::Email)],
        );

        // Skip the header line; the remaining lines are the data rows.
        let rows: Vec<&str> = text.lines().skip(1).collect();
        assert_eq!(rows[0], rows[1]);
        assert_ne!(rows[0], "test@x.com");
    }
}