use csv::{ReaderBuilder, StringRecord};
use std::io::Read;
use thiserror::Error;
/// A single problem found while validating CSV input.
///
/// Rendered through `Display` as `Record #<record_num> has error: <error>`.
#[derive(Debug, Clone, PartialEq)]
pub struct CsvError {
/// Fields of the offending record, when they were available at the point
/// the error was raised. Field-count errors carry the record; parser errors
/// and line-ending errors are reported with `None`.
pub record: Option<Vec<String>>,
/// Position the error was reported at. The numbering depends on the check
/// that produced the error: `validate_line_endings` stores a physical line
/// number counted from 1, while `validate` stores its own record counter.
pub record_num: usize,
/// What went wrong.
pub error: CsvErrorKind,
}
/// Classification of a validation failure.
///
/// The `#[error(...)]` strings are the user-facing messages produced by the
/// `thiserror`-generated `Display` implementation.
#[derive(Debug, Clone, PartialEq, Error)]
pub enum CsvErrorKind {
/// A record's field count differs from the expected count.
#[error("wrong number of fields")]
FieldCount,
/// Chosen by `convert_csv_error` when the parser message mentions "bare".
#[error("bare \" in non-quoted-field")]
BareQuote,
/// Chosen by `convert_csv_error` when the parser message mentions "quote"
/// but neither "bare" nor "unterminated".
#[error("quote in quoted field")]
Quote,
/// Fallback used by `convert_csv_error` for any parser error it cannot
/// classify otherwise.
#[error("invalid escape sequence")]
InvalidEscape,
/// Chosen by `convert_csv_error` when the parser message mentions
/// "unterminated".
#[error("unterminated quote")]
UnterminatedQuote,
/// A line terminator other than CRLF; produced by `validate_line_endings`.
#[error("invalid line ending (RFC 4180 requires CRLF)")]
InvalidLineEnding,
/// NOTE(review): not constructed anywhere in the visible part of this file.
#[error("field contains unescaped special characters")]
UnescapedSpecialChars,
/// NOTE(review): not constructed anywhere in the visible part of this file.
#[error("trailing comma found")]
TrailingComma,
/// I/O failure reported by the csv reader; the payload is the rendered
/// text of the underlying error.
#[error("I/O error: {0}")]
Io(String),
/// Invalid UTF-8 reported by the csv reader; the payload is the rendered
/// text of the underlying error.
#[error("UTF-8 error: {0}")]
Utf8(String),
}
/// Human-readable rendering: `Record #<n> has error: <kind message>`.
///
/// The stored `record` fields are intentionally not part of the message.
impl std::fmt::Display for CsvError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            record_num, error, ..
        } = self;
        write!(f, "Record #{} has error: {}", record_num, error)
    }
}
/// Outcome of a `validate` run.
#[derive(Debug)]
pub struct ValidationResult {
/// Every problem found, in the order it was recorded.
pub errors: Vec<CsvError>,
/// Set by `validate` when a reader error is classified as fatal: always for
/// an error on the very first read, and for I/O or UTF-8 errors afterwards.
pub halted: bool,
}
pub fn validate<R: Read>(
reader: R,
delimiter: u8,
lazy_quotes: bool,
rfc4180_mode: bool,
) -> Result<ValidationResult, Box<dyn std::error::Error>> {
let mut content = Vec::new();
let mut reader = reader;
reader.read_to_end(&mut content)?;
let mut errors = Vec::new();
if rfc4180_mode {
validate_line_endings(&content, &mut errors);
}
let cursor = std::io::Cursor::new(&content);
let mut csv_reader = ReaderBuilder::new()
.delimiter(delimiter)
.flexible(true) .quoting(!lazy_quotes) .from_reader(cursor);
let mut record_num = 0;
let mut header_len: Option<usize> = None;
let mut string_record = StringRecord::new();
match csv_reader.read_record(&mut string_record) {
Ok(has_record) => {
if has_record {
header_len = Some(string_record.len());
if !lazy_quotes {
validate_record_format(&string_record, 0, &mut errors);
}
}
}
Err(csv_error) => {
errors.push(CsvError {
record: None,
record_num: 0,
error: convert_csv_error(&csv_error),
});
return Ok(ValidationResult {
errors,
halted: true,
});
}
}
loop {
match csv_reader.read_record(&mut string_record) {
Ok(has_record) => {
if !has_record {
break; }
record_num += 1;
if !lazy_quotes {
validate_record_format(&string_record, record_num + 1, &mut errors);
}
if let Some(expected_len) = header_len {
if string_record.len() != expected_len {
errors.push(CsvError {
record: Some(string_record.iter().map(|s| s.to_string()).collect()),
record_num: record_num + 1, error: CsvErrorKind::FieldCount,
});
}
}
}
Err(csv_error) => {
let error_kind = convert_csv_error(&csv_error);
errors.push(CsvError {
record: None,
record_num: record_num + 1,
error: error_kind,
});
let halted = matches!(
csv_error.kind(),
csv::ErrorKind::Io(_) | csv::ErrorKind::Utf8 { .. }
);
return Ok(ValidationResult { errors, halted });
}
}
}
Ok(ValidationResult {
errors,
halted: false,
})
}
/// Scans the raw bytes and records an [`CsvErrorKind::InvalidLineEnding`] for
/// every line terminator that is not a CRLF pair.
///
/// `content` is the unparsed input; `errors` receives one entry per offending
/// terminator, in input order, with `record: None` and `record_num` set to the
/// physical line number (counted from 1) on which the terminator occurs.
///
/// CRLF, a bare LF and a bare CR each end a line, so the line counter advances
/// after every one of them. This keeps later diagnostics on the right line
/// even when the input uses CR-only terminators. The scan is byte-based and
/// does not track quoting, so terminators inside quoted fields are checked and
/// counted as well.
fn validate_line_endings(content: &[u8], errors: &mut Vec<CsvError>) {
    let mut line_num = 1;
    let mut i = 0;
    while i < content.len() {
        match content[i] {
            // Well-formed CRLF: consume both bytes as a single terminator.
            b'\r' if content.get(i + 1) == Some(&b'\n') => {
                line_num += 1;
                i += 2;
            }
            // Bare CR or bare LF: report it, but still treat it as the end of
            // the current line.
            b'\r' | b'\n' => {
                errors.push(CsvError {
                    record: None,
                    record_num: line_num,
                    error: CsvErrorKind::InvalidLineEnding,
                });
                line_num += 1;
                i += 1;
            }
            _ => i += 1,
        }
    }
}
/// Placeholder for per-record format checks.
///
/// The body is empty: all three parameters are ignored and nothing is ever
/// recorded. `validate` calls it for each record when `lazy_quotes` is off.
fn validate_record_format(_record: &StringRecord, _record_num: usize, _errors: &mut [CsvError]) {
}
/// Maps an error from the `csv` crate onto this module's [`CsvErrorKind`].
///
/// Unequal-length, UTF-8 and I/O errors are translated directly; the latter
/// two keep the rendered text of the original error. Anything else is
/// classified heuristically from its lower-cased message, checked in this
/// order: "bare", "unterminated", "quote". If none of those words appears the
/// result is [`CsvErrorKind::InvalidEscape`].
fn convert_csv_error(csv_error: &csv::Error) -> CsvErrorKind {
    // Structured kinds first: no message inspection needed for these.
    match csv_error.kind() {
        csv::ErrorKind::Io(_) => return CsvErrorKind::Io(csv_error.to_string()),
        csv::ErrorKind::Utf8 { .. } => return CsvErrorKind::Utf8(csv_error.to_string()),
        csv::ErrorKind::UnequalLengths { .. } => return CsvErrorKind::FieldCount,
        _ => {}
    }

    let message = csv_error.to_string().to_lowercase();
    let mentions = |needle: &str| message.contains(needle);

    if mentions("bare") {
        CsvErrorKind::BareQuote
    } else if mentions("unterminated") {
        CsvErrorKind::UnterminatedQuote
    } else if mentions("quote") {
        CsvErrorKind::Quote
    } else {
        CsvErrorKind::InvalidEscape
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Cursor;

    /// Runs `validate` over an in-memory string and unwraps the outcome.
    fn check(
        data: &str,
        delimiter: u8,
        lazy_quotes: bool,
        rfc4180_mode: bool,
    ) -> ValidationResult {
        validate(Cursor::new(data), delimiter, lazy_quotes, rfc4180_mode).unwrap()
    }

    /// True when at least one recorded error is a line-ending complaint.
    fn has_line_ending_error(outcome: &ValidationResult) -> bool {
        outcome
            .errors
            .iter()
            .any(|e| e.error == CsvErrorKind::InvalidLineEnding)
    }

    // A well-formed comma-separated file produces no errors and no halt.
    #[test]
    fn test_perfect_csv() {
        let outcome = check(
            "field1,field2,field3\r\na,b,c\r\nd,e,f\r\n",
            b',',
            false,
            false,
        );
        assert!(outcome.errors.is_empty());
        assert!(!outcome.halted);
    }

    // One over-long row is reported once, with its number and its fields.
    #[test]
    fn test_field_count_error() {
        let outcome = check(
            "field1,field2,field3\r\na,b,c\r\nd,e,f,g\r\n",
            b',',
            false,
            false,
        );
        assert_eq!(outcome.errors.len(), 1);

        let reported = &outcome.errors[0];
        assert_eq!(reported.record_num, 2);
        assert_eq!(reported.error, CsvErrorKind::FieldCount);

        let expected_fields: Vec<String> =
            ["d", "e", "f", "g"].iter().map(|s| s.to_string()).collect();
        assert_eq!(reported.record, Some(expected_fields));
    }

    // LF-only input is rejected when RFC 4180 mode is switched on.
    #[test]
    fn test_line_ending_validation() {
        let outcome = check("field1,field2,field3\na,b,c\nd,e,f\n", b',', false, true);
        assert!(!outcome.errors.is_empty());
        assert!(has_line_ending_error(&outcome));
    }

    // Without RFC 4180 mode, LF-only input raises no line-ending errors.
    #[test]
    fn test_lazy_quotes_allows_lf() {
        let outcome = check("field1,field2,field3\na,b,c\nd,e,f\n", b',', true, false);
        assert!(!has_line_ending_error(&outcome));
    }

    // Minimal header-plus-one-row file passes cleanly.
    #[test]
    fn test_csv_parser_validation() {
        let outcome = check("field1,field2,field3\r\na,b,c\r\n", b',', false, false);
        assert!(outcome.errors.is_empty());
    }

    // Doubled quotes inside a quoted field are valid escaping.
    #[test]
    fn test_proper_quote_escaping() {
        let outcome = check(
            "field1,field2,field3\r\n\"a\",\"b\"\"c\",\"d\"\r\n",
            b',',
            false,
            false,
        );
        // Dump anything unexpected so a failure is easy to diagnose.
        outcome
            .errors
            .iter()
            .for_each(|error| println!("Error: {:?}", error));
        assert!(outcome.errors.is_empty());
    }

    // The delimiter is configurable; tab-separated input validates too.
    #[test]
    fn test_different_delimiters() {
        let outcome = check(
            "field1\tfield2\tfield3\r\na\tb\tc\r\nd\te\tf\r\n",
            b'\t',
            false,
            false,
        );
        assert!(outcome.errors.is_empty());
        assert!(!outcome.halted);
    }

    // Every over-long row is reported, each with its own record number.
    #[test]
    fn test_multiple_field_count_errors() {
        let outcome = check(
            "field1,field2,field3\r\na,b,c\r\nd,e,f,g\r\nh,i,j\r\nk,l,m,n\r\n",
            b',',
            false,
            false,
        );
        let reported_numbers: Vec<usize> = outcome.errors.iter().map(|e| e.record_num).collect();
        assert_eq!(reported_numbers, vec![2, 4]);
    }

    // CRLF-terminated, properly quoted input passes in RFC 4180 mode.
    #[test]
    fn test_rfc4180_compliance_mode() {
        let outcome = check(
            "Name,Age,City\r\n\"John Doe\",30,\"New York\"\r\n\"Jane Smith\",25,Chicago\r\n",
            b',',
            false,
            true,
        );
        assert!(outcome.errors.is_empty());
        assert!(!outcome.halted);
    }

    // Quoted fields may contain delimiters, escaped quotes and line breaks.
    #[test]
    fn test_fields_with_commas_and_quotes() {
        let outcome = check(
            "field1,field2,field3\r\n\"a,b\",\"c\"\"d\",\"e\r\nf\"\r\n",
            b',',
            false,
            false,
        );
        assert!(outcome.errors.is_empty());
    }

    /// One fixture file together with the result expected from validating it.
    struct TestCase {
        file: &'static str,
        delimiter: u8,
        expected_errors: usize,
        expected_error_records: Vec<usize>,
        expected_halted: bool,
    }

    /// Builds a non-halting case whose error count equals the number of
    /// expected error records.
    fn case(file: &'static str, delimiter: u8, expected_error_records: Vec<usize>) -> TestCase {
        TestCase {
            file,
            delimiter,
            expected_errors: expected_error_records.len(),
            expected_error_records,
            expected_halted: false,
        }
    }

    // Validates every fixture under `test_data/` and compares the field-count
    // errors (line-ending errors are ignored) against the expectations.
    #[test]
    fn integration_tests() {
        let test_cases = vec![
            case("test_data/perfect.csv", b',', vec![]),
            case("test_data/perfect_tab.csv", b'\t', vec![]),
            case("test_data/perfect_pipe.csv", b'|', vec![]),
            case("test_data/perfect_colon.csv", b':', vec![]),
            case("test_data/perfect_semicolon.csv", b';', vec![]),
            case("test_data/one_long_column.csv", b',', vec![2]),
            case("test_data/mult_long_columns.csv", b',', vec![2, 4]),
            case("test_data/mult_long_columns_tabs.csv", b'\t', vec![2, 4]),
        ];

        for TestCase {
            file,
            delimiter,
            expected_errors,
            expected_error_records,
            expected_halted,
        } in test_cases
        {
            println!("Testing file: {}", file);
            let handle = File::open(file)
                .unwrap_or_else(|_| panic!("Could not open test file: {}", file));
            let result = validate(handle, delimiter, true, false).unwrap();

            let relevant_errors: Vec<&CsvError> = result
                .errors
                .iter()
                .filter(|e| e.error != CsvErrorKind::InvalidLineEnding)
                .collect();

            assert_eq!(
                relevant_errors.len(),
                expected_errors,
                "Wrong number of errors for {}",
                file
            );
            assert_eq!(
                result.halted, expected_halted,
                "Wrong halted status for {}",
                file
            );

            for (i, expected_record_num) in expected_error_records.iter().enumerate() {
                let reported = relevant_errors[i];
                assert_eq!(
                    reported.record_num, *expected_record_num,
                    "Wrong record number for error {} in {}",
                    i, file
                );
                assert_eq!(
                    reported.error,
                    CsvErrorKind::FieldCount,
                    "Wrong error type for error {} in {}",
                    i,
                    file
                );
            }
        }
    }

    // `Display` output combines the record number with the kind's message.
    #[test]
    fn test_error_display() {
        let fields = |items: [&str; 3]| -> Vec<String> {
            items.iter().map(|s| s.to_string()).collect()
        };

        let field_count = CsvError {
            record: Some(fields(["a", "b", "c"])),
            record_num: 3,
            error: CsvErrorKind::FieldCount,
        };
        assert_eq!(
            field_count.to_string(),
            "Record #3 has error: wrong number of fields"
        );

        let bare_quote = CsvError {
            record: Some(fields(["d", "e", "f"])),
            record_num: 1,
            error: CsvErrorKind::BareQuote,
        };
        assert_eq!(
            bare_quote.to_string(),
            "Record #1 has error: bare \" in non-quoted-field"
        );
    }
}