1use csv::{ReaderBuilder, StringRecord};
2use std::io::Read;
3use thiserror::Error;
4
/// A single problem found while validating CSV input.
///
/// Its `Display` implementation renders it as
/// `Record #<record_num> has error: <error>`.
#[derive(Debug, Clone, PartialEq)]
pub struct CsvError {
    /// Fields of the offending record. `validate` fills this in for
    /// field-count mismatches; parse errors and line-ending errors leave it
    /// as `None`.
    pub record: Option<Vec<String>>,
    /// Number locating the problem in the input. For
    /// `CsvErrorKind::InvalidLineEnding` this is a 1-based line number rather
    /// than a record number.
    pub record_num: usize,
    /// What went wrong.
    pub error: CsvErrorKind,
}
15
/// Category of a CSV validation problem.
///
/// The `#[error(...)]` attribute on each variant supplies its display text.
#[derive(Debug, Clone, PartialEq, Error)]
pub enum CsvErrorKind {
    /// A record's field count differs from the expected one. Also used for
    /// the `csv` crate's `UnequalLengths` error.
    #[error("wrong number of fields")]
    FieldCount,
    /// Chosen by `convert_csv_error` when the parser's message mentions
    /// "bare".
    #[error("bare \" in non-quoted-field")]
    BareQuote,
    /// Chosen by `convert_csv_error` when the parser's message mentions
    /// "quote" but neither "bare" nor "unterminated".
    #[error("quote in quoted field")]
    Quote,
    /// Fallback used by `convert_csv_error` for any parser error it cannot
    /// classify otherwise.
    #[error("invalid escape sequence")]
    InvalidEscape,
    /// Chosen by `convert_csv_error` when the parser's message mentions
    /// "unterminated".
    #[error("unterminated quote")]
    UnterminatedQuote,
    /// A line feed without a preceding carriage return, or a carriage return
    /// without a following line feed. Only checked in RFC 4180 mode.
    #[error("invalid line ending (RFC 4180 requires CRLF)")]
    InvalidLineEnding,
    /// NOTE(review): never constructed in the code visible in this file.
    #[error("field contains unescaped special characters")]
    UnescapedSpecialChars,
    /// NOTE(review): never constructed in the code visible in this file.
    #[error("trailing comma found")]
    TrailingComma,
    /// I/O failure reported by the parser; carries the parser error's text.
    #[error("I/O error: {0}")]
    Io(String),
    /// Invalid UTF-8 reported by the parser; carries the parser error's text.
    #[error("UTF-8 error: {0}")]
    Utf8(String),
}
40
41impl std::fmt::Display for CsvError {
42 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
43 write!(f, "Record #{} has error: {}", self.record_num, self.error)
44 }
45}
46
/// Outcome of a `validate` run.
#[derive(Debug)]
pub struct ValidationResult {
    /// Every problem collected, in the order it was recorded. Line-ending
    /// errors (RFC 4180 mode only) come before record-level errors because
    /// that check runs first.
    pub errors: Vec<CsvError>,
    /// `true` when `validate` stopped early because reading a record failed
    /// with an error it treats as fatal; a run that reaches the end of the
    /// input reports `false`.
    pub halted: bool,
}
55
56pub fn validate<R: Read>(
66 reader: R,
67 delimiter: u8,
68 lazy_quotes: bool,
69 rfc4180_mode: bool,
70) -> Result<ValidationResult, Box<dyn std::error::Error>> {
71 let mut content = Vec::new();
73 let mut reader = reader;
74 reader.read_to_end(&mut content)?;
75
76 let mut errors = Vec::new();
77
78 if rfc4180_mode {
80 validate_line_endings(&content, &mut errors);
81 }
82
83 let cursor = std::io::Cursor::new(&content);
85 let mut csv_reader = ReaderBuilder::new()
86 .delimiter(delimiter)
87 .flexible(true) .quoting(!lazy_quotes) .from_reader(cursor);
90
91 let mut record_num = 0;
92 let mut header_len: Option<usize> = None;
93 let mut string_record = StringRecord::new();
94
95 match csv_reader.read_record(&mut string_record) {
97 Ok(has_record) => {
98 if has_record {
99 header_len = Some(string_record.len());
100 if !lazy_quotes {
102 validate_record_format(&string_record, 0, &mut errors);
103 }
104 }
105 }
106 Err(csv_error) => {
107 errors.push(CsvError {
108 record: None,
109 record_num: 0,
110 error: convert_csv_error(&csv_error),
111 });
112 return Ok(ValidationResult {
113 errors,
114 halted: true,
115 });
116 }
117 }
118
119 loop {
121 match csv_reader.read_record(&mut string_record) {
122 Ok(has_record) => {
123 if !has_record {
124 break; }
126
127 record_num += 1;
128
129 if !lazy_quotes {
131 validate_record_format(&string_record, record_num + 1, &mut errors);
132 }
133
134 if let Some(expected_len) = header_len {
136 if string_record.len() != expected_len {
137 errors.push(CsvError {
138 record: Some(string_record.iter().map(|s| s.to_string()).collect()),
139 record_num: record_num + 1, error: CsvErrorKind::FieldCount,
141 });
142 }
143 }
144 }
145 Err(csv_error) => {
146 let error_kind = convert_csv_error(&csv_error);
148
149 errors.push(CsvError {
150 record: None,
151 record_num: record_num + 1,
152 error: error_kind,
153 });
154
155 let halted = matches!(
157 csv_error.kind(),
158 csv::ErrorKind::Io(_) | csv::ErrorKind::Utf8 { .. }
159 );
160
161 return Ok(ValidationResult { errors, halted });
162 }
163 }
164 }
165
166 Ok(ValidationResult {
167 errors,
168 halted: false,
169 })
170}
171
172fn validate_line_endings(content: &[u8], errors: &mut Vec<CsvError>) {
174 let mut line_num = 1;
175 let mut i = 0;
176
177 while i < content.len() {
178 if content[i] == b'\n' {
179 if i == 0 || content[i - 1] != b'\r' {
181 errors.push(CsvError {
182 record: None,
183 record_num: line_num,
184 error: CsvErrorKind::InvalidLineEnding,
185 });
186 }
187 line_num += 1;
188 } else if content[i] == b'\r' {
189 if i + 1 >= content.len() || content[i + 1] != b'\n' {
191 errors.push(CsvError {
192 record: None,
193 record_num: line_num,
194 error: CsvErrorKind::InvalidLineEnding,
195 });
196 }
197 }
198 i += 1;
199 }
200}
201
/// Hook for per-record format checks; currently a no-op.
///
/// `validate` calls this for each record it reads when `lazy_quotes` is off,
/// but the body is empty, so no errors are ever produced here. All three
/// parameters are unused, hence the leading underscores.
fn validate_record_format(_record: &StringRecord, _record_num: usize, _errors: &mut [CsvError]) {
    // Intentionally empty: no format rules are implemented.
}
214
215fn convert_csv_error(csv_error: &csv::Error) -> CsvErrorKind {
217 match csv_error.kind() {
218 csv::ErrorKind::UnequalLengths { .. } => CsvErrorKind::FieldCount,
219 csv::ErrorKind::Utf8 { .. } => CsvErrorKind::Utf8(csv_error.to_string()),
220 csv::ErrorKind::Io(_) => CsvErrorKind::Io(csv_error.to_string()),
221 _ => {
222 let error_msg = csv_error.to_string().to_lowercase();
224 if error_msg.contains("bare") {
225 CsvErrorKind::BareQuote
226 } else if error_msg.contains("quote") || error_msg.contains("unterminated") {
227 if error_msg.contains("unterminated") {
228 CsvErrorKind::UnterminatedQuote
229 } else {
230 CsvErrorKind::Quote
231 }
232 } else {
233 CsvErrorKind::InvalidEscape
234 }
235 }
236 }
237}
238
// Unit and fixture-based tests for `validate` and the `CsvError` display format.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Cursor;

    // Well-formed CRLF input: no errors, not halted.
    #[test]
    fn test_perfect_csv() {
        let csv_data = "field1,field2,field3\r\na,b,c\r\nd,e,f\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert!(result.errors.is_empty());
        assert!(!result.halted);
    }

    // A single over-long row is reported once, as record 2, together with
    // the fields of that row.
    #[test]
    fn test_field_count_error() {
        let csv_data = "field1,field2,field3\r\na,b,c\r\nd,e,f,g\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert_eq!(result.errors.len(), 1);
        assert_eq!(result.errors[0].record_num, 2);
        assert_eq!(result.errors[0].error, CsvErrorKind::FieldCount);
        assert_eq!(
            result.errors[0].record,
            Some(vec![
                "d".to_string(),
                "e".to_string(),
                "f".to_string(),
                "g".to_string()
            ])
        );
    }

    // LF-only input with `rfc4180_mode` enabled must yield at least one
    // `InvalidLineEnding` error.
    #[test]
    fn test_line_ending_validation() {
        let csv_data = "field1,field2,field3\na,b,c\nd,e,f\n";
        let result = validate(Cursor::new(csv_data), b',', false, true).unwrap();
        assert!(!result.errors.is_empty());
        assert!(
            result
                .errors
                .iter()
                .any(|e| matches!(e.error, CsvErrorKind::InvalidLineEnding))
        );
    }

    // LF-only input with `rfc4180_mode` disabled: the line-ending check does
    // not run, so no `InvalidLineEnding` error may appear. `lazy_quotes` is
    // on for this call.
    #[test]
    fn test_lazy_quotes_allows_lf() {
        let csv_data = "field1,field2,field3\na,b,c\nd,e,f\n";
        let result = validate(Cursor::new(csv_data), b',', true, false).unwrap();
        assert!(
            result
                .errors
                .iter()
                .all(|e| !matches!(e.error, CsvErrorKind::InvalidLineEnding))
        );
    }

    // A header plus one well-formed row parses without errors.
    #[test]
    fn test_csv_parser_validation() {
        let csv_data = "field1,field2,field3\r\na,b,c\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert!(result.errors.is_empty());
    }

    // Quoted fields, including a doubled quote (`""`) inside a quoted field,
    // are accepted.
    #[test]
    fn test_proper_quote_escaping() {
        let csv_data = "field1,field2,field3\r\n\"a\",\"b\"\"c\",\"d\"\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        // Print any unexpected errors so a failing run shows what they were.
        for error in &result.errors {
            println!("Error: {:?}", error);
        }
        assert!(result.errors.is_empty());
    }

    // The delimiter is configurable; tab-separated input validates cleanly.
    #[test]
    fn test_different_delimiters() {
        let csv_data = "field1\tfield2\tfield3\r\na\tb\tc\r\nd\te\tf\r\n";
        let result = validate(Cursor::new(csv_data), b'\t', false, false).unwrap();
        assert!(result.errors.is_empty());
        assert!(!result.halted);
    }

    // Validation keeps going after a field-count error: both over-long rows
    // (records 2 and 4) are reported.
    #[test]
    fn test_multiple_field_count_errors() {
        let csv_data = "field1,field2,field3\r\na,b,c\r\nd,e,f,g\r\nh,i,j\r\nk,l,m,n\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert_eq!(result.errors.len(), 2);
        assert_eq!(result.errors[0].record_num, 2);
        assert_eq!(result.errors[1].record_num, 4);
    }

    // CRLF-terminated input with quoted fields passes with `rfc4180_mode` on.
    #[test]
    fn test_rfc4180_compliance_mode() {
        let csv_data =
            "Name,Age,City\r\n\"John Doe\",30,\"New York\"\r\n\"Jane Smith\",25,Chicago\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, true).unwrap();
        assert!(result.errors.is_empty());
        assert!(!result.halted);
    }

    // Quoted fields may contain the delimiter, a doubled quote and an
    // embedded CRLF without triggering errors.
    #[test]
    fn test_fields_with_commas_and_quotes() {
        let csv_data = "field1,field2,field3\r\n\"a,b\",\"c\"\"d\",\"e\r\nf\"\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert!(result.errors.is_empty());
    }

    // One fixture file plus the outcome `validate` is expected to produce.
    struct TestCase {
        // Path of the fixture, relative to the directory the tests run from.
        file: &'static str,
        // Field separator used by the fixture.
        delimiter: u8,
        // Expected number of errors, not counting `InvalidLineEnding`.
        expected_errors: usize,
        // Expected `record_num` of each error, in order.
        expected_error_records: Vec<usize>,
        // Expected value of `ValidationResult::halted`.
        expected_halted: bool,
    }

    // Runs `validate` over the fixture files under `test_data/`.
    // NOTE(review): the fixture contents are not visible from this file.
    #[test]
    fn integration_tests() {
        let test_cases = vec![
            TestCase {
                file: "test_data/perfect.csv",
                delimiter: b',',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/perfect_tab.csv",
                delimiter: b'\t',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/perfect_pipe.csv",
                delimiter: b'|',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/perfect_colon.csv",
                delimiter: b':',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/perfect_semicolon.csv",
                delimiter: b';',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/one_long_column.csv",
                delimiter: b',',
                expected_errors: 1,
                expected_error_records: vec![2],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/mult_long_columns.csv",
                delimiter: b',',
                expected_errors: 2,
                expected_error_records: vec![2, 4],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/mult_long_columns_tabs.csv",
                delimiter: b'\t',
                expected_errors: 2,
                expected_error_records: vec![2, 4],
                expected_halted: false,
            },
        ];

        for test_case in test_cases {
            println!("Testing file: {}", test_case.file);

            let file = File::open(test_case.file)
                .unwrap_or_else(|_| panic!("Could not open test file: {}", test_case.file));

            // Fixtures are validated with lazy quotes on and RFC 4180 mode off.
            let result = validate(file, test_case.delimiter, true, false).unwrap();

            // Line-ending errors are excluded so only record-level problems
            // are compared against the expectations.
            let relevant_errors: Vec<_> = result
                .errors
                .iter()
                .filter(|e| !matches!(e.error, CsvErrorKind::InvalidLineEnding))
                .collect();

            assert_eq!(
                relevant_errors.len(),
                test_case.expected_errors,
                "Wrong number of errors for {}",
                test_case.file
            );

            assert_eq!(
                result.halted, test_case.expected_halted,
                "Wrong halted status for {}",
                test_case.file
            );

            // Every expected error must be a field-count error on the listed
            // record.
            for (i, expected_record_num) in test_case.expected_error_records.iter().enumerate() {
                assert_eq!(
                    relevant_errors[i].record_num, *expected_record_num,
                    "Wrong record number for error {} in {}",
                    i, test_case.file
                );
                assert_eq!(
                    relevant_errors[i].error,
                    CsvErrorKind::FieldCount,
                    "Wrong error type for error {} in {}",
                    i,
                    test_case.file
                );
            }
        }
    }

    // Pins the exact user-facing text produced by `CsvError`'s `Display`.
    #[test]
    fn test_error_display() {
        let error = CsvError {
            record: Some(vec!["a".to_string(), "b".to_string(), "c".to_string()]),
            record_num: 3,
            error: CsvErrorKind::FieldCount,
        };
        assert_eq!(
            error.to_string(),
            "Record #3 has error: wrong number of fields"
        );

        let error = CsvError {
            record: Some(vec!["d".to_string(), "e".to_string(), "f".to_string()]),
            record_num: 1,
            error: CsvErrorKind::BareQuote,
        };
        assert_eq!(
            error.to_string(),
            "Record #1 has error: bare \" in non-quoted-field"
        );
    }
}