Skip to main content

csvlint/
lib.rs

1use csv::{ReaderBuilder, StringRecord};
2use std::io::Read;
3use thiserror::Error;
4
/// Error information about an invalid record in a CSV file
///
/// Instances are collected by `validate` and rendered by the `Display` impl as
/// `Record #<record_num> has error: <error>`.
#[derive(Debug, Clone, PartialEq)]
pub struct CsvError {
    /// The invalid record. This will be None when we were unable to parse a record.
    /// It is also `None` for line-ending errors, which are found by scanning the
    /// raw bytes rather than parsed records.
    pub record: Option<Vec<String>>,
    /// The record number of this record (1-indexed, excluding header)
    ///
    /// Two cases deviate from that: for `CsvErrorKind::InvalidLineEnding` this
    /// holds the 1-based physical line number counted from the start of the
    /// input, and it is 0 when the very first read from the CSV reader fails.
    pub record_num: usize,
    /// The underlying error
    pub error: CsvErrorKind,
}
15
/// Types of CSV validation errors
///
/// The `#[error(...)]` attribute on each variant supplies its `Display` text.
#[derive(Debug, Clone, PartialEq, Error)]
pub enum CsvErrorKind {
    /// A record's field count differs from the expected field count.
    #[error("wrong number of fields")]
    FieldCount,
    /// Chosen by `convert_csv_error` when the parser's message mentions "bare".
    #[error("bare \" in non-quoted-field")]
    BareQuote,
    /// Chosen by `convert_csv_error` when the parser's message mentions "quote"
    /// but neither "bare" nor "unterminated".
    #[error("quote in quoted field")]
    Quote,
    /// Fallback used by `convert_csv_error` for any parser error it cannot
    /// classify more precisely.
    #[error("invalid escape sequence")]
    InvalidEscape,
    /// Chosen by `convert_csv_error` when the parser's message mentions
    /// "unterminated".
    #[error("unterminated quote")]
    UnterminatedQuote,
    /// A line break that is not a CR LF pair was found; only reported when
    /// `validate` runs with `rfc4180_mode` enabled.
    #[error("invalid line ending (RFC 4180 requires CRLF)")]
    InvalidLineEnding,
    /// NOTE(review): not constructed anywhere in this file.
    #[error("field contains unescaped special characters")]
    UnescapedSpecialChars,
    /// NOTE(review): not constructed anywhere in this file.
    #[error("trailing comma found")]
    TrailingComma,
    /// An I/O failure reported by the CSV reader; carries that error's text.
    #[error("I/O error: {0}")]
    Io(String),
    /// Invalid UTF-8 reported by the CSV reader; carries that error's text.
    #[error("UTF-8 error: {0}")]
    Utf8(String),
}
40
41impl std::fmt::Display for CsvError {
42    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
43        write!(f, "Record #{} has error: {}", self.record_num, self.error)
44    }
45}
46
/// Result of CSV validation
///
/// Returned by `validate`; a failure to read the input itself is reported
/// through that function's `Err` value instead.
#[derive(Debug)]
pub struct ValidationResult {
    /// List of validation errors found, in the order they were detected
    /// (line-ending errors, when checked, come before record errors).
    pub errors: Vec<CsvError>,
    /// Whether parsing was halted due to a fatal error. When set, `errors`
    /// only covers the part of the input that was read before the failure.
    pub halted: bool,
}
55
56/// Validates whether a CSV file conforms to RFC 4180
57///
58/// # Arguments
59/// * `reader` - A reader containing CSV data
60/// * `delimiter` - The field delimiter character (e.g., ',', '\t', '|')
61/// * `lazy_quotes` - Whether to attempt parsing lines that aren't quoted properly
62///
63/// # Returns
64/// A `ValidationResult` containing any errors found and whether parsing was halted
65pub fn validate<R: Read>(
66    reader: R,
67    delimiter: u8,
68    lazy_quotes: bool,
69    rfc4180_mode: bool,
70) -> Result<ValidationResult, Box<dyn std::error::Error>> {
71    // First, read the entire content to check line endings and other RFC 4180 requirements
72    let mut content = Vec::new();
73    let mut reader = reader;
74    reader.read_to_end(&mut content)?;
75
76    let mut errors = Vec::new();
77
78    // Check for proper line endings (RFC 4180 requires CRLF)
79    if rfc4180_mode {
80        validate_line_endings(&content, &mut errors);
81    }
82
83    // Now validate CSV structure using the csv crate
84    let cursor = std::io::Cursor::new(&content);
85    let mut csv_reader = ReaderBuilder::new()
86        .delimiter(delimiter)
87        .flexible(true) // Allow variable number of fields per record for validation
88        .quoting(!lazy_quotes) // Disable strict quoting if lazy_quotes is true
89        .from_reader(cursor);
90
91    let mut record_num = 0;
92    let mut header_len: Option<usize> = None;
93    let mut string_record = StringRecord::new();
94
95    // Read header first
96    match csv_reader.read_record(&mut string_record) {
97        Ok(has_record) => {
98            if has_record {
99                header_len = Some(string_record.len());
100                // Validate header doesn't end with comma (trailing comma)
101                if !lazy_quotes {
102                    validate_record_format(&string_record, 0, &mut errors);
103                }
104            }
105        }
106        Err(csv_error) => {
107            errors.push(CsvError {
108                record: None,
109                record_num: 0,
110                error: convert_csv_error(&csv_error),
111            });
112            return Ok(ValidationResult {
113                errors,
114                halted: true,
115            });
116        }
117    }
118
119    // Read remaining records
120    loop {
121        match csv_reader.read_record(&mut string_record) {
122            Ok(has_record) => {
123                if !has_record {
124                    break; // End of file
125                }
126
127                record_num += 1;
128
129                // Validate record format (quotes, escaping, etc.)
130                if !lazy_quotes {
131                    validate_record_format(&string_record, record_num + 1, &mut errors);
132                }
133
134                // Check field count consistency
135                if let Some(expected_len) = header_len {
136                    if string_record.len() != expected_len {
137                        errors.push(CsvError {
138                            record: Some(string_record.iter().map(|s| s.to_string()).collect()),
139                            record_num: record_num + 1, // +1 because we want to report 1-indexed record numbers including the header
140                            error: CsvErrorKind::FieldCount,
141                        });
142                    }
143                }
144            }
145            Err(csv_error) => {
146                // Convert csv::Error to our error types
147                let error_kind = convert_csv_error(&csv_error);
148
149                errors.push(CsvError {
150                    record: None,
151                    record_num: record_num + 1,
152                    error: error_kind,
153                });
154
155                // For serious parse errors, we should halt
156                let halted = matches!(
157                    csv_error.kind(),
158                    csv::ErrorKind::Io(_) | csv::ErrorKind::Utf8 { .. }
159                );
160
161                return Ok(ValidationResult { errors, halted });
162            }
163        }
164    }
165
166    Ok(ValidationResult {
167        errors,
168        halted: false,
169    })
170}
171
172/// Validates line endings according to RFC 4180 (requires CRLF)
173fn validate_line_endings(content: &[u8], errors: &mut Vec<CsvError>) {
174    let mut line_num = 1;
175    let mut i = 0;
176
177    while i < content.len() {
178        if content[i] == b'\n' {
179            // Found LF, check if it's preceded by CR
180            if i == 0 || content[i - 1] != b'\r' {
181                errors.push(CsvError {
182                    record: None,
183                    record_num: line_num,
184                    error: CsvErrorKind::InvalidLineEnding,
185                });
186            }
187            line_num += 1;
188        } else if content[i] == b'\r' {
189            // Found CR, check if it's followed by LF
190            if i + 1 >= content.len() || content[i + 1] != b'\n' {
191                errors.push(CsvError {
192                    record: None,
193                    record_num: line_num,
194                    error: CsvErrorKind::InvalidLineEnding,
195                });
196            }
197        }
198        i += 1;
199    }
200}
201
/// Placeholder hook for per-record RFC 4180 checks; currently a no-op.
///
/// `validate` calls this for every parsed record when `lazy_quotes` is off.
/// It receives the already-parsed fields (a `StringRecord`), not the raw CSV
/// bytes. All three parameters are unused, hence the leading underscores, and
/// nothing is ever reported through `_errors`.
///
/// NOTE(review): `_errors` is a slice, so a real implementation could not
/// append to it; the parameter type would have to change once checks are added.
fn validate_record_format(_record: &StringRecord, _record_num: usize, _errors: &mut [CsvError]) {
    // Intentionally empty: field-count and line-ending checks live in
    // `validate` and `validate_line_endings`, and quote handling is left to
    // the CSV parser.
    //
    // Additional validation could be added here for specific RFC 4180
    // requirements that the CSV parser doesn't enforce.
}
214
215/// Converts csv crate errors to our error types
216fn convert_csv_error(csv_error: &csv::Error) -> CsvErrorKind {
217    match csv_error.kind() {
218        csv::ErrorKind::UnequalLengths { .. } => CsvErrorKind::FieldCount,
219        csv::ErrorKind::Utf8 { .. } => CsvErrorKind::Utf8(csv_error.to_string()),
220        csv::ErrorKind::Io(_) => CsvErrorKind::Io(csv_error.to_string()),
221        _ => {
222            // For parse errors, try to determine the specific type
223            let error_msg = csv_error.to_string().to_lowercase();
224            if error_msg.contains("bare") {
225                CsvErrorKind::BareQuote
226            } else if error_msg.contains("quote") || error_msg.contains("unterminated") {
227                if error_msg.contains("unterminated") {
228                    CsvErrorKind::UnterminatedQuote
229                } else {
230                    CsvErrorKind::Quote
231                }
232            } else {
233                CsvErrorKind::InvalidEscape
234            }
235        }
236    }
237}
238
// Unit tests for `validate` plus file-based integration tests that read
// fixtures from `test_data/` (paths are relative to the test's working
// directory).
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Cursor;

    /// A well-formed CRLF file produces no errors and is not halted.
    #[test]
    fn test_perfect_csv() {
        let csv_data = "field1,field2,field3\r\na,b,c\r\nd,e,f\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert!(result.errors.is_empty());
        assert!(!result.halted);
    }

    /// An extra field in the second data record is reported as record #2,
    /// together with the offending record's fields.
    #[test]
    fn test_field_count_error() {
        let csv_data = "field1,field2,field3\r\na,b,c\r\nd,e,f,g\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert_eq!(result.errors.len(), 1);
        assert_eq!(result.errors[0].record_num, 2);
        assert_eq!(result.errors[0].error, CsvErrorKind::FieldCount);
        assert_eq!(
            result.errors[0].record,
            Some(vec![
                "d".to_string(),
                "e".to_string(),
                "f".to_string(),
                "g".to_string()
            ])
        );
    }

    /// With `rfc4180_mode` on, LF-only line endings are reported.
    #[test]
    fn test_line_ending_validation() {
        let csv_data = "field1,field2,field3\na,b,c\nd,e,f\n"; // LF only, not CRLF
        let result = validate(Cursor::new(csv_data), b',', false, true).unwrap(); // RFC 4180 mode
        assert!(!result.errors.is_empty());
        assert!(
            result
                .errors
                .iter()
                .any(|e| matches!(e.error, CsvErrorKind::InvalidLineEnding))
        );
    }

    /// With `rfc4180_mode` off, LF-only line endings are not reported.
    #[test]
    fn test_lazy_quotes_allows_lf() {
        let csv_data = "field1,field2,field3\na,b,c\nd,e,f\n"; // LF only
        let result = validate(Cursor::new(csv_data), b',', true, false).unwrap(); // lazy_quotes = true, not RFC 4180
        // Line endings are only checked when rfc4180_mode is true; it is
        // false here, so no InvalidLineEnding error may appear.
        assert!(
            result
                .errors
                .iter()
                .all(|e| !matches!(e.error, CsvErrorKind::InvalidLineEnding))
        );
    }

    /// Plain well-formed input passes with quote handling enabled.
    #[test]
    fn test_csv_parser_validation() {
        // NOTE(review): despite the name, this input contains no quotes; it
        // only checks that a simple well-formed file yields no errors.
        let csv_data = "field1,field2,field3\r\na,b,c\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert!(result.errors.is_empty());
    }

    /// Quoted fields containing a doubled (escaped) quote are accepted.
    #[test]
    fn test_proper_quote_escaping() {
        let csv_data = "field1,field2,field3\r\n\"a\",\"b\"\"c\",\"d\"\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        // Printed only to aid debugging if the assertion below fails.
        for error in &result.errors {
            println!("Error: {:?}", error);
        }
        assert!(result.errors.is_empty());
    }

    /// A tab-delimited file validates cleanly when the delimiter is `\t`.
    #[test]
    fn test_different_delimiters() {
        let csv_data = "field1\tfield2\tfield3\r\na\tb\tc\r\nd\te\tf\r\n";
        let result = validate(Cursor::new(csv_data), b'\t', false, false).unwrap();
        assert!(result.errors.is_empty());
        assert!(!result.halted);
    }

    /// Each over-long record is reported separately with its own number
    /// (data records 2 and 4 here).
    #[test]
    fn test_multiple_field_count_errors() {
        let csv_data = "field1,field2,field3\r\na,b,c\r\nd,e,f,g\r\nh,i,j\r\nk,l,m,n\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert_eq!(result.errors.len(), 2);
        assert_eq!(result.errors[0].record_num, 2);
        assert_eq!(result.errors[1].record_num, 4);
    }

    /// A comma-delimited CRLF file with quoted fields passes in RFC 4180 mode.
    #[test]
    fn test_rfc4180_compliance_mode() {
        // Test strict RFC 4180 compliance (comma delimiter, CRLF line endings)
        let csv_data =
            "Name,Age,City\r\n\"John Doe\",30,\"New York\"\r\n\"Jane Smith\",25,Chicago\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, true).unwrap(); // RFC 4180 mode
        assert!(result.errors.is_empty());
        assert!(!result.halted);
    }

    /// Quoted fields may contain the delimiter, an escaped quote and an
    /// embedded CRLF without producing errors.
    #[test]
    fn test_fields_with_commas_and_quotes() {
        let csv_data = "field1,field2,field3\r\n\"a,b\",\"c\"\"d\",\"e\r\nf\"\r\n";
        let result = validate(Cursor::new(csv_data), b',', false, false).unwrap();
        assert!(result.errors.is_empty());
    }

    // Integration tests using actual test data files
    /// One fixture file together with the outcome `validate` should produce.
    struct TestCase {
        /// Path of the fixture file
        file: &'static str,
        /// Field delimiter used by the fixture
        delimiter: u8,
        /// Number of errors expected, ignoring line-ending errors
        expected_errors: usize,
        /// Expected `record_num` of each error, in order; every listed error
        /// is also required to be a `FieldCount` error
        expected_error_records: Vec<usize>,
        /// Expected value of `ValidationResult::halted`
        expected_halted: bool,
    }

    /// Runs `validate` over every fixture and compares against the table.
    #[test]
    fn integration_tests() {
        let test_cases = vec![
            TestCase {
                file: "test_data/perfect.csv",
                delimiter: b',',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/perfect_tab.csv",
                delimiter: b'\t',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/perfect_pipe.csv",
                delimiter: b'|',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/perfect_colon.csv",
                delimiter: b':',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/perfect_semicolon.csv",
                delimiter: b';',
                expected_errors: 0,
                expected_error_records: vec![],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/one_long_column.csv",
                delimiter: b',',
                expected_errors: 1,
                expected_error_records: vec![2],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/mult_long_columns.csv",
                delimiter: b',',
                expected_errors: 2,
                expected_error_records: vec![2, 4],
                expected_halted: false,
            },
            TestCase {
                file: "test_data/mult_long_columns_tabs.csv",
                delimiter: b'\t',
                expected_errors: 2,
                expected_error_records: vec![2, 4],
                expected_halted: false,
            },
        ];

        for test_case in test_cases {
            println!("Testing file: {}", test_case.file);

            let file = File::open(test_case.file)
                .unwrap_or_else(|_| panic!("Could not open test file: {}", test_case.file));

            // Use lazy quotes for existing test files to maintain compatibility
            let result = validate(file, test_case.delimiter, true, false).unwrap();

            // Filter out line ending errors for test compatibility
            // (rfc4180_mode is off above, so none are expected in the first place)
            let relevant_errors: Vec<_> = result
                .errors
                .iter()
                .filter(|e| !matches!(e.error, CsvErrorKind::InvalidLineEnding))
                .collect();

            assert_eq!(
                relevant_errors.len(),
                test_case.expected_errors,
                "Wrong number of errors for {}",
                test_case.file
            );

            assert_eq!(
                result.halted, test_case.expected_halted,
                "Wrong halted status for {}",
                test_case.file
            );

            // Indexing is safe here: the length was asserted above.
            for (i, expected_record_num) in test_case.expected_error_records.iter().enumerate() {
                assert_eq!(
                    relevant_errors[i].record_num, *expected_record_num,
                    "Wrong record number for error {} in {}",
                    i, test_case.file
                );
                assert_eq!(
                    relevant_errors[i].error,
                    CsvErrorKind::FieldCount,
                    "Wrong error type for error {} in {}",
                    i,
                    test_case.file
                );
            }
        }
    }

    /// `CsvError`'s `Display` output combines the record number with the
    /// error kind's own message.
    #[test]
    fn test_error_display() {
        let error = CsvError {
            record: Some(vec!["a".to_string(), "b".to_string(), "c".to_string()]),
            record_num: 3,
            error: CsvErrorKind::FieldCount,
        };
        assert_eq!(
            error.to_string(),
            "Record #3 has error: wrong number of fields"
        );

        let error = CsvError {
            record: Some(vec!["d".to_string(), "e".to_string(), "f".to_string()]),
            record_num: 1,
            error: CsvErrorKind::BareQuote,
        };
        assert_eq!(
            error.to_string(),
            "Record #1 has error: bare \" in non-quoted-field"
        );
    }
}