Skip to main content

kobold_csv/
dialect.rs

1//! The delimited-file DIALECT layer -- what makes kobold-csv CSV-aware.
2//!
3//! A [`Dialect`] fixes the four parameters that decide how a flat record of string fields becomes one line
//! of delimited text and back: the field `delimiter` (`,` `|` `\t` ...), the `quote` character, whether
//! every field is quoted unconditionally (`quote_all`), and the [`LineTerminator`] used between rows.
6//!
7//! ## Quoting (RFC-4180 style)
8//!
9//! A field is QUOTED when it contains the delimiter, the quote character, a carriage return, or a line feed
10//! (or always, when `quote_all`). Inside a quoted field an embedded quote is DOUBLED (`"` -> `""`). This is
11//! the de-facto CSV escaping convention (RFC 4180, and what spreadsheets/`COPY ... CSV` agree on). Doing it
12//! by hand -- rather than depending on a CSV crate -- keeps kobold-csv std-only and lets the escaping itself
13//! be evidence (`KOBOLD.CSV.ESCAPE`): the writer and the fail-closed reader are exact inverses.
14//!
15//! ## Fail-closed reader
16//!
17//! [`parse_row`] is deliberately strict: a quote that opens a field but never closes, or stray text after a
18//! closing quote, is a [`Finding`], NEVER a best-effort guess. A reconciliation tool must be able to trust
19//! that a row either parsed exactly or was rejected with a reason -- silent recovery would corrupt custody.
20//!
21//! This module is independent of GnuCOBOL/libcob.
22
23use crate::model::Finding;
24
// Character literals are spelled as hex byte constants where a bare literal would be visually ambiguous
// (a comma, a tab, a double quote). This keeps the source unambiguous and ASCII-only.
/// Comma: the field delimiter of the `Dialect::csv` preset.
const COMMA: u8 = 0x2c; // ','
/// Vertical bar: the field delimiter of the `Dialect::pipe` preset.
const PIPE: u8 = 0x7c; // '|'
/// Horizontal tab: the field delimiter of the `Dialect::tab` preset.
const TAB: u8 = 0x09; // '\t'
/// Double quote: the quote byte used by every built-in preset.
const DQUOTE: u8 = 0x22; // '"'
/// Carriage return: forces quoting when present in a field; a single trailing one is stripped by `parse_row`.
const CR: u8 = 0x0d; // '\r'
/// Line feed: forces quoting when present in a field.
const LF: u8 = 0x0a; // '\n'
33
/// Which byte sequence a dialect appends after each row it writes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineTerminator {
    /// Unix style: one line feed (`\n`).
    Lf,
    /// RFC-4180 / DOS style: carriage return followed by line feed (`\r\n`).
    CrLf,
}

impl LineTerminator {
    /// Returns the exact bytes that make up this terminator.
    pub fn bytes(self) -> &'static [u8] {
        const LF_ONLY: &[u8] = b"\n";
        const CR_THEN_LF: &[u8] = b"\r\n";

        match self {
            Self::Lf => LF_ONLY,
            Self::CrLf => CR_THEN_LF,
        }
    }
}
52
53/// A delimited-file dialect: delimiter, quote char, quote-all flag, and line terminator.
54#[derive(Debug, Clone, Copy, PartialEq, Eq)]
55pub struct Dialect {
56    /// The field delimiter byte (e.g. `,` `|` tab).
57    pub delimiter: u8,
58    /// The quote byte (conventionally `"`).
59    pub quote: u8,
60    /// When true, every field is quoted regardless of content (some downstream loaders require it).
61    pub quote_all: bool,
62    /// The terminator written between rows.
63    pub line_terminator: LineTerminator,
64}
65
66impl Dialect {
67    /// Comma-separated, double-quoted, LF terminated -- the default CSV dialect.
68    pub fn csv() -> Self {
69        Dialect { delimiter: COMMA, quote: DQUOTE, quote_all: false, line_terminator: LineTerminator::Lf }
70    }
71
72    /// Pipe-separated, double-quoted, LF terminated -- common in mainframe extract files where commas occur
73    /// inside data.
74    pub fn pipe() -> Self {
75        Dialect { delimiter: PIPE, quote: DQUOTE, quote_all: false, line_terminator: LineTerminator::Lf }
76    }
77
78    /// Tab-separated, double-quoted, LF terminated (TSV).
79    pub fn tab() -> Self {
80        Dialect { delimiter: TAB, quote: DQUOTE, quote_all: false, line_terminator: LineTerminator::Lf }
81    }
82}
83
84/// Decide whether `field` must be quoted under dialect `d`: it must when `quote_all` is set, or when the
85/// text contains the delimiter, the quote char, CR, or LF.
86fn needs_quoting(field: &str, d: &Dialect) -> bool {
87    if d.quote_all {
88        return true;
89    }
90    field.bytes().any(|b| b == d.delimiter || b == d.quote || b == CR || b == LF)
91}
92
93/// Write one `field` to `out` under dialect `d`, quoting and doubling embedded quotes as required. The
94/// inverse of [`parse_row`]'s field handling: `write_field`/`parse_row` are an exact round-trip pair.
95pub fn write_field(field: &str, d: &Dialect, out: &mut String) {
96    if !needs_quoting(field, d) {
97        out.push_str(field);
98        return;
99    }
100    out.push(d.quote as char);
101    for b in field.bytes() {
102        if b == d.quote {
103            // Double the embedded quote.
104            out.push(d.quote as char);
105            out.push(d.quote as char);
106        } else {
107            out.push(b as char);
108        }
109    }
110    out.push(d.quote as char);
111}
112
113/// Write a full `row` of fields to `out` under dialect `d`, joining with the delimiter and appending the
114/// line terminator. Does NOT add a trailing terminator beyond this row's own.
115pub fn write_row(row: &[String], d: &Dialect, out: &mut String) {
116    for (i, field) in row.iter().enumerate() {
117        if i > 0 {
118            out.push(d.delimiter as char);
119        }
120        write_field(field, d, out);
121    }
122    for &b in d.line_terminator.bytes() {
123        out.push(b as char);
124    }
125}
126
127/// `KOBOLD.CSV.ESCAPE` / parse evidence: FAIL-CLOSED parse of one `line` of delimited bytes into its fields
128/// under dialect `d`.
129///
130/// `line` is one logical record's bytes WITHOUT its line terminator (a trailing CR is tolerated and stripped
131/// to support CrLf splitting). Handles: unquoted fields, quoted fields, doubled quotes inside quoted fields,
132/// and embedded delimiters/CR/LF inside quoted fields.
133///
134/// Fails closed (returns a [`Finding`], never a best-effort row) on a quote that opens a field but never
135/// closes, or any non-delimiter text immediately after a closing quote.
136pub fn parse_row(line: &[u8], d: &Dialect) -> Result<Vec<String>, Finding> {
137    // Tolerate a single trailing CR (the case where a CrLf file was split on LF only).
138    let line = if line.last() == Some(&CR) { &line[..line.len() - 1] } else { line };
139
140    let mut fields: Vec<String> = Vec::new();
141    let mut i = 0usize;
142    let n = line.len();
143
144    loop {
145        // Parse one field starting at i.
146        let mut field = String::new();
147        if i < n && line[i] == d.quote {
148            // Quoted field.
149            i += 1; // consume opening quote
150            loop {
151                if i >= n {
152                    return Err(Finding::new(
153                        "CSV_UNTERMINATED_QUOTE",
154                        "quoted field opened but never closed before end of line".to_string(),
155                    ));
156                }
157                let b = line[i];
158                if b == d.quote {
159                    // Either a doubled quote (escaped) or the closing quote.
160                    if i + 1 < n && line[i + 1] == d.quote {
161                        field.push(d.quote as char);
162                        i += 2;
163                        continue;
164                    }
165                    // Closing quote.
166                    i += 1;
167                    // Must be at end-of-line or a delimiter now; anything else is malformed.
168                    if i < n && line[i] != d.delimiter {
169                        return Err(Finding::new(
170                            "CSV_TEXT_AFTER_QUOTE",
171                            format!(
172                                "unexpected byte 0x{:02x} after closing quote (field {})",
173                                line[i],
174                                fields.len()
175                            ),
176                        ));
177                    }
178                    break;
179                }
180                field.push(b as char);
181                i += 1;
182            }
183        } else {
184            // Unquoted field: read up to the next delimiter or end of line. A quote inside an unquoted
185            // field is taken literally (it is not a CSV special there).
186            while i < n && line[i] != d.delimiter {
187                field.push(line[i] as char);
188                i += 1;
189            }
190        }
191
192        fields.push(field);
193
194        if i >= n {
195            break;
196        }
197        // line[i] is the delimiter.
198        i += 1;
199        // A trailing delimiter at end-of-line means a final empty field.
200        if i >= n {
201            fields.push(String::new());
202            break;
203        }
204    }
205
206    Ok(fields)
207}
208
#[cfg(test)]
mod tests {
    use super::*;

    /// Render a single field under `d` into a fresh string.
    fn rendered_field(field: &str, d: &Dialect) -> String {
        let mut out = String::new();
        write_field(field, d, &mut out);
        out
    }

    /// Render a whole row (terminator included) under `d` into a fresh string.
    fn rendered_row(cells: &[&str], d: &Dialect) -> String {
        let owned: Vec<String> = cells.iter().map(|cell| cell.to_string()).collect();
        let mut out = String::new();
        write_row(&owned, d, &mut out);
        out
    }

    /// Fields without special characters are written bare and parse back unchanged.
    #[test]
    fn plain_fields_unquoted() {
        let d = Dialect::csv();
        assert_eq!(rendered_row(&["A", "B", "C"], &d), "A,B,C\n");
        assert_eq!(parse_row(b"A,B,C", &d).unwrap(), vec!["A", "B", "C"]);
    }

    /// A field containing the delimiter is wrapped in quotes and survives a round trip.
    #[test]
    fn field_with_delimiter_is_quoted_and_roundtrips() {
        let d = Dialect::csv();
        let text = rendered_field("a,b", &d);
        assert_eq!(text, "\"a,b\"");
        assert_eq!(parse_row(text.as_bytes(), &d).unwrap(), vec!["a,b"]);
    }

    /// KOBOLD.CSV.ESCAPE: a field containing both the delimiter and an embedded quote.
    #[test]
    fn embedded_quote_is_doubled_and_roundtrips() {
        let d = Dialect::csv();
        let field = "say \"hi\", now"; // contains a comma and double quotes
        let text = rendered_field(field, &d);
        // -> "say ""hi"", now"
        assert_eq!(text, "\"say \"\"hi\"\", now\"");
        assert_eq!(parse_row(text.as_bytes(), &d).unwrap(), vec![field]);
    }

    /// With `quote_all` set, even a field with no special characters is quoted.
    #[test]
    fn quote_all_quotes_plain_fields() {
        let d = Dialect { quote_all: true, ..Dialect::csv() };
        assert_eq!(rendered_field("AB", &d), "\"AB\"");
    }

    /// A trailing delimiter produces a final empty field.
    #[test]
    fn trailing_empty_field() {
        let d = Dialect::csv();
        assert_eq!(parse_row(b"A,", &d).unwrap(), vec!["A", ""]);
        assert_eq!(parse_row(b",", &d).unwrap(), vec!["", ""]);
    }

    /// A single trailing CR (CrLf file split on LF) is stripped before parsing.
    #[test]
    fn crlf_trailing_cr_tolerated() {
        let d = Dialect::csv();
        assert_eq!(parse_row(b"A,B\r", &d).unwrap(), vec!["A", "B"]);
    }

    /// An opening quote with no closing quote is rejected, not guessed at.
    #[test]
    fn fail_closed_unterminated_quote() {
        let d = Dialect::csv();
        let finding = parse_row(b"\"abc", &d).expect_err("must fail closed");
        assert_eq!(finding.code, "CSV_UNTERMINATED_QUOTE");
    }

    /// Text directly after a closing quote is rejected, not guessed at.
    #[test]
    fn fail_closed_text_after_quote() {
        let d = Dialect::csv();
        let finding = parse_row(b"\"ab\"c", &d).expect_err("must fail closed");
        assert_eq!(finding.code, "CSV_TEXT_AFTER_QUOTE");
    }

    /// The pipe preset quotes fields containing `|` and parses them back.
    #[test]
    fn pipe_dialect() {
        let d = Dialect::pipe();
        assert_eq!(rendered_row(&["X", "a|b"], &d), "X|\"a|b\"\n");
        assert_eq!(parse_row(b"X|\"a|b\"", &d).unwrap(), vec!["X", "a|b"]);
    }

    /// The tab preset uses byte 0x09 as its delimiter.
    #[test]
    fn tab_dialect() {
        let d = Dialect::tab();
        assert_eq!(d.delimiter, 0x09);
        assert_eq!(rendered_row(&["A", "B"], &d), "A\tB\n");
    }
}