kobold_csv/dialect.rs
//! The delimited-file DIALECT layer -- what makes kobold-csv CSV-aware.
//!
//! A [`Dialect`] fixes the four parameters that decide how a flat record of string fields becomes one line
//! of delimited text and back: the field `delimiter` (`,` `|` `\t` ...), the `quote` character, whether
//! every field is quoted unconditionally (`quote_all`), and the [`LineTerminator`] written after each row.
//!
//! ## Quoting (RFC-4180 style)
//!
//! A field is QUOTED when it contains the delimiter, the quote character, a carriage return, or a line feed
//! (or always, when `quote_all`). Inside a quoted field an embedded quote is DOUBLED (`"` -> `""`). This is
//! the de-facto CSV escaping convention (RFC 4180, and what spreadsheets/`COPY ... CSV` agree on). Doing it
//! by hand -- rather than depending on a CSV crate -- keeps kobold-csv std-only and lets the escaping itself
//! be evidence (`KOBOLD.CSV.ESCAPE`): the writer and the fail-closed reader are exact inverses.
//!
//! ## Fail-closed reader
//!
//! [`parse_row`] is deliberately strict: a quote that opens a field but never closes, or stray text after a
//! closing quote, is a [`Finding`], NEVER a best-effort guess. A reconciliation tool must be able to trust
//! that a row either parsed exactly or was rejected with a reason -- silent recovery would corrupt custody.
//!
//! This module is independent of GnuCOBOL/libcob.
22
23use crate::model::Finding;
24
// The special bytes of the format are written as hex constants instead of bare literals: a comma, a
// tab, or a double quote is easy to misread in source, and hex keeps this file unambiguous and ASCII-only.

// Line-ending bytes.
const LF: u8 = 0x0A; // '\n' (line feed)
const CR: u8 = 0x0D; // '\r' (carriage return)

// The quote byte used by every built-in dialect.
const DQUOTE: u8 = 0x22; // '"'

// Field delimiters of the built-in dialects.
const COMMA: u8 = 0x2C; // ','
const PIPE: u8 = 0x7C; // '|'
const TAB: u8 = 0x09; // '\t'
33
/// Which byte sequence ends a row when a dialect writes it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineTerminator {
    /// Unix convention: a lone line feed (`\n`).
    Lf,
    /// RFC-4180 / DOS convention: carriage return followed by line feed (`\r\n`).
    CrLf,
}

impl LineTerminator {
    /// Returns the exact bytes emitted for this terminator: `\n` for [`LineTerminator::Lf`],
    /// `\r\n` for [`LineTerminator::CrLf`].
    pub fn bytes(self) -> &'static [u8] {
        const UNIX: &[u8] = &[0x0a];
        const DOS: &[u8] = &[0x0d, 0x0a];

        match self {
            Self::CrLf => DOS,
            Self::Lf => UNIX,
        }
    }
}
52
53/// A delimited-file dialect: delimiter, quote char, quote-all flag, and line terminator.
54#[derive(Debug, Clone, Copy, PartialEq, Eq)]
55pub struct Dialect {
56 /// The field delimiter byte (e.g. `,` `|` tab).
57 pub delimiter: u8,
58 /// The quote byte (conventionally `"`).
59 pub quote: u8,
60 /// When true, every field is quoted regardless of content (some downstream loaders require it).
61 pub quote_all: bool,
62 /// The terminator written between rows.
63 pub line_terminator: LineTerminator,
64}
65
66impl Dialect {
67 /// Comma-separated, double-quoted, LF terminated -- the default CSV dialect.
68 pub fn csv() -> Self {
69 Dialect { delimiter: COMMA, quote: DQUOTE, quote_all: false, line_terminator: LineTerminator::Lf }
70 }
71
72 /// Pipe-separated, double-quoted, LF terminated -- common in mainframe extract files where commas occur
73 /// inside data.
74 pub fn pipe() -> Self {
75 Dialect { delimiter: PIPE, quote: DQUOTE, quote_all: false, line_terminator: LineTerminator::Lf }
76 }
77
78 /// Tab-separated, double-quoted, LF terminated (TSV).
79 pub fn tab() -> Self {
80 Dialect { delimiter: TAB, quote: DQUOTE, quote_all: false, line_terminator: LineTerminator::Lf }
81 }
82}
83
84/// Decide whether `field` must be quoted under dialect `d`: it must when `quote_all` is set, or when the
85/// text contains the delimiter, the quote char, CR, or LF.
86fn needs_quoting(field: &str, d: &Dialect) -> bool {
87 if d.quote_all {
88 return true;
89 }
90 field.bytes().any(|b| b == d.delimiter || b == d.quote || b == CR || b == LF)
91}
92
93/// Write one `field` to `out` under dialect `d`, quoting and doubling embedded quotes as required. The
94/// inverse of [`parse_row`]'s field handling: `write_field`/`parse_row` are an exact round-trip pair.
95pub fn write_field(field: &str, d: &Dialect, out: &mut String) {
96 if !needs_quoting(field, d) {
97 out.push_str(field);
98 return;
99 }
100 out.push(d.quote as char);
101 for b in field.bytes() {
102 if b == d.quote {
103 // Double the embedded quote.
104 out.push(d.quote as char);
105 out.push(d.quote as char);
106 } else {
107 out.push(b as char);
108 }
109 }
110 out.push(d.quote as char);
111}
112
113/// Write a full `row` of fields to `out` under dialect `d`, joining with the delimiter and appending the
114/// line terminator. Does NOT add a trailing terminator beyond this row's own.
115pub fn write_row(row: &[String], d: &Dialect, out: &mut String) {
116 for (i, field) in row.iter().enumerate() {
117 if i > 0 {
118 out.push(d.delimiter as char);
119 }
120 write_field(field, d, out);
121 }
122 for &b in d.line_terminator.bytes() {
123 out.push(b as char);
124 }
125}
126
127/// `KOBOLD.CSV.ESCAPE` / parse evidence: FAIL-CLOSED parse of one `line` of delimited bytes into its fields
128/// under dialect `d`.
129///
130/// `line` is one logical record's bytes WITHOUT its line terminator (a trailing CR is tolerated and stripped
131/// to support CrLf splitting). Handles: unquoted fields, quoted fields, doubled quotes inside quoted fields,
132/// and embedded delimiters/CR/LF inside quoted fields.
133///
134/// Fails closed (returns a [`Finding`], never a best-effort row) on a quote that opens a field but never
135/// closes, or any non-delimiter text immediately after a closing quote.
136pub fn parse_row(line: &[u8], d: &Dialect) -> Result<Vec<String>, Finding> {
137 // Tolerate a single trailing CR (the case where a CrLf file was split on LF only).
138 let line = if line.last() == Some(&CR) { &line[..line.len() - 1] } else { line };
139
140 let mut fields: Vec<String> = Vec::new();
141 let mut i = 0usize;
142 let n = line.len();
143
144 loop {
145 // Parse one field starting at i.
146 let mut field = String::new();
147 if i < n && line[i] == d.quote {
148 // Quoted field.
149 i += 1; // consume opening quote
150 loop {
151 if i >= n {
152 return Err(Finding::new(
153 "CSV_UNTERMINATED_QUOTE",
154 "quoted field opened but never closed before end of line".to_string(),
155 ));
156 }
157 let b = line[i];
158 if b == d.quote {
159 // Either a doubled quote (escaped) or the closing quote.
160 if i + 1 < n && line[i + 1] == d.quote {
161 field.push(d.quote as char);
162 i += 2;
163 continue;
164 }
165 // Closing quote.
166 i += 1;
167 // Must be at end-of-line or a delimiter now; anything else is malformed.
168 if i < n && line[i] != d.delimiter {
169 return Err(Finding::new(
170 "CSV_TEXT_AFTER_QUOTE",
171 format!(
172 "unexpected byte 0x{:02x} after closing quote (field {})",
173 line[i],
174 fields.len()
175 ),
176 ));
177 }
178 break;
179 }
180 field.push(b as char);
181 i += 1;
182 }
183 } else {
184 // Unquoted field: read up to the next delimiter or end of line. A quote inside an unquoted
185 // field is taken literally (it is not a CSV special there).
186 while i < n && line[i] != d.delimiter {
187 field.push(line[i] as char);
188 i += 1;
189 }
190 }
191
192 fields.push(field);
193
194 if i >= n {
195 break;
196 }
197 // line[i] is the delimiter.
198 i += 1;
199 // A trailing delimiter at end-of-line means a final empty field.
200 if i >= n {
201 fields.push(String::new());
202 break;
203 }
204 }
205
206 Ok(fields)
207}
208
#[cfg(test)]
mod tests {
    use super::*;

    /// Render a single field under `d` and return the emitted text.
    fn field_text(field: &str, d: &Dialect) -> String {
        let mut text = String::new();
        write_field(field, d, &mut text);
        text
    }

    /// Render `cells` as one row under `d` and return the emitted text.
    fn row_text(cells: &[&str], d: &Dialect) -> String {
        let owned: Vec<String> = cells.iter().map(|cell| cell.to_string()).collect();
        let mut text = String::new();
        write_row(&owned, d, &mut text);
        text
    }

    #[test]
    fn plain_fields_unquoted() {
        let csv = Dialect::csv();
        assert_eq!(row_text(&["A", "B", "C"], &csv), "A,B,C\n");
        assert_eq!(parse_row(b"A,B,C", &csv).unwrap(), vec!["A", "B", "C"]);
    }

    #[test]
    fn field_with_delimiter_is_quoted_and_roundtrips() {
        let csv = Dialect::csv();
        let written = field_text("a,b", &csv);
        assert_eq!(written, "\"a,b\"");
        assert_eq!(parse_row(written.as_bytes(), &csv).unwrap(), vec!["a,b"]);
    }

    #[test]
    fn embedded_quote_is_doubled_and_roundtrips() {
        // KOBOLD.CSV.ESCAPE: the field holds both the delimiter and embedded double quotes.
        let csv = Dialect::csv();
        let original = "say \"hi\", now";
        let written = field_text(original, &csv);
        // Expected on the wire: "say ""hi"", now"
        assert_eq!(written, "\"say \"\"hi\"\", now\"");
        assert_eq!(parse_row(written.as_bytes(), &csv).unwrap(), vec![original]);
    }

    #[test]
    fn quote_all_quotes_plain_fields() {
        let quote_everything = Dialect { quote_all: true, ..Dialect::csv() };
        assert_eq!(field_text("AB", &quote_everything), "\"AB\"");
    }

    #[test]
    fn trailing_empty_field() {
        let csv = Dialect::csv();
        assert_eq!(parse_row(b"A,", &csv).unwrap(), vec!["A", ""]);
        assert_eq!(parse_row(b",", &csv).unwrap(), vec!["", ""]);
    }

    #[test]
    fn crlf_trailing_cr_tolerated() {
        let csv = Dialect::csv();
        assert_eq!(parse_row(b"A,B\r", &csv).unwrap(), vec!["A", "B"]);
    }

    #[test]
    fn fail_closed_unterminated_quote() {
        let csv = Dialect::csv();
        let finding = parse_row(b"\"abc", &csv).expect_err("must fail closed");
        assert_eq!(finding.code, "CSV_UNTERMINATED_QUOTE");
    }

    #[test]
    fn fail_closed_text_after_quote() {
        let csv = Dialect::csv();
        let finding = parse_row(b"\"ab\"c", &csv).expect_err("must fail closed");
        assert_eq!(finding.code, "CSV_TEXT_AFTER_QUOTE");
    }

    #[test]
    fn pipe_dialect() {
        let pipe = Dialect::pipe();
        assert_eq!(row_text(&["X", "a|b"], &pipe), "X|\"a|b\"\n");
        assert_eq!(parse_row(b"X|\"a|b\"", &pipe).unwrap(), vec!["X", "a|b"]);
    }

    #[test]
    fn tab_dialect() {
        let tsv = Dialect::tab();
        assert_eq!(tsv.delimiter, 0x09);
        assert_eq!(row_text(&["A", "B"], &tsv), "A\tB\n");
    }
}