Skip to main content

carta_readers/
csv.rs

1//! Delimiter-separated value readers (CSV and TSV).
2//!
3//! Both formats render to a single [`Block::Table`]: the first record becomes the table head and
4//! every later record a body row. The first record also fixes the column count — wider records are
5//! truncated, narrower ones padded with empty cells.
6//!
//! The two formats differ only in how records split into fields. CSV uses a comma delimiter and
//! honors double-quote quoting (a quoted field may contain the delimiter, line breaks, and `""`
//! escapes for a literal quote). TSV uses a tab delimiter and has no quoting: a double quote is an
//! ordinary character and a line break always ends a record. In both formats the spaces and tabs
//! that directly follow a delimiter are skipped (a tab is never skipped when it is itself the
//! delimiter). Both share the field-to-inlines tokenizer below.
11
12use carta_ast::{
13    Alignment, Attr, Block, Caption, Cell, ColSpec, ColWidth, Document, Inline, Row, Table,
14    TableBody, TableFoot, TableHead,
15};
16use carta_core::{Reader, ReaderOptions, Result};
17
/// Reader for comma-separated values (CSV).
///
/// The reader carries no state; its [`Reader`] implementation turns the whole input into a
/// document holding a single table.
#[derive(Clone, Copy, Debug, Default)]
pub struct CsvReader;
21
22impl Reader for CsvReader {
23    fn read(&self, input: &str, _options: &ReaderOptions) -> Result<Document> {
24        Ok(build_document(parse_records(input, ',', true)))
25    }
26}
27
28/// Splits delimiter-separated input into records of fields. With `quoting` enabled a field may be
29/// double-quoted, in which case the delimiter and line breaks are literal and `""` denotes a single
30/// quote; otherwise every character is literal and a line break always ends the record.
31pub(crate) fn parse_records(input: &str, delimiter: char, quoting: bool) -> Vec<Vec<String>> {
32    let mut records = Vec::new();
33    let mut record = Vec::new();
34    let mut field = String::new();
35    let mut chars = input
36        .strip_prefix('\u{feff}')
37        .unwrap_or(input)
38        .chars()
39        .peekable();
40
41    loop {
42        match chars.next() {
43            None => break,
44            Some('"') if quoting && field.is_empty() => {
45                read_quoted_field(&mut chars, &mut field);
46            }
47            Some(c) if c == delimiter => {
48                record.push(std::mem::take(&mut field));
49                skip_leading_blanks(&mut chars, delimiter);
50            }
51            Some('\r') => {
52                if chars.peek() == Some(&'\n') {
53                    chars.next();
54                }
55                record.push(std::mem::take(&mut field));
56                records.push(std::mem::take(&mut record));
57            }
58            Some('\n') => {
59                record.push(std::mem::take(&mut field));
60                records.push(std::mem::take(&mut record));
61            }
62            Some(c) => field.push(c),
63        }
64    }
65
66    if !field.is_empty() || !record.is_empty() {
67        record.push(field);
68        records.push(record);
69    }
70
71    records
72}
73
/// Advances `chars` past any run of spaces and tabs at its front. A character equal to
/// `delimiter` is never consumed, so a tab delimiter (or a space delimiter) is left in place for
/// the caller to see.
fn skip_leading_blanks(chars: &mut std::iter::Peekable<std::str::Chars<'_>>, delimiter: char) {
    while chars
        .next_if(|&c| c != delimiter && matches!(c, ' ' | '\t'))
        .is_some()
    {}
}
85
/// Reads the body of a double-quoted field into `field`; the opening quote must already have been
/// consumed. Two consecutive quotes append one literal `"`. A quote not followed by another quote
/// ends the field and is consumed. Reaching the end of input also ends the field.
fn read_quoted_field(chars: &mut std::iter::Peekable<std::str::Chars<'_>>, field: &mut String) {
    loop {
        match chars.next() {
            None => return,
            Some('"') => {
                if chars.next_if_eq(&'"').is_none() {
                    // Lone quote: this is the closing delimiter.
                    return;
                }
                field.push('"');
            }
            Some(other) => field.push(other),
        }
    }
}
102
103/// Assembles parsed records into a one-table document. An input with no records yields an empty
104/// document.
105pub(crate) fn build_document(records: Vec<Vec<String>>) -> Document {
106    let mut records = records.into_iter();
107    let Some(header) = records.next() else {
108        return Document::default();
109    };
110
111    let column_count = header.len();
112    let col_specs = (0..column_count)
113        .map(|_| ColSpec {
114            align: Alignment::AlignDefault,
115            width: ColWidth::ColWidthDefault,
116        })
117        .collect();
118
119    let head = TableHead {
120        attr: Attr::default(),
121        rows: vec![field_row(header, column_count)],
122    };
123    let body_rows = records
124        .map(|record| field_row(record, column_count))
125        .collect();
126    let body = TableBody {
127        attr: Attr::default(),
128        row_head_columns: 0,
129        head: Vec::new(),
130        body: body_rows,
131    };
132
133    let table = Table {
134        attr: Attr::default(),
135        caption: Caption::default(),
136        col_specs,
137        head,
138        bodies: vec![body],
139        foot: TableFoot::default(),
140    };
141
142    Document {
143        blocks: vec![Block::Table(Box::new(table))],
144        ..Default::default()
145    }
146}
147
148/// Builds one table row of exactly `column_count` cells: extra fields are dropped, missing fields
149/// are added as empty cells.
150fn field_row(fields: Vec<String>, column_count: usize) -> Row {
151    let mut cells: Vec<Cell> = fields
152        .into_iter()
153        .take(column_count)
154        .map(|field| field_cell(&field))
155        .collect();
156    while cells.len() < column_count {
157        cells.push(field_cell(""));
158    }
159    Row {
160        attr: Attr::default(),
161        cells,
162    }
163}
164
165fn field_cell(field: &str) -> Cell {
166    let inlines = field_inlines(field);
167    let content = if inlines.is_empty() {
168        Vec::new()
169    } else {
170        vec![Block::Plain(inlines)]
171    };
172    Cell {
173        attr: Attr::default(),
174        align: Alignment::AlignDefault,
175        row_span: 1,
176        col_span: 1,
177        content,
178    }
179}
180
181/// Tokenizes a field's text into inlines. Carriage returns are dropped, a single trailing line feed
182/// is discarded as a record terminator artifact, runs of non-newline whitespace become a single
183/// [`Inline::Space`], and each remaining line feed becomes an [`Inline::LineBreak`].
184fn field_inlines(field: &str) -> Vec<Inline> {
185    let cleaned: String = field.chars().filter(|&c| c != '\r').collect();
186    let cleaned = match cleaned.strip_suffix('\n') {
187        Some(trimmed) => trimmed,
188        None => &cleaned,
189    };
190
191    let mut inlines = Vec::new();
192    let mut chars = cleaned.chars().peekable();
193    while let Some(&c) = chars.peek() {
194        if is_separator(c) {
195            let mut newlines = 0;
196            while let Some(&w) = chars.peek() {
197                if w == '\n' {
198                    newlines += 1;
199                    chars.next();
200                } else if is_separator(w) {
201                    chars.next();
202                } else {
203                    break;
204                }
205            }
206            if newlines == 0 {
207                inlines.push(Inline::Space);
208            } else {
209                for _ in 0..newlines {
210                    inlines.push(Inline::LineBreak);
211                }
212            }
213        } else {
214            let mut word = String::new();
215            while let Some(&w) = chars.peek() {
216                if is_separator(w) {
217                    break;
218                }
219                word.push(w);
220                chars.next();
221            }
222            inlines.push(Inline::Str(word.into()));
223        }
224    }
225
226    inlines
227}
228
/// Reports whether `c` separates words in the field tokenizer. Only the ASCII space, the tab, and
/// the line feed qualify; anything else, including non-ASCII whitespace, counts as word content.
fn is_separator(c: char) -> bool {
    c == ' ' || c == '\t' || c == '\n'
}
234
#[cfg(test)]
mod tests {
    use super::*;

    /// Reduces inlines to their variant names so assertions can compare token shapes without
    /// depending on the string payload type.
    fn tags(inlines: &[Inline]) -> Vec<&'static str> {
        inlines
            .iter()
            .map(|inline| match inline {
                Inline::Str(_) => "Str",
                Inline::Space => "Space",
                Inline::LineBreak => "LineBreak",
                _ => "other",
            })
            .collect()
    }

    /// Shorthand for building an expected record from string literals.
    fn record(fields: &[&str]) -> Vec<String> {
        fields.iter().map(|&field| field.to_owned()).collect()
    }

    #[test]
    fn collapses_whitespace_runs_to_single_space() {
        assert_eq!(tags(&field_inlines("x  y")), ["Str", "Space", "Str"]);
        assert_eq!(tags(&field_inlines("x\ty")), ["Str", "Space", "Str"]);
    }

    #[test]
    fn keeps_leading_and_trailing_space_around_words() {
        assert_eq!(tags(&field_inlines(" x ")), ["Space", "Str", "Space"]);
    }

    #[test]
    fn pure_whitespace_field_is_one_space() {
        assert_eq!(tags(&field_inlines("   ")), ["Space"]);
    }

    #[test]
    fn embedded_newlines_become_line_breaks() {
        assert_eq!(tags(&field_inlines("x\ny")), ["Str", "LineBreak", "Str"]);
        assert_eq!(
            tags(&field_inlines("x\n\ny")),
            ["Str", "LineBreak", "LineBreak", "Str"]
        );
    }

    #[test]
    fn single_trailing_newline_is_dropped() {
        assert!(field_inlines("\n").is_empty());
        assert_eq!(tags(&field_inlines(" \n")), ["Space"]);
        assert_eq!(tags(&field_inlines("\n ")), ["LineBreak"]);
    }

    #[test]
    fn carriage_returns_are_removed() {
        assert_eq!(tags(&field_inlines("x\ry")), ["Str"]);
        assert_eq!(tags(&field_inlines("x\r\ny")), ["Str", "LineBreak", "Str"]);
    }

    #[test]
    fn non_ascii_whitespace_stays_in_word() {
        assert_eq!(tags(&field_inlines("x\u{a0}y")), ["Str"]);
    }

    #[test]
    fn quoting_protects_delimiter_and_escapes_quote() {
        let records = parse_records("\"a,b\",\"c\"\"d\"\n", ',', true);
        assert_eq!(records, vec![record(&["a,b", "c\"d"])]);
    }

    #[test]
    fn quoted_field_may_span_lines() {
        let records = parse_records("\"a\nb\",c\n", ',', true);
        assert_eq!(records, vec![record(&["a\nb", "c"])]);
    }

    #[test]
    fn tab_records_keep_quotes_literal() {
        let records = parse_records("\"a\"\tb\n", '\t', false);
        assert_eq!(records, vec![record(&["\"a\"", "b"])]);
    }

    #[test]
    fn leading_blanks_after_delimiter_are_skipped() {
        let records = parse_records("a,  b,\tc\n", ',', true);
        assert_eq!(records, vec![record(&["a", "b", "c"])]);
    }

    #[test]
    fn first_field_keeps_leading_blanks() {
        let records = parse_records(" a,b\n", ',', true);
        assert_eq!(records, vec![record(&[" a", "b"])]);
    }

    #[test]
    fn trailing_delimiter_yields_trailing_empty_field() {
        // With and without a final line break the last, empty field must survive.
        assert_eq!(parse_records("a,\n", ',', true), vec![record(&["a", ""])]);
        assert_eq!(parse_records("a,", ',', true), vec![record(&["a", ""])]);
    }

    #[test]
    fn crlf_and_bare_lf_both_end_records() {
        // Compare full contents, not just the count, so a leftover `\r` or a mis-split field is
        // caught as well.
        let records = parse_records("a,b\r\nc,d\ne,f", ',', true);
        assert_eq!(
            records,
            vec![
                record(&["a", "b"]),
                record(&["c", "d"]),
                record(&["e", "f"]),
            ]
        );
    }

    #[test]
    fn rows_are_fitted_to_the_column_count() {
        let padded = field_row(record(&["a"]), 3);
        assert_eq!(padded.cells.len(), 3);
        assert!(!padded.cells[0].content.is_empty());
        assert!(padded.cells[1].content.is_empty());
        assert!(padded.cells[2].content.is_empty());

        let truncated = field_row(record(&["a", "b", "c"]), 2);
        assert_eq!(truncated.cells.len(), 2);
    }

    #[test]
    fn empty_input_yields_empty_document() {
        assert!(
            build_document(parse_records("", ',', true))
                .blocks
                .is_empty()
        );
    }

    #[test]
    fn leading_byte_order_mark_is_stripped() {
        let records = parse_records("\u{feff}a,b\n", ',', true);
        assert_eq!(records, vec![record(&["a", "b"])]);
    }
}
341}