csvp/
parser.rs

//! # `csv+` Parser
//!
//! A recursive descent, single-threaded CSV parser. It aims to be compliant with RFC 4180, with
//! some additional features:
//!
//! * Multiline-field support.  A `\` character preceding a newline will parse as if the newline
//!   were not there.
//! * Record comments. (A `#` at the beginning of the line comments out the whole record.)
//! * Maintains a mapping of parsed positions to their original location in the source code.
//!
//! # Terminology
//!
//! * `record` - a row of data.  Typically a single line, unless it's split across multiple lines.
//! * `field` - a single value from the record.
//!
//! ## References
//!
//! * [RFC 4180: Common Format and MIME Type for Comma-Separated Values (CSV) Files](https://www.ietf.org/rfc/rfc4180.txt)
//!
20use super::{Config, Error, Field, FieldBuilder, Record, Records, Result, SourcePosition};
21use std::{iter, str};
22
23#[derive(Debug)]
24pub struct Parser<'a> {
25    config: &'a Config,
26    chars: iter::Peekable<str::Chars<'a>>,
27    source_line: usize,
28    source_offset: isize,
29}
30
31#[derive(Debug)]
32enum FieldResult {
33    Some(Field),
34    Last(Field),
35    Eof,
36}
37
38impl FieldResult {
39    fn some<F: Into<Field>>(f: F) -> Self {
40        Self::Some(f.into())
41    }
42
43    fn last<F: Into<Field>>(f: F) -> Self {
44        Self::Last(f.into())
45    }
46}
47
48#[derive(Debug)]
49enum RecordResult {
50    Comment,
51    Eof,
52    Some(Record),
53}
54
55impl From<&mut Parser<'_>> for SourcePosition {
56    fn from(p: &mut Parser) -> Self {
57        if let Ok(o) = usize::try_from(p.source_offset) {
58            Self::new(o, p.source_line + p.config.lines_above)
59        } else {
60            panic!("Attempted to create a SourcePosition before the parser has consumed any characters.")
61        }
62    }
63}
64
/// Returns `true` when `c` ends a record; only the line feed (`'\n'`) does.
fn is_record_terminator(c: char) -> bool {
    matches!(c, '\n')
}
68
69impl<'a> Parser<'a> {
70    pub(super) fn new(input: &'a str, config: &'a Config) -> Self {
71        Parser {
72            config,
73            chars: input.chars().peekable(),
74            source_line: 0,
75            // -1 because we're going to increment as we consume characters
76            source_offset: -1,
77        }
78    }
79
80    pub(super) fn parse(&mut self) -> Result<Records> {
81        while self.chars.peek() == Some(&'\n') {
82            self.consume_char();
83        }
84
85        let mut records = vec![];
86        let mut row = 0;
87        loop {
88            match self.parse_record(row)? {
89                RecordResult::Comment => continue,
90                RecordResult::Some(r) => records.push(r),
91                RecordResult::Eof => break,
92            }
93            row += 1;
94        }
95
96        Ok(records)
97    }
98
99    fn is_field_separator(&self, c: char) -> bool {
100        c == self.config.separator
101    }
102
103    fn consume_and_ignore_line(&mut self) {
104        loop {
105            match self.consume_char() {
106                Some('\n') | None => break,
107                Some(_) => continue,
108            }
109        }
110    }
111
112    fn next_is_newline(&mut self) -> bool {
113        self.chars.peek() == Some(&'\n')
114    }
115
116    fn parse_record(&mut self, row: usize) -> Result<RecordResult> {
117        if let Some('#') = self.chars.peek() {
118            self.consume_and_ignore_line();
119            return Ok(RecordResult::Comment);
120        }
121
122        let mut fields = vec![];
123        let mut col = 0;
124        loop {
125            match self.parse_field(FieldBuilder::new((col, row)))? {
126                FieldResult::Some(f) => fields.push(f),
127                FieldResult::Last(f) => {
128                    fields.push(f);
129                    break;
130                }
131                FieldResult::Eof => {
132                    return Ok(if fields.is_empty() {
133                        RecordResult::Eof
134                    } else {
135                        RecordResult::Some(fields)
136                    })
137                }
138            };
139            col += 1;
140        }
141
142        Ok(RecordResult::Some(fields))
143    }
144
145    fn consume_whitespace(&mut self) {
146        while let Some(c) = self.chars.peek() {
147            if !is_record_terminator(*c) && c.is_whitespace() {
148                self.consume_char();
149            } else {
150                break;
151            }
152        }
153    }
154
155    fn parse_field(&mut self, mut fb: FieldBuilder) -> Result<FieldResult> {
156        self.consume_whitespace();
157
158        match self.consume_char() {
159            Some(c) if self.is_field_separator(c) => Ok(FieldResult::some(fb)),
160            Some(c) if is_record_terminator(c) => Ok(FieldResult::last(fb)),
161            Some('\\') if self.next_is_newline() => {
162                // consume the newline then continue parsing this field
163                self.consume_char();
164                self.parse_field(fb)
165            }
166            // Some(c) if c.is_whitespace() => self.parse_field(fb),
167            Some('"') => Ok(self.parse_quoted_field(fb, false)?),
168            Some(c) => {
169                fb.push(c, &mut *self);
170                Ok(self.parse_unquoted_field(fb)?)
171            }
172            None => Ok(FieldResult::Eof),
173        }
174    }
175
176    fn parse_unquoted_field(&mut self, mut fb: FieldBuilder) -> Result<FieldResult> {
177        match self.consume_char() {
178            Some('\\') if self.next_is_newline() => {
179                // consume the newline then continue parsing this field
180                self.consume_char();
181                self.parse_unquoted_field(fb)
182            }
183            Some(c) if self.is_field_separator(c) => Ok(FieldResult::some(fb)),
184            Some(c) if is_record_terminator(c) => Ok(FieldResult::last(fb)),
185            Some(c) => {
186                fb.push(c, &mut *self);
187                self.parse_unquoted_field(fb)
188            }
189            None => Ok(FieldResult::last(fb)),
190        }
191    }
192
193    fn parse_quoted_field(
194        &mut self,
195        mut fb: FieldBuilder,
196        escape_mode: bool,
197    ) -> Result<FieldResult> {
198        let c = self.consume_char();
199        if escape_mode {
200            if let Some(c) = c {
201                fb.push(c, &mut *self);
202                return self.parse_quoted_field(fb, false);
203            }
204
205            return Err(self.parse_error("Expected a quoted character but got EOF"));
206        }
207
208        match c {
209            Some('"') => {
210                // is it two quotes in a row? if so it's a quoted quote
211                // TODO: or it could be a empty string? would we parse `foo,"",bar` correctly?
212                if let Some('"') = self.chars.peek() {
213                    self.parse_quoted_field(fb, true)
214                } else {
215                    // otherwise it's a terminating quote. we need to also make sure we consume any
216                    // trailing spaces and make sure there is a ','
217                    self.parse_rest_of_quoted_field(fb)
218                }
219            }
220            Some(c) => {
221                fb.push(c, &mut *self);
222                self.parse_quoted_field(fb, false)
223            }
224            None => Ok(FieldResult::some(fb)),
225        }
226    }
227
228    /// At this point we've already seen the beginning and ending quotes and have consumed a field.
229    /// But we need to consume any trailing spaces as well as the terminating comma.  Or throw an
230    /// error if it'd not there.
231    fn parse_rest_of_quoted_field(&mut self, fb: FieldBuilder) -> Result<FieldResult> {
232        loop {
233            match self.consume_char() {
234                Some(c) if c.is_whitespace() => continue,
235                Some(c) if self.is_field_separator(c) => return Ok(FieldResult::some(fb)),
236                Some(c) => {
237                    // it's not whitespace or a comma
238                    return Err(self.parse_error(format!(
239                        "Invalid trailing character after quoted string: {c}"
240                    )));
241                }
242                None => return Ok(FieldResult::last(fb)),
243            }
244        }
245    }
246
247    fn consume_char(&mut self) -> Option<char> {
248        if let Some(c) = self.chars.next() {
249            if is_record_terminator(c) {
250                self.source_line += 1;
251                self.source_offset = 0;
252            } else {
253                self.source_offset += 1;
254            }
255            Some(c)
256        } else {
257            None
258        }
259    }
260
261    fn parse_error<S: Into<String>>(&mut self, message: S) -> Error {
262        Error::ParseError {
263            bad_input: self.chars.clone().take(10).collect::<String>(),
264            message: message.into(),
265            position: self.into(),
266        }
267    }
268}
269
270/// Parse a given string and return the `Cell`s.
271///
272/// # Errors
273///
274/// Will return an `Error::ParseError` if it is unable to parse the given `input`
275pub fn parse<'a>(input: &'a str, config: &'a Config) -> Result<Records> {
276    Parser::new(input, config).parse()
277}
278
#[cfg(test)]
mod tests {
    use super::*;
    use crate::*;

    /// Parses `s` with the default configuration, panicking on a parse error.
    fn test_parse(s: &str) -> Records {
        let config = Config::default();
        parse(s, &config).unwrap()
    }

    #[test]
    fn source_position_from_parser() {
        let config = Config::default();
        let mut parser = Parser::new("foo", &config);
        parser.parse().unwrap();

        let position = SourcePosition::from(&mut parser);
        assert_eq!(position.line_number, 0);
        assert_eq!(position.line_offset, 2);
    }

    #[test]
    fn source_position_from_parser_lines_above() {
        let config = Config {
            lines_above: 100,
            ..Config::default()
        };
        let mut parser = Parser::new("foo", &config);
        parser.parse().unwrap();

        let position = SourcePosition::from(&mut parser);
        assert_eq!(position.line_number, 100);
        assert_eq!(position.line_offset, 2);
    }

    #[test]
    fn parse_simple() {
        let rows = test_parse("foo,bar,baz");

        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 3);

        // each field is three characters long and sits on the first line
        let foo = &rows[0][0];
        assert_eq!(foo.value, "foo");
        assert_eq!(foo.address, (0, 0).into());
        for (i, offset) in (0..3).enumerate() {
            assert_eq!(foo.positions[i].line_offset, offset);
            assert_eq!(foo.positions[i].line_number, 0);
        }

        let bar = &rows[0][1];
        assert_eq!(bar.value, "bar");
        assert_eq!(bar.address, (1, 0).into());
        for (i, offset) in (4..7).enumerate() {
            assert_eq!(bar.positions[i].line_offset, offset);
            assert_eq!(bar.positions[i].line_number, 0);
        }

        let baz = &rows[0][2];
        assert_eq!(baz.value, "baz");
        assert_eq!(baz.address, (2, 0).into());
        for (i, offset) in (8..11).enumerate() {
            assert_eq!(baz.positions[i].line_offset, offset);
            assert_eq!(baz.positions[i].line_number, 0);
        }
    }

    #[test]
    fn parse_empty_cell() {
        let rows = test_parse("foo,,baz");

        assert_eq!(rows.len(), 1);

        let row = &rows[0];
        assert_eq!(row.len(), 3);
        assert_eq!(row[0].value, "foo");
        assert_eq!(row[0].address, (0, 0).into());
        assert_eq!(row[1].value, "");
        assert_eq!(row[1].address, (1, 0).into());
        assert_eq!(row[2].value, "baz");
        assert_eq!(row[2].address, (2, 0).into());
    }

    #[test]
    fn parse_multiple_lines() {
        let rows = test_parse("foo,bar,baz\nfoos,bars,bazs");

        assert_eq!(rows.len(), 2);
        assert_eq!(rows[0].len(), 3);

        for (col, expected) in ["foo", "bar", "baz"].iter().enumerate() {
            assert_eq!(rows[0][col].value, *expected);
        }
        for (col, expected) in ["foos", "bars", "bazs"].iter().enumerate() {
            assert_eq!(rows[1][col].value, *expected);
        }
    }

    #[test]
    fn parse_spaces() {
        let rows = test_parse("   foo ,    bar   ,one two three");

        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 3);

        // only the first three recorded positions of each field are checked
        let foo = &rows[0][0];
        assert_eq!(foo.value, "foo");
        for (i, offset) in (3..6).enumerate() {
            assert_eq!(foo.positions[i].line_offset, offset);
        }

        let bar = &rows[0][1];
        assert_eq!(bar.value, "bar");
        for (i, offset) in (12..15).enumerate() {
            assert_eq!(bar.positions[i].line_offset, offset);
        }

        let words = &rows[0][2];
        assert_eq!(words.value, "one two three");
        for (i, offset) in (19..22).enumerate() {
            assert_eq!(words.positions[i].line_offset, offset);
        }
    }

    #[test]
    fn parse_trailing_newline() {
        let rows = test_parse("foo\nbar\n");

        assert_eq!(rows.len(), 2);
        assert_eq!(rows[0][0].address, (0, 0).into());
        assert_eq!(rows[1][0].address, (0, 1).into());
    }

    #[test]
    fn parse_leading_newline() {
        let rows = test_parse("\nfoo\nbar\n");

        assert_eq!(rows.len(), 2);
        assert_eq!(rows[0][0].address, (0, 0).into());
        assert_eq!(rows[1][0].address, (0, 1).into());
    }

    #[test]
    fn parse_windows_newline() {
        let rows = test_parse("foo\r\nbar\r\nbaz\r\n");

        assert_eq!(rows.len(), 3);
    }

    #[test]
    fn parse_quoted() {
        let rows = test_parse(r#""this, is, a, quoted, sentence",bar"#);

        assert_eq!(rows.len(), 1);

        let row = &rows[0];
        assert_eq!(row.len(), 2);
        assert_eq!(row[0].value, "this, is, a, quoted, sentence");
        assert_eq!(row[1].value, "bar");
    }

    #[test]
    fn parse_quoted_newline() {
        let rows = test_parse("\"this field \n has a newline\",bar");

        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 2);
        assert_eq!(rows[0][0].value, "this field \n has a newline");
    }

    #[test]
    fn parse_quoted_quote() {
        let rows = test_parse("\"this field has a quote \"\"\",bar");

        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 2);
        assert_eq!(rows[0][0].value, "this field has a quote \"");
    }

    #[test]
    fn parse_comment() {
        let rows = test_parse("# this is a comment\nfoo,bar\n# another comment");

        assert_eq!(rows.len(), 1);

        let row = &rows[0];
        assert_eq!(row.len(), 2);
        assert_eq!(row[0].value, "foo");
        assert_eq!(row[1].value, "bar");
    }

    #[test]
    fn parse_multiline_field() {
        let rows = test_parse("this \\\nspans \\\nmultiple lines");

        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 1);
        assert_eq!(rows[0][0].value, "this spans multiple lines");
    }

    #[test]
    fn parse_trailing_comma_newline() {
        let rows = test_parse("foo  ,\n");

        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 2);
    }

    #[test]
    fn parse_trailing_comma_no_newline() {
        let rows = test_parse(
            r"[[var=a1]]A1,foo,bar
![[f=10]],bar,=var2
foo
[[l]]test,
![[l]]test1,test2,test3,",
        );

        assert_eq!(rows.len(), 5);
    }

    #[test]
    fn parse_ending_quote() {
        let rows = test_parse("\"=profit\" ,\"=fees\"");

        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].len(), 2);
    }
}