simd_csv/
reader.rs

1use std::io::{BufRead, BufReader, Read};
2
3use crate::core::{CoreReader, ReadResult};
4use crate::error::{self, Error};
5use crate::records::{ByteRecord, ByteRecordBuilder};
6use crate::utils::trim_bom;
7
/// Configuration used to create a [`Reader`].
///
/// Setters take and return `&mut Self` so they can be chained before calling
/// [`ReaderBuilder::from_reader`].
pub struct ReaderBuilder {
    /// Delimiter byte handed to the core parser.
    delimiter: u8,
    /// Quote byte handed to the core parser.
    quote: u8,
    /// Requested size of the read buffer, in bytes; `None` keeps the
    /// `BufReader` default.
    buffer_capacity: Option<usize>,
    /// When `true`, records may have a different number of fields than the
    /// first record.
    flexible: bool,
    /// When `false`, the first record is also yielded as a regular record.
    has_headers: bool,
}
15
16impl Default for ReaderBuilder {
17    fn default() -> Self {
18        Self {
19            delimiter: b',',
20            quote: b'"',
21            buffer_capacity: None,
22            flexible: false,
23            has_headers: true,
24        }
25    }
26}
27
28impl ReaderBuilder {
29    pub fn new() -> Self {
30        Self::default()
31    }
32
33    pub fn with_capacity(capacity: usize) -> Self {
34        let mut reader = Self::default();
35        reader.buffer_capacity(capacity);
36        reader
37    }
38
39    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
40        self.delimiter = delimiter;
41        self
42    }
43
44    pub fn quote(&mut self, quote: u8) -> &mut Self {
45        self.quote = quote;
46        self
47    }
48
49    pub fn buffer_capacity(&mut self, capacity: usize) -> &mut Self {
50        self.buffer_capacity = Some(capacity);
51        self
52    }
53
54    pub fn flexible(&mut self, yes: bool) -> &mut Self {
55        self.flexible = yes;
56        self
57    }
58
59    pub fn has_headers(&mut self, yes: bool) -> &mut Self {
60        self.has_headers = yes;
61        self
62    }
63
64    fn bufreader<R: Read>(&self, reader: R) -> BufReader<R> {
65        match self.buffer_capacity {
66            None => BufReader::new(reader),
67            Some(capacity) => BufReader::with_capacity(capacity, reader),
68        }
69    }
70
71    pub fn from_reader<R: Read>(&self, reader: R) -> Reader<R> {
72        Reader {
73            buffer: self.bufreader(reader),
74            inner: CoreReader::new(self.delimiter, self.quote),
75            flexible: self.flexible,
76            headers: ByteRecord::new(),
77            has_read: false,
78            must_reemit_headers: !self.has_headers,
79        }
80    }
81}
82
83pub struct Reader<R> {
84    buffer: BufReader<R>,
85    inner: CoreReader,
86    flexible: bool,
87    headers: ByteRecord,
88    has_read: bool,
89    must_reemit_headers: bool,
90}
91
92impl<R: Read> Reader<R> {
93    pub fn from_reader(reader: R) -> Self {
94        ReaderBuilder::new().from_reader(reader)
95    }
96
97    #[inline]
98    fn check_field_count(&mut self, written: usize) -> error::Result<()> {
99        if self.flexible {
100            return Ok(());
101        }
102
103        if self.has_read && written != self.headers.len() {
104            return Err(Error::unequal_lengths(self.headers.len(), written));
105        }
106
107        Ok(())
108    }
109
110    fn read_byte_record_impl(&mut self, record: &mut ByteRecord) -> error::Result<bool> {
111        use ReadResult::*;
112
113        record.clear();
114
115        let mut record_builder = ByteRecordBuilder::wrap(record);
116
117        loop {
118            let input = self.buffer.fill_buf()?;
119
120            let (result, pos) = self.inner.read_record(input, &mut record_builder);
121
122            self.buffer.consume(pos);
123
124            match result {
125                End => {
126                    return Ok(false);
127                }
128                Cr | Lf | InputEmpty => {
129                    continue;
130                }
131                Record => {
132                    self.check_field_count(record.len())?;
133                    return Ok(true);
134                }
135            };
136        }
137    }
138
139    #[inline]
140    fn on_first_read(&mut self) -> error::Result<()> {
141        if self.has_read {
142            return Ok(());
143        }
144
145        // Trimming BOM
146        let input = self.buffer.fill_buf()?;
147        let bom_len = trim_bom(input);
148        self.buffer.consume(bom_len);
149
150        // Reading headers
151        let mut headers = ByteRecord::new();
152
153        let has_data = self.read_byte_record_impl(&mut headers)?;
154
155        if !has_data {
156            self.must_reemit_headers = false;
157        }
158
159        self.headers = headers;
160        self.has_read = true;
161
162        Ok(())
163    }
164
165    #[inline]
166    pub fn byte_headers(&mut self) -> error::Result<&ByteRecord> {
167        self.on_first_read()?;
168
169        Ok(&self.headers)
170    }
171
172    #[inline(always)]
173    pub fn read_byte_record(&mut self, record: &mut ByteRecord) -> error::Result<bool> {
174        self.on_first_read()?;
175
176        if self.must_reemit_headers {
177            self.headers.clone_into(record);
178            self.must_reemit_headers = false;
179            return Ok(true);
180        }
181
182        self.read_byte_record_impl(record)
183    }
184
185    pub fn byte_records(&mut self) -> ByteRecordsIter<'_, R> {
186        ByteRecordsIter {
187            reader: self,
188            record: ByteRecord::new(),
189        }
190    }
191
192    pub fn into_byte_records(self) -> ByteRecordsIntoIter<R> {
193        ByteRecordsIntoIter {
194            reader: self,
195            record: ByteRecord::new(),
196        }
197    }
198}
199
200pub struct ByteRecordsIter<'r, R> {
201    reader: &'r mut Reader<R>,
202    record: ByteRecord,
203}
204
205impl<'r, R: Read> Iterator for ByteRecordsIter<'r, R> {
206    type Item = error::Result<ByteRecord>;
207
208    #[inline]
209    fn next(&mut self) -> Option<Self::Item> {
210        // NOTE: cloning the record will not carry over excess capacity
211        // because the record only contains `Vec` currently.
212        match self.reader.read_byte_record(&mut self.record) {
213            Err(err) => Some(Err(err)),
214            Ok(true) => Some(Ok(self.record.clone())),
215            Ok(false) => None,
216        }
217    }
218}
219
220pub struct ByteRecordsIntoIter<R> {
221    reader: Reader<R>,
222    record: ByteRecord,
223}
224
225impl<R: Read> Iterator for ByteRecordsIntoIter<R> {
226    type Item = error::Result<ByteRecord>;
227
228    #[inline]
229    fn next(&mut self) -> Option<Self::Item> {
230        // NOTE: cloning the record will not carry over excess capacity
231        // because the record only contains `Vec` currently.
232        match self.reader.read_byte_record(&mut self.record) {
233            Err(err) => Some(Err(err)),
234            Ok(true) => Some(Ok(self.record.clone())),
235            Ok(false) => None,
236        }
237    }
238}
239
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use crate::brec;

    use super::*;

    impl<R: Read> Reader<R> {
        /// Test helper: a reader that also yields the first row as data.
        fn from_reader_no_headers(reader: R) -> Self {
            let mut builder = ReaderBuilder::new();
            builder.has_headers(false);
            builder.from_reader(reader)
        }
    }

    /// Quoting, escaped quotes, a blank line and a final CRLF must parse the
    /// same way whatever the buffer capacity is.
    #[test]
    fn test_read_byte_record() -> error::Result<()> {
        let csv = concat!(
            "name,surname,age\n",
            "\"john\",\"landy, the \"\"everlasting\"\" bastard\",45\n",
            "\"\"\"ok\"\"\",whatever,dude\n",
            "lucy,rose,\"67\"\n",
            "jermaine,jackson,\"89\"\n",
            "\n",
            "karine,loucan,\"52\"\n",
            "rose,\"glib\",12\n",
            "\"guillaume\",\"plique\",\"42\"\r\n",
        );

        let expected = vec![
            brec!["name", "surname", "age"],
            brec!["john", "landy, the \"everlasting\" bastard", "45"],
            brec!["\"ok\"", "whatever", "dude"],
            brec!["lucy", "rose", "67"],
            brec!["jermaine", "jackson", "89"],
            brec!["karine", "loucan", "52"],
            brec!["rose", "glib", "12"],
            brec!["guillaume", "plique", "42"],
        ];

        for capacity in [32usize, 4, 3, 2, 1] {
            let mut reader = ReaderBuilder::with_capacity(capacity)
                .has_headers(false)
                .from_reader(Cursor::new(csv));

            let records = reader.byte_records().collect::<Result<Vec<_>, _>>()?;

            assert_eq!(records, expected);
        }

        Ok(())
    }

    /// A leading UTF-8 BOM must not end up in the first field.
    #[test]
    fn test_strip_bom() -> error::Result<()> {
        let expected = brec!["name", "surname", "age"];

        // Without BOM
        let mut plain = Reader::from_reader_no_headers(Cursor::new("name,surname,age"));
        let first = plain.byte_records().next().unwrap()?;

        assert_eq!(first, expected);

        // With BOM
        let mut with_bom =
            Reader::from_reader_no_headers(Cursor::new(b"\xef\xbb\xbfname,surname,age"));
        let first = with_bom.byte_records().next().unwrap()?;

        assert_eq!(first, expected);

        Ok(())
    }

    /// Quoted empty fields are records of their own, even at the end of input.
    #[test]
    fn test_empty_row() -> error::Result<()> {
        let input = "name\n\"\"\nlucy\n\"\"";

        let records = Reader::from_reader_no_headers(Cursor::new(input))
            .into_byte_records()
            .collect::<Result<Vec<_>, _>>()?;

        assert_eq!(
            records,
            vec![brec!["name"], brec![""], brec!["lucy"], brec![""]]
        );

        Ok(())
    }

    /// CRLF line endings are handled like LF ones.
    #[test]
    fn test_crlf() -> error::Result<()> {
        let input = "name,surname\r\nlucy,\"john\"\r\nevan,zhong\r\nbéatrice,glougou\r\n";

        let records = Reader::from_reader_no_headers(Cursor::new(input))
            .into_byte_records()
            .collect::<Result<Vec<_>, _>>()?;

        let expected = vec![
            brec!["name", "surname"],
            brec!["lucy", "john"],
            brec!["evan", "zhong"],
            brec!["béatrice", "glougou"],
        ];

        assert_eq!(records, expected);

        Ok(())
    }

    /// Input where every field is quoted.
    #[test]
    fn test_quote_always() -> error::Result<()> {
        let input = "\"name\",\"surname\"\n\"lucy\",\"rose\"\n\"john\",\"mayhew\"";

        let records = Reader::from_reader_no_headers(Cursor::new(input))
            .into_byte_records()
            .collect::<Result<Vec<_>, _>>()?;

        let expected = vec![
            brec!["name", "surname"],
            brec!["lucy", "rose"],
            brec!["john", "mayhew"],
        ];

        assert_eq!(records, expected);

        Ok(())
    }

    /// Headers are available before or after reading records, with or
    /// without a header row, and on empty input.
    #[test]
    fn test_byte_headers() -> error::Result<()> {
        let data = b"name,surname\njohn,dandy";
        let header_row = brec!["name", "surname"];
        let data_row = brec!["john", "dandy"];

        // Headers, call before read
        {
            let mut reader = Reader::from_reader(Cursor::new(data));
            assert_eq!(reader.byte_headers()?, &header_row);
            assert_eq!(reader.byte_records().next().unwrap()?, data_row);
        }

        // Headers, call after read
        {
            let mut reader = Reader::from_reader(Cursor::new(data));
            assert_eq!(reader.byte_records().next().unwrap()?, data_row);
            assert_eq!(reader.byte_headers()?, &header_row);
        }

        // No headers, call before read
        {
            let mut reader = Reader::from_reader_no_headers(Cursor::new(data));
            assert_eq!(reader.byte_headers()?, &header_row);
            assert_eq!(reader.byte_records().next().unwrap()?, header_row);
        }

        // No headers, call after read
        {
            let mut reader = Reader::from_reader_no_headers(Cursor::new(data));
            assert_eq!(reader.byte_records().next().unwrap()?, header_row);
            assert_eq!(reader.byte_headers()?, &header_row);
        }

        // Headers, empty
        {
            let mut reader = Reader::from_reader(Cursor::new(b""));
            assert_eq!(reader.byte_headers()?, &brec![]);
            assert!(reader.byte_records().next().is_none());
        }

        // No headers, empty
        {
            let mut reader = Reader::from_reader_no_headers(Cursor::new(b""));
            assert_eq!(reader.byte_headers()?, &brec![]);
            assert!(reader.byte_records().next().is_none());
        }

        Ok(())
    }

    /// Malformed-but-tolerated input: data after closing quotes and stray
    /// carriage returns.
    #[test]
    fn test_weirdness() -> error::Result<()> {
        // Data after quotes, before next delimiter
        let trailing_data =
            b"name,surname\n\"test\"  \"wat\", ok\ntest \"wat\",ok  \ntest,\"whatever\"  ok\n\"test\"   there,\"ok\"\r\n";

        let mut reader = Reader::from_reader_no_headers(Cursor::new(trailing_data));
        let records = reader.byte_records().collect::<Result<Vec<_>, _>>()?;

        assert_eq!(
            records,
            vec![
                brec!["name", "surname"],
                brec!["test  \"wat", " ok"],
                brec!["test \"wat", "ok  "],
                brec!["test", "whatever  ok"],
                brec!["test   there", "ok"],
            ]
        );

        // Disabled case, kept for reference: a quote in the middle of an
        // unquoted field.
        // let data = "aaa\"aaa,bbb";
        // let mut reader = Reader::from_reader_no_headers(Cursor::new(data));
        // let record = reader.byte_records().next().unwrap().unwrap();
        // assert_eq!(record, brec!["aaa\"aaa", "bbb"]);

        // Stray carriage returns between records
        let stray_cr = b"name,surname\n\r\rjohn,coucou";

        let mut reader = Reader::from_reader_no_headers(Cursor::new(stray_cr));
        let records = reader.byte_records().collect::<Result<Vec<_>, _>>()?;

        assert_eq!(
            records,
            vec![brec!["name", "surname"], brec!["john", "coucou"]]
        );

        Ok(())
    }
}