simd_csv/
total_reader.rs

1use crate::core::{CoreReader, ReadResult};
2use crate::records::{ByteRecord, ByteRecordBuilder};
3use crate::utils::trim_bom;
4
/// Builds a [`TotalReader`] with given configuration.
///
/// The default configuration uses a comma as delimiter, a double quote as
/// quote byte and interprets the first record as a header.
#[derive(Debug, Clone)]
pub struct TotalReaderBuilder {
    /// Byte separating the cells of a record.
    delimiter: u8,
    /// Byte used to quote cells.
    quote: u8,
    /// Whether the first record must be understood as a header.
    has_headers: bool,
}
11
12impl Default for TotalReaderBuilder {
13    fn default() -> Self {
14        Self {
15            delimiter: b',',
16            quote: b'"',
17            has_headers: true,
18        }
19    }
20}
21
22impl TotalReaderBuilder {
23    /// Create a new [`TotalReaderBuilder`] with default configuration.
24    pub fn new() -> Self {
25        Self::default()
26    }
27
28    /// Set the delimiter to be used by the created [`TotalReader`].
29    ///
30    /// This delimiter must be a single byte.
31    ///
32    /// Will default to a comma.
33    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
34        self.delimiter = delimiter;
35        self
36    }
37
38    /// Set the quote char to be used by the created [`TotalReader`].
39    ///
40    /// This char must be a single byte.
41    ///
42    /// Will default to a double quote.
43    pub fn quote(&mut self, quote: u8) -> &mut Self {
44        self.quote = quote;
45        self
46    }
47
48    /// Indicate whether first record must be understood as a header.
49    ///
50    /// Will default to `true`.
51    pub fn has_headers(&mut self, yes: bool) -> &mut Self {
52        self.has_headers = yes;
53        self
54    }
55
56    /// Create a [`TotalReader`] from given bytes.
57    pub fn from_bytes<'b>(&self, bytes: &'b [u8]) -> TotalReader<'b> {
58        TotalReader {
59            inner: CoreReader::new(self.delimiter, self.quote),
60            bytes,
61            pos: 0,
62            headers: ByteRecord::new(),
63            has_read: false,
64            has_headers: self.has_headers,
65        }
66    }
67}
68
/// An already configured CSV reader working on a slice of bytes or on a memory
/// map.
///
/// # Configuration
///
/// To configure a [`TotalReader`], for instance if you need a custom
/// delimiter or quote byte, or if your data has no header record, check out
/// the [`TotalReaderBuilder`].
pub struct TotalReader<'b> {
    /// Core parser, created from the configured delimiter and quote bytes.
    inner: CoreReader,
    /// The whole input being read.
    bytes: &'b [u8],
    /// Current byte offset into `bytes`.
    pos: usize,
    /// First record of the input, filled on first read.
    headers: ByteRecord,
    /// Whether the one-time first-read setup has already run.
    has_read: bool,
    /// Whether the first record must be understood as a header.
    has_headers: bool,
}
85
86impl<'b> TotalReader<'b> {
87    /// Create a new reader with default configuration using the provided byte
88    /// slice.
89    pub fn from_bytes(bytes: &'b [u8]) -> Self {
90        TotalReaderBuilder::new().from_bytes(bytes)
91    }
92
93    #[inline]
94    fn on_first_read(&mut self) {
95        if self.has_read {
96            return;
97        }
98
99        // Trimming BOM
100        let bom_len = trim_bom(self.bytes);
101        self.pos += bom_len;
102
103        // Reading headers
104        let mut headers = ByteRecord::new();
105
106        let has_data = self.read_byte_record_impl(&mut headers);
107
108        if has_data && !self.has_headers {
109            self.pos = bom_len;
110        }
111
112        self.headers = headers;
113        self.has_read = true;
114    }
115
116    /// Returns whether this reader has been configured to interpret the first
117    /// record as a header.
118    #[inline]
119    pub fn byte_headers(&mut self) -> &ByteRecord {
120        self.on_first_read();
121
122        &self.headers
123    }
124
125    /// Count the total number of records.
126    pub fn count_records(&mut self) -> u64 {
127        use ReadResult::*;
128
129        self.on_first_read();
130
131        let mut count: u64 = 0;
132
133        loop {
134            let (result, pos) = self.inner.split_record(&self.bytes[self.pos..]);
135
136            self.pos += pos;
137
138            match result {
139                End => break,
140                InputEmpty | Cr | Lf => continue,
141                Record => {
142                    count += 1;
143                }
144            };
145        }
146
147        count.saturating_sub(if self.has_headers { 1 } else { 0 })
148    }
149
150    /// Attempt to split the next CSV record and return an optional reference to
151    /// its byte slice.
152    ///
153    /// Returns `Ok(None)` when the reader is fully consumed.
154    pub fn split_record(&mut self) -> Option<&[u8]> {
155        use ReadResult::*;
156
157        self.on_first_read();
158
159        let starting_pos = self.pos;
160
161        loop {
162            let (result, pos) = self.inner.split_record(&self.bytes[self.pos..]);
163
164            self.pos += pos;
165
166            match result {
167                End => return None,
168                InputEmpty | Cr | Lf => continue,
169                Record => return Some(&self.bytes[starting_pos..self.pos]),
170            }
171        }
172    }
173
174    fn read_byte_record_impl(&mut self, record: &mut ByteRecord) -> bool {
175        use ReadResult::*;
176
177        record.clear();
178
179        let mut record_builder = ByteRecordBuilder::wrap(record);
180
181        loop {
182            let (result, pos) = self
183                .inner
184                .read_record(&self.bytes[self.pos..], &mut record_builder);
185
186            self.pos += pos;
187
188            match result {
189                End => {
190                    return false;
191                }
192                Cr | Lf | InputEmpty => {
193                    continue;
194                }
195                Record => {
196                    return true;
197                }
198            };
199        }
200    }
201
202    /// Attempt to read the next CSV record into a pre-allocated [`ByteRecord`].
203    ///
204    /// Returns a boolean indicating whether a record was actually read or if we
205    /// reached the end of the stream.
206    #[inline(always)]
207    pub fn read_byte_record(&mut self, record: &mut ByteRecord) -> bool {
208        self.on_first_read();
209        self.read_byte_record_impl(record)
210    }
211
212    /// Return an iterator yielding [`ByteRecord`] structs.
213    #[inline(always)]
214    pub fn byte_records<'r>(&'r mut self) -> ByteRecordsIter<'r, 'b> {
215        ByteRecordsIter {
216            reader: self,
217            record: ByteRecord::new(),
218        }
219    }
220
221    /// Returns the current byte offset of the reader into its byte slice.
222    #[inline(always)]
223    pub fn position(&self) -> u64 {
224        self.pos as u64
225    }
226}
227
/// Iterator over the records of a [`TotalReader`], yielding owned
/// [`ByteRecord`] structs.
///
/// Created by [`TotalReader::byte_records`].
pub struct ByteRecordsIter<'r, 'b> {
    /// Reader the records are pulled from, mutably borrowed for the lifetime
    /// of the iterator.
    reader: &'r mut TotalReader<'b>,
    /// Record reused as read target for each iteration; yielded items are
    /// clones of it.
    record: ByteRecord,
}
232
233impl Iterator for ByteRecordsIter<'_, '_> {
234    type Item = ByteRecord;
235
236    #[inline]
237    fn next(&mut self) -> Option<Self::Item> {
238        // NOTE: cloning the record will not carry over excess capacity
239        // because the record only contains `Vec` currently.
240        if self.reader.read_byte_record(&mut self.record) {
241            Some(self.record.clone())
242        } else {
243            None
244        }
245    }
246}
247
#[cfg(test)]
mod tests {
    use super::*;

    impl<'b> TotalReader<'b> {
        /// Test-only shortcut: a reader over `bytes` that does not interpret
        /// the first record as a header.
        fn from_bytes_no_headers(bytes: &'b [u8]) -> Self {
            let mut builder = TotalReaderBuilder::new();
            builder.has_headers(false);
            builder.from_bytes(bytes)
        }
    }

    /// Number of records reported by `count_records`, without headers.
    fn count_records(data: &str) -> u64 {
        TotalReader::from_bytes_no_headers(data.as_bytes()).count_records()
    }

    /// Number of records obtained by repeatedly calling `split_record`,
    /// without headers.
    fn split_records(data: &str) -> u64 {
        let mut reader = TotalReader::from_bytes_no_headers(data.as_bytes());
        let mut seen: u64 = 0;

        loop {
            if reader.split_record().is_none() {
                break seen;
            }

            seen += 1;
        }
    }

    #[test]
    fn test_count() {
        // Empty
        assert_eq!(count_records(""), 0);

        // Single cells with various empty lines
        const CASES: [&str; 10] = [
            "name\njohn\nlucy",
            "name\njohn\nlucy\n",
            "name\n\njohn\r\nlucy\n",
            "name\n\njohn\r\nlucy\n\n",
            "name\n\n\njohn\r\n\r\nlucy\n\n\n",
            "\nname\njohn\nlucy",
            "\n\nname\njohn\nlucy",
            "\r\n\r\nname\njohn\nlucy",
            "name\njohn\nlucy\r\n",
            "name\njohn\nlucy\r\n\r\n",
        ];

        for case in CASES.iter() {
            assert_eq!(count_records(case), 3, "string={:?}", case);
            assert_eq!(split_records(case), 3, "string={:?}", case);
        }
    }

    #[test]
    fn test_byte_headers() {
        let data = b"name,surname\njohn,dandy";

        // Headers, call before read
        {
            let mut reader = TotalReader::from_bytes(data);
            assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);

            let first = reader.byte_records().next().unwrap();
            assert_eq!(first, brec!["john", "dandy"]);
        }

        // Headers, call after read
        {
            let mut reader = TotalReader::from_bytes(data);

            let first = reader.byte_records().next().unwrap();
            assert_eq!(first, brec!["john", "dandy"]);

            assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);
        }

        // No headers, call before read
        {
            let mut reader = TotalReader::from_bytes_no_headers(data);
            assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);

            let first = reader.byte_records().next().unwrap();
            assert_eq!(first, brec!["name", "surname"]);
        }

        // No headers, call after read
        {
            let mut reader = TotalReader::from_bytes_no_headers(data);

            let first = reader.byte_records().next().unwrap();
            assert_eq!(first, brec!["name", "surname"]);

            assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);
        }

        // Headers, empty
        {
            let mut reader = TotalReader::from_bytes(b"");
            assert_eq!(reader.byte_headers(), &brec![]);
            assert!(reader.byte_records().next().is_none());
        }

        // No headers, empty
        {
            let mut reader = TotalReader::from_bytes_no_headers(b"");
            assert_eq!(reader.byte_headers(), &brec![]);
            assert!(reader.byte_records().next().is_none());
        }
    }
}
348}