simd_csv/
total_reader.rs

1use crate::core::{CoreReader, ReadResult};
2use crate::records::{ByteRecord, ByteRecordBuilder};
3use crate::utils::trim_bom;
4
/// Builds a [`TotalReader`] with given configuration.
///
/// The starting configuration is the one produced by the `Default`
/// implementation; each setter overrides a single option.
#[derive(Debug, Clone)]
pub struct TotalReaderBuilder {
    /// Byte separating the fields of a record.
    delimiter: u8,
    /// Byte used to quote fields.
    quote: u8,
    /// Whether the first record is interpreted as a header row.
    has_headers: bool,
}
11
12impl Default for TotalReaderBuilder {
13    fn default() -> Self {
14        Self {
15            delimiter: b',',
16            quote: b'"',
17            has_headers: true,
18        }
19    }
20}
21
22impl TotalReaderBuilder {
23    pub fn new() -> Self {
24        Self::default()
25    }
26
27    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
28        self.delimiter = delimiter;
29        self
30    }
31
32    pub fn quote(&mut self, quote: u8) -> &mut Self {
33        self.quote = quote;
34        self
35    }
36
37    pub fn has_headers(&mut self, yes: bool) -> &mut Self {
38        self.has_headers = yes;
39        self
40    }
41
42    pub fn from_bytes<'b>(&self, bytes: &'b [u8]) -> TotalReader<'b> {
43        TotalReader {
44            inner: CoreReader::new(self.delimiter, self.quote),
45            bytes,
46            pos: 0,
47            headers: ByteRecord::new(),
48            has_read: false,
49            has_headers: self.has_headers,
50        }
51    }
52}
53
54/// An already configured CSV reader working on a slice of bytes or on a memory
55/// map.
56///
57/// # Configuration
58///
59/// To configure a [`TotalReader`], if you need a custom delimiter for instance
60/// of if you want to tweak the size of the inner buffer. Check out the
61/// [`TotalReaderBuilder`].
62pub struct TotalReader<'b> {
63    inner: CoreReader,
64    bytes: &'b [u8],
65    pos: usize,
66    headers: ByteRecord,
67    has_read: bool,
68    has_headers: bool,
69}
70
71impl<'b> TotalReader<'b> {
72    pub fn from_bytes(bytes: &'b [u8]) -> Self {
73        TotalReaderBuilder::new().from_bytes(bytes)
74    }
75
76    #[inline]
77    fn on_first_read(&mut self) {
78        if self.has_read {
79            return;
80        }
81
82        // Trimming BOM
83        let bom_len = trim_bom(self.bytes);
84        self.pos += bom_len;
85
86        // Reading headers
87        let mut headers = ByteRecord::new();
88
89        let has_data = self.read_byte_record_impl(&mut headers);
90
91        if has_data && !self.has_headers {
92            self.pos = bom_len;
93        }
94
95        self.headers = headers;
96        self.has_read = true;
97    }
98
99    /// Returns whether this reader has been configured to interpret the first
100    /// record as a header.
101    #[inline]
102    pub fn byte_headers(&mut self) -> &ByteRecord {
103        self.on_first_read();
104
105        &self.headers
106    }
107
108    pub fn count_records(&mut self) -> u64 {
109        use ReadResult::*;
110
111        self.on_first_read();
112
113        let mut count: u64 = 0;
114
115        loop {
116            let (result, pos) = self.inner.split_record(&self.bytes[self.pos..]);
117
118            self.pos += pos;
119
120            match result {
121                End => break,
122                InputEmpty | Cr | Lf => continue,
123                Record => {
124                    count += 1;
125                }
126            };
127        }
128
129        count.saturating_sub(if self.has_headers { 1 } else { 0 })
130    }
131
132    pub fn split_record(&mut self) -> Option<&[u8]> {
133        use ReadResult::*;
134
135        self.on_first_read();
136
137        let starting_pos = self.pos;
138
139        loop {
140            let (result, pos) = self.inner.split_record(&self.bytes[self.pos..]);
141
142            self.pos += pos;
143
144            match result {
145                End => return None,
146                InputEmpty | Cr | Lf => continue,
147                Record => return Some(&self.bytes[starting_pos..self.pos]),
148            }
149        }
150    }
151
152    fn read_byte_record_impl(&mut self, record: &mut ByteRecord) -> bool {
153        use ReadResult::*;
154
155        record.clear();
156
157        let mut record_builder = ByteRecordBuilder::wrap(record);
158
159        loop {
160            let (result, pos) = self
161                .inner
162                .read_record(&self.bytes[self.pos..], &mut record_builder);
163
164            self.pos += pos;
165
166            match result {
167                End => {
168                    return false;
169                }
170                Cr | Lf | InputEmpty => {
171                    continue;
172                }
173                Record => {
174                    return true;
175                }
176            };
177        }
178    }
179
180    #[inline(always)]
181    pub fn read_byte_record(&mut self, record: &mut ByteRecord) -> bool {
182        self.on_first_read();
183        self.read_byte_record_impl(record)
184    }
185
186    #[inline(always)]
187    pub fn byte_records<'r>(&'r mut self) -> ByteRecordsIter<'r, 'b> {
188        ByteRecordsIter {
189            reader: self,
190            record: ByteRecord::new(),
191        }
192    }
193
194    #[inline(always)]
195    pub fn position(&self) -> u64 {
196        self.pos as u64
197    }
198}
199
200pub struct ByteRecordsIter<'r, 'b> {
201    reader: &'r mut TotalReader<'b>,
202    record: ByteRecord,
203}
204
205impl Iterator for ByteRecordsIter<'_, '_> {
206    type Item = ByteRecord;
207
208    #[inline]
209    fn next(&mut self) -> Option<Self::Item> {
210        // NOTE: cloning the record will not carry over excess capacity
211        // because the record only contains `Vec` currently.
212        if self.reader.read_byte_record(&mut self.record) {
213            Some(self.record.clone())
214        } else {
215            None
216        }
217    }
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    impl<'b> TotalReader<'b> {
        /// Test helper: reader over `bytes` whose first record is data, not
        /// headers.
        fn from_bytes_no_headers(bytes: &'b [u8]) -> Self {
            let mut builder = TotalReaderBuilder::new();
            builder.has_headers(false);
            builder.from_bytes(bytes)
        }
    }

    /// Counts the records of `data` through `TotalReader::count_records`.
    fn count_records(data: &str) -> u64 {
        TotalReader::from_bytes_no_headers(data.as_bytes()).count_records()
    }

    /// Counts the records of `data` by repeatedly calling
    /// `TotalReader::split_record`.
    fn split_records(data: &str) -> u64 {
        let mut reader = TotalReader::from_bytes_no_headers(data.as_bytes());
        let mut seen: u64 = 0;

        loop {
            if reader.split_record().is_none() {
                break seen;
            }

            seen += 1;
        }
    }

    /// Pulls the first record out of the reader's record iterator.
    fn first_record(reader: &mut TotalReader) -> ByteRecord {
        reader.byte_records().next().unwrap()
    }

    #[test]
    fn test_count() {
        // Empty
        assert_eq!(count_records(""), 0);

        // Single cells with various empty lines
        let tests = [
            "name\njohn\nlucy",
            "name\njohn\nlucy\n",
            "name\n\njohn\r\nlucy\n",
            "name\n\njohn\r\nlucy\n\n",
            "name\n\n\njohn\r\n\r\nlucy\n\n\n",
            "\nname\njohn\nlucy",
            "\n\nname\njohn\nlucy",
            "\r\n\r\nname\njohn\nlucy",
            "name\njohn\nlucy\r\n",
            "name\njohn\nlucy\r\n\r\n",
        ];

        for test in tests {
            assert_eq!(count_records(test), 3, "string={:?}", test);
            assert_eq!(split_records(test), 3, "string={:?}", test);
        }
    }

    #[test]
    fn test_byte_headers() {
        let data = b"name,surname\njohn,dandy";

        // Headers, call before read
        {
            let mut reader = TotalReader::from_bytes(data);
            assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);
            assert_eq!(first_record(&mut reader), brec!["john", "dandy"]);
        }

        // Headers, call after read
        {
            let mut reader = TotalReader::from_bytes(data);
            assert_eq!(first_record(&mut reader), brec!["john", "dandy"]);
            assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);
        }

        // No headers, call before read
        {
            let mut reader = TotalReader::from_bytes_no_headers(data);
            assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);
            assert_eq!(first_record(&mut reader), brec!["name", "surname"]);
        }

        // No headers, call after read
        {
            let mut reader = TotalReader::from_bytes_no_headers(data);
            assert_eq!(first_record(&mut reader), brec!["name", "surname"]);
            assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);
        }

        // Headers, empty
        {
            let mut reader = TotalReader::from_bytes(b"");
            assert_eq!(reader.byte_headers(), &brec![]);
            assert!(reader.byte_records().next().is_none());
        }

        // No headers, empty
        {
            let mut reader = TotalReader::from_bytes_no_headers(b"");
            assert_eq!(reader.byte_headers(), &brec![]);
            assert!(reader.byte_records().next().is_none());
        }
    }
}