simd_csv/
total_reader.rs

1use crate::core::{CoreReader, ReadResult};
2use crate::records::{ByteRecord, ByteRecordBuilder};
3use crate::utils::trim_bom;
4
/// Configuration used to build a [`TotalReader`].
///
/// The `Default` implementation selects a `,` delimiter, a `"` quote and
/// enables header handling.
#[derive(Debug, Clone)]
pub struct TotalReaderBuilder {
    /// Delimiter byte handed to the underlying `CoreReader`.
    delimiter: u8,
    /// Quote byte handed to the underlying `CoreReader`.
    quote: u8,
    /// Whether the first record of the data is consumed as a header row
    /// instead of being yielded as a regular record.
    has_headers: bool,
}
10
11impl Default for TotalReaderBuilder {
12    fn default() -> Self {
13        Self {
14            delimiter: b',',
15            quote: b'"',
16            has_headers: true,
17        }
18    }
19}
20
21impl TotalReaderBuilder {
22    pub fn new() -> Self {
23        Self::default()
24    }
25
26    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
27        self.delimiter = delimiter;
28        self
29    }
30
31    pub fn quote(&mut self, quote: u8) -> &mut Self {
32        self.quote = quote;
33        self
34    }
35
36    pub fn has_headers(&mut self, yes: bool) -> &mut Self {
37        self.has_headers = yes;
38        self
39    }
40
41    pub fn from_bytes<'b>(&self, bytes: &'b [u8]) -> TotalReader<'b> {
42        TotalReader {
43            inner: CoreReader::new(self.delimiter, self.quote),
44            bytes,
45            pos: 0,
46            headers: ByteRecord::new(),
47            has_read: false,
48            has_headers: self.has_headers,
49        }
50    }
51}
52
53// NOTE: a reader to be used when the whole data fits into memory or when using
54// memory maps.
55pub struct TotalReader<'b> {
56    inner: CoreReader,
57    bytes: &'b [u8],
58    pos: usize,
59    headers: ByteRecord,
60    has_read: bool,
61    has_headers: bool,
62}
63
64impl<'b> TotalReader<'b> {
65    pub fn from_bytes(bytes: &'b [u8]) -> Self {
66        TotalReaderBuilder::new().from_bytes(bytes)
67    }
68
69    #[inline]
70    fn on_first_read(&mut self) {
71        if self.has_read {
72            return;
73        }
74
75        // Trimming BOM
76        let bom_len = trim_bom(self.bytes);
77        self.pos += bom_len;
78
79        // Reading headers
80        let mut headers = ByteRecord::new();
81
82        let has_data = self.read_byte_record_impl(&mut headers);
83
84        if has_data && !self.has_headers {
85            self.pos = bom_len;
86        }
87
88        self.headers = headers;
89        self.has_read = true;
90    }
91
92    #[inline]
93    pub fn byte_headers(&mut self) -> &ByteRecord {
94        self.on_first_read();
95
96        &self.headers
97    }
98
99    pub fn count_records(&mut self) -> u64 {
100        use ReadResult::*;
101
102        self.on_first_read();
103
104        let mut count: u64 = 0;
105
106        loop {
107            let (result, pos) = self.inner.split_record(&self.bytes[self.pos..]);
108
109            self.pos += pos;
110
111            match result {
112                End => break,
113                InputEmpty | Cr | Lf => continue,
114                Record => {
115                    count += 1;
116                }
117            };
118        }
119
120        count.saturating_sub(if self.has_headers { 1 } else { 0 })
121    }
122
123    pub fn split_record(&mut self) -> Option<&[u8]> {
124        use ReadResult::*;
125
126        self.on_first_read();
127
128        let starting_pos = self.pos;
129
130        loop {
131            let (result, pos) = self.inner.split_record(&self.bytes[self.pos..]);
132
133            self.pos += pos;
134
135            match result {
136                End => return None,
137                InputEmpty | Cr | Lf => continue,
138                Record => return Some(&self.bytes[starting_pos..self.pos]),
139            }
140        }
141    }
142
143    fn read_byte_record_impl(&mut self, record: &mut ByteRecord) -> bool {
144        use ReadResult::*;
145
146        record.clear();
147
148        let mut record_builder = ByteRecordBuilder::wrap(record);
149
150        loop {
151            let (result, pos) = self
152                .inner
153                .read_record(&self.bytes[self.pos..], &mut record_builder);
154
155            self.pos += pos;
156
157            match result {
158                End => {
159                    return false;
160                }
161                Cr | Lf | InputEmpty => {
162                    continue;
163                }
164                Record => {
165                    return true;
166                }
167            };
168        }
169    }
170
171    #[inline(always)]
172    pub fn read_byte_record(&mut self, record: &mut ByteRecord) -> bool {
173        self.on_first_read();
174        self.read_byte_record_impl(record)
175    }
176
177    #[inline(always)]
178    pub fn byte_records<'r>(&'r mut self) -> ByteRecordsIter<'r, 'b> {
179        ByteRecordsIter {
180            reader: self,
181            record: ByteRecord::new(),
182        }
183    }
184
185    #[inline(always)]
186    pub fn position(&self) -> u64 {
187        self.pos as u64
188    }
189}
190
191pub struct ByteRecordsIter<'r, 'b> {
192    reader: &'r mut TotalReader<'b>,
193    record: ByteRecord,
194}
195
196impl Iterator for ByteRecordsIter<'_, '_> {
197    type Item = ByteRecord;
198
199    #[inline]
200    fn next(&mut self) -> Option<Self::Item> {
201        // NOTE: cloning the record will not carry over excess capacity
202        // because the record only contains `Vec` currently.
203        if self.reader.read_byte_record(&mut self.record) {
204            Some(self.record.clone())
205        } else {
206            None
207        }
208    }
209}
210
#[cfg(test)]
mod tests {
    use super::*;

    use crate::brec;

    impl<'b> TotalReader<'b> {
        /// Test-only shortcut: a reader configured without headers.
        fn from_bytes_no_headers(bytes: &'b [u8]) -> Self {
            let mut builder = TotalReaderBuilder::new();
            builder.has_headers(false);
            builder.from_bytes(bytes)
        }
    }

    /// Number of records reported by `count_records`, without headers.
    fn count_records(data: &str) -> u64 {
        TotalReader::from_bytes_no_headers(data.as_bytes()).count_records()
    }

    /// Number of slices yielded by `split_record`, without headers.
    fn split_records(data: &str) -> u64 {
        let mut reader = TotalReader::from_bytes_no_headers(data.as_bytes());
        let mut count: u64 = 0;

        loop {
            match reader.split_record() {
                Some(_) => count += 1,
                None => break count,
            }
        }
    }

    /// First record yielded by the reader's record iterator.
    fn first_record(reader: &mut TotalReader<'_>) -> ByteRecord {
        reader.byte_records().next().unwrap()
    }

    #[test]
    fn test_count() {
        // Empty
        assert_eq!(count_records(""), 0);

        // Single cells with various empty lines
        let cases = [
            "name\njohn\nlucy",
            "name\njohn\nlucy\n",
            "name\n\njohn\r\nlucy\n",
            "name\n\njohn\r\nlucy\n\n",
            "name\n\n\njohn\r\n\r\nlucy\n\n\n",
            "\nname\njohn\nlucy",
            "\n\nname\njohn\nlucy",
            "\r\n\r\nname\njohn\nlucy",
            "name\njohn\nlucy\r\n",
            "name\njohn\nlucy\r\n\r\n",
        ];

        for case in &cases {
            assert_eq!(count_records(case), 3, "string={:?}", case);
            assert_eq!(split_records(case), 3, "string={:?}", case);
        }
    }

    #[test]
    fn test_byte_headers() {
        let data = b"name,surname\njohn,dandy";

        // Headers, call before read
        let mut reader = TotalReader::from_bytes(data);
        assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);
        assert_eq!(first_record(&mut reader), brec!["john", "dandy"]);

        // Headers, call after read
        let mut reader = TotalReader::from_bytes(data);
        assert_eq!(first_record(&mut reader), brec!["john", "dandy"]);
        assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);

        // No headers, call before read
        let mut reader = TotalReader::from_bytes_no_headers(data);
        assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);
        assert_eq!(first_record(&mut reader), brec!["name", "surname"]);

        // No headers, call after read
        let mut reader = TotalReader::from_bytes_no_headers(data);
        assert_eq!(first_record(&mut reader), brec!["name", "surname"]);
        assert_eq!(reader.byte_headers(), &brec!["name", "surname"]);

        // Headers, empty
        let mut reader = TotalReader::from_bytes(b"");
        assert_eq!(reader.byte_headers(), &brec![]);
        assert!(reader.byte_records().next().is_none());

        // No headers, empty
        let mut reader = TotalReader::from_bytes_no_headers(b"");
        assert_eq!(reader.byte_headers(), &brec![]);
        assert!(reader.byte_records().next().is_none());
    }
}