simd_csv/
total_reader.rs

1use crate::core::{CoreReader, ReadResult};
2use crate::records::{ByteRecord, ByteRecordBuilder};
3use crate::utils::trim_bom;
4
/// Builder used to configure and create a [`TotalReader`].
///
/// The `Default` implementation selects a `,` delimiter, a `"` quote byte and
/// enables header handling.
#[derive(Debug, Clone)]
pub struct TotalReaderBuilder {
    /// Byte handed to the core reader as the cell delimiter.
    delimiter: u8,
    /// Byte handed to the core reader as the quote character.
    quote: u8,
    /// Whether the first record of the data is treated as a header row
    /// (and thus not yielded as a regular record).
    has_headers: bool,
}
10
11impl Default for TotalReaderBuilder {
12    fn default() -> Self {
13        Self {
14            delimiter: b',',
15            quote: b'"',
16            has_headers: true,
17        }
18    }
19}
20
21impl TotalReaderBuilder {
22    pub fn new() -> Self {
23        Self::default()
24    }
25
26    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
27        self.delimiter = delimiter;
28        self
29    }
30
31    pub fn quote(&mut self, quote: u8) -> &mut Self {
32        self.quote = quote;
33        self
34    }
35
36    pub fn has_headers(&mut self, yes: bool) -> &mut Self {
37        self.has_headers = yes;
38        self
39    }
40
41    pub fn from_bytes<'b>(&self, bytes: &'b [u8]) -> TotalReader<'b> {
42        TotalReader {
43            inner: CoreReader::new(self.delimiter, self.quote),
44            bytes,
45            pos: 0,
46            headers: ByteRecord::new(),
47            has_read: false,
48            has_headers: self.has_headers,
49        }
50    }
51}
52
/// Reader parsing records out of a byte slice that is entirely available up
/// front.
///
/// NOTE: a reader to be used when the whole data fits into memory or when using
/// memory maps.
pub struct TotalReader<'b> {
    /// Low-level parser, created from the configured delimiter and quote bytes.
    inner: CoreReader,
    /// The complete input being read.
    bytes: &'b [u8],
    /// Offset into `bytes` at which the next parsing call resumes.
    pos: usize,
    /// First record found in the data, filled in by the first-read setup.
    headers: ByteRecord,
    /// Whether the first-read setup (BOM + headers) has already run.
    has_read: bool,
    /// Whether the first record is a header row rather than regular data.
    has_headers: bool,
}
63
64impl<'b> TotalReader<'b> {
65    pub fn from_bytes(bytes: &'b [u8]) -> Self {
66        TotalReaderBuilder::new().from_bytes(bytes)
67    }
68
69    #[inline]
70    fn on_first_read(&mut self) {
71        if self.has_read {
72            return;
73        }
74
75        // Trimming BOM
76        let bom_len = trim_bom(self.bytes);
77        self.pos += bom_len;
78
79        // Reading headers
80        let mut headers = ByteRecord::new();
81
82        let has_data = self.read_byte_record_impl(&mut headers);
83
84        if has_data && !self.has_headers {
85            self.pos = bom_len;
86        }
87
88        self.headers = headers;
89        self.has_read = true;
90    }
91
92    #[inline]
93    pub fn byte_headers(&mut self) -> &ByteRecord {
94        self.on_first_read();
95
96        &self.headers
97    }
98
99    pub fn count_records(&mut self) -> u64 {
100        use ReadResult::*;
101
102        self.on_first_read();
103
104        let mut count: u64 = 0;
105
106        loop {
107            let (result, pos) = self.inner.split_record(&self.bytes[self.pos..]);
108
109            self.pos += pos;
110
111            match result {
112                End => break,
113                InputEmpty | Cr | Lf => continue,
114                Record => {
115                    count += 1;
116                }
117            };
118        }
119
120        count.saturating_sub(if self.has_headers { 1 } else { 0 })
121    }
122
123    pub fn split_record(&mut self) -> Option<&[u8]> {
124        use ReadResult::*;
125
126        self.on_first_read();
127
128        let starting_pos = self.pos;
129
130        loop {
131            let (result, pos) = self.inner.split_record(&self.bytes[self.pos..]);
132
133            self.pos += pos;
134
135            match result {
136                End => return None,
137                InputEmpty | Cr | Lf => continue,
138                Record => return Some(&self.bytes[starting_pos..self.pos]),
139            }
140        }
141    }
142
143    fn read_byte_record_impl(&mut self, record: &mut ByteRecord) -> bool {
144        use ReadResult::*;
145
146        record.clear();
147
148        let mut record_builder = ByteRecordBuilder::wrap(record);
149
150        loop {
151            let (result, pos) = self
152                .inner
153                .read_record(&self.bytes[self.pos..], &mut record_builder);
154
155            self.pos += pos;
156
157            match result {
158                End => {
159                    return false;
160                }
161                Cr | Lf | InputEmpty => {
162                    continue;
163                }
164                Record => {
165                    return true;
166                }
167            };
168        }
169    }
170
171    #[inline(always)]
172    pub fn read_byte_record(&mut self, record: &mut ByteRecord) -> bool {
173        self.on_first_read();
174        self.read_byte_record_impl(record)
175    }
176
177    #[inline(always)]
178    pub fn byte_records<'r>(&'r mut self) -> ByteRecordsIter<'r, 'b> {
179        ByteRecordsIter {
180            reader: self,
181            record: ByteRecord::new(),
182        }
183    }
184}
185
/// Iterator over the records of a [`TotalReader`], yielding owned
/// [`ByteRecord`] values.
pub struct ByteRecordsIter<'r, 'b> {
    /// Reader advanced on each iteration step.
    reader: &'r mut TotalReader<'b>,
    /// Scratch record refilled on each step, then cloned into the yielded item.
    record: ByteRecord,
}
190
191impl<'r, 'b> Iterator for ByteRecordsIter<'r, 'b> {
192    type Item = ByteRecord;
193
194    #[inline]
195    fn next(&mut self) -> Option<Self::Item> {
196        // NOTE: cloning the record will not carry over excess capacity
197        // because the record only contains `Vec` currently.
198        if self.reader.read_byte_record(&mut self.record) {
199            Some(self.record.clone())
200        } else {
201            None
202        }
203    }
204}
205
#[cfg(test)]
mod tests {
    use super::*;

    use crate::brec;

    impl<'b> TotalReader<'b> {
        /// Test-only shortcut: reader over `bytes` with header handling
        /// disabled.
        fn from_bytes_no_headers(bytes: &'b [u8]) -> Self {
            let mut builder = TotalReaderBuilder::new();
            builder.has_headers(false);
            builder.from_bytes(bytes)
        }
    }

    /// Counts the records of `data` through `count_records`, headers disabled.
    fn count_records(data: &str) -> u64 {
        TotalReader::from_bytes_no_headers(data.as_bytes()).count_records()
    }

    /// Counts the records of `data` by draining `split_record`, headers
    /// disabled.
    fn split_records(data: &str) -> u64 {
        let mut reader = TotalReader::from_bytes_no_headers(data.as_bytes());
        let mut total: u64 = 0;

        loop {
            if reader.split_record().is_none() {
                break total;
            }

            total += 1;
        }
    }

    #[test]
    fn test_count() {
        // Empty
        assert_eq!(count_records(""), 0);

        // Single cells with various empty lines
        let tests = [
            "name\njohn\nlucy",
            "name\njohn\nlucy\n",
            "name\n\njohn\r\nlucy\n",
            "name\n\njohn\r\nlucy\n\n",
            "name\n\n\njohn\r\n\r\nlucy\n\n\n",
            "\nname\njohn\nlucy",
            "\n\nname\njohn\nlucy",
            "\r\n\r\nname\njohn\nlucy",
            "name\njohn\nlucy\r\n",
            "name\njohn\nlucy\r\n\r\n",
        ];

        for test in &tests {
            assert_eq!(count_records(test), 3, "string={:?}", test);
            assert_eq!(split_records(test), 3, "string={:?}", test);
        }
    }

    #[test]
    fn test_byte_headers() {
        let data = b"name,surname\njohn,dandy";

        let header_row = brec!["name", "surname"];
        let data_row = brec!["john", "dandy"];

        // Headers, call before read
        {
            let mut reader = TotalReader::from_bytes(data);
            assert_eq!(reader.byte_headers(), &header_row);
            assert_eq!(reader.byte_records().next().unwrap(), data_row);
        }

        // Headers, call after read
        {
            let mut reader = TotalReader::from_bytes(data);
            assert_eq!(reader.byte_records().next().unwrap(), data_row);
            assert_eq!(reader.byte_headers(), &header_row);
        }

        // No headers, call before read
        {
            let mut reader = TotalReader::from_bytes_no_headers(data);
            assert_eq!(reader.byte_headers(), &header_row);
            assert_eq!(reader.byte_records().next().unwrap(), header_row);
        }

        // No headers, call after read
        {
            let mut reader = TotalReader::from_bytes_no_headers(data);
            assert_eq!(reader.byte_records().next().unwrap(), header_row);
            assert_eq!(reader.byte_headers(), &header_row);
        }

        // Headers, empty
        {
            let mut reader = TotalReader::from_bytes(b"");
            assert_eq!(reader.byte_headers(), &brec![]);
            assert!(reader.byte_records().next().is_none());
        }

        // No headers, empty
        {
            let mut reader = TotalReader::from_bytes_no_headers(b"");
            assert_eq!(reader.byte_headers(), &brec![]);
            assert!(reader.byte_records().next().is_none());
        }
    }
}