simd_csv/
records.rs

1use std::borrow::Cow;
2use std::fmt;
3use std::hash::{Hash, Hasher};
4use std::ops::Index;
5
6use crate::debug;
7use crate::utils::{trim_trailing_crlf, unescape, unescape_to, unquoted};
8
/// A view of a CSV record into a [`ZeroCopyReader`](crate::ZeroCopyReader) buffer.
///
/// The view only borrows the record's bytes and the offsets of its
/// separators; it owns no data of its own.
pub struct ZeroCopyByteRecord<'a> {
    /// Bytes of the record, quotes and separators included.
    slice: &'a [u8],
    /// Offsets, within `slice`, of the separator found between two
    /// consecutive fields.
    seps: &'a [usize],
    /// Byte used to quote fields.
    pub(crate) quote: u8,
}
15
16impl<'a> ZeroCopyByteRecord<'a> {
17    #[inline]
18    pub(crate) fn new(slice: &'a [u8], seps: &'a [usize], quote: u8) -> Self {
19        Self {
20            slice: trim_trailing_crlf(slice),
21            seps,
22            quote,
23        }
24    }
25
26    #[inline]
27    pub(crate) fn to_parts(&self) -> (Vec<usize>, Vec<u8>) {
28        (self.seps.to_vec(), self.slice.to_vec())
29    }
30
31    /// Number of fields of the record. Cannot be less than 1 since a CSV with no
32    /// columns does not make sense.
33    #[inline(always)]
34    pub fn len(&self) -> usize {
35        // NOTE: an empty zero copy record cannot be constructed,
36        // by definition.
37        self.seps.len() + 1
38    }
39
40    /// Returns whether the record has no fields.
41    #[inline(always)]
42    pub fn is_empty(&self) -> bool {
43        false
44    }
45
46    /// Returns the underlying byte slice, delimiters and all.
47    #[inline(always)]
48    pub fn as_slice(&self) -> &[u8] {
49        self.slice
50    }
51
52    /// Returns an iterator over the record's fields, as-is.
53    ///
54    /// This means fields might or might not be quoted and
55    /// field bytes have not been unescaped at all.
56    #[inline]
57    pub fn iter(&self) -> ZeroCopyByteRecordIter<'_> {
58        ZeroCopyByteRecordIter {
59            record: self,
60            current_forward: 0,
61            current_backward: self.len(),
62        }
63    }
64
65    /// Returns an iterator over the record's fields, unquoted.
66    ///
67    /// See [`Self::unquote`] for more detail.
68    #[inline]
69    pub fn unquoted_iter(&self) -> ZeroCopyByteRecordUnquotedIter<'_> {
70        ZeroCopyByteRecordUnquotedIter {
71            record: self,
72            current_forward: 0,
73            current_backward: self.len(),
74        }
75    }
76
77    /// Returns an iterator over the record's fields, unescaped.
78    ///
79    /// See [`Self::unescape`] for more detail.
80    #[inline]
81    pub fn unescaped_iter(&self) -> ZeroCopyByteRecordUnescapedIter<'_> {
82        ZeroCopyByteRecordUnescapedIter {
83            record: self,
84            current_forward: 0,
85            current_backward: self.len(),
86        }
87    }
88
89    /// Returns the nth field of the zero copy byte record, if it is not
90    /// out-of-bounds.
91    ///
92    /// The field's bytes will be given as-is, quoted or unquoted, and won't be
93    /// unescaped at all.
94    #[inline]
95    pub fn get(&self, index: usize) -> Option<&[u8]> {
96        let len = self.seps.len();
97
98        if index > len {
99            return None;
100        }
101
102        let start = if index == 0 {
103            0
104        } else {
105            self.seps[index - 1] + 1
106        };
107
108        let end = if index == len {
109            self.slice.len()
110        } else {
111            self.seps[index]
112        };
113
114        Some(&self.slice[start..end])
115    }
116
117    /// Returns the nth field of the zero copy byte record, if it is not
118    /// out-of-bounds.
119    ///
120    /// The field's bytes will be given unquoted (i.e. without surrounding
121    /// quotes), but not unescaped (i.e. doubled double quotes will still be
122    /// there).
123    ///
124    /// The overhead vs. [`Self::get`] is only constant (we trim a leading and
125    /// trailing quote if required).
126    #[inline]
127    pub fn unquote(&self, index: usize) -> Option<&[u8]> {
128        self.get(index)
129            .map(|cell| unquoted(cell, self.quote).unwrap_or(cell))
130    }
131
132    /// Returns the nth field of the zero copy byte record, if it is not
133    /// out-of-bounds.
134    ///
135    /// The field's bytes will be completely unescaped.
136    ///
137    /// The overhead vs. [`Self::get`] is linear in the field's number of bytes.
138    ///
139    /// A [`Cow::Owned`] will be returned if the field actually needed
140    /// unescaping, else a [`Cow::Borrowed`] will be returned.
141    #[inline]
142    pub fn unescape(&self, index: usize) -> Option<Cow<[u8]>> {
143        self.unquote(index).map(|cell| {
144            if let Some(trimmed) = unquoted(cell, self.quote) {
145                unescape(trimmed, self.quote)
146            } else {
147                Cow::Borrowed(cell)
148            }
149        })
150    }
151
152    fn read_byte_record(&self, record: &mut ByteRecord) {
153        record.clear();
154
155        for cell in self.iter() {
156            if let Some(trimmed) = unquoted(cell, self.quote) {
157                unescape_to(trimmed, self.quote, &mut record.data);
158
159                let bounds_len = record.bounds.len();
160
161                let start = if bounds_len == 0 {
162                    0
163                } else {
164                    record.bounds[bounds_len - 1].1
165                };
166
167                record.bounds.push((start, record.data.len()));
168            } else {
169                record.push_field(cell);
170            }
171        }
172    }
173
174    /// Converts the zero copy byte record into a proper, owned [`ByteRecord`].
175    #[inline]
176    pub fn to_byte_record(&self) -> ByteRecord {
177        let mut record = ByteRecord::new();
178        self.read_byte_record(&mut record);
179        record
180    }
181
182    #[inline]
183    pub(crate) fn to_byte_record_in_reverse(&self) -> ByteRecord {
184        let mut record = ByteRecord::new();
185
186        for cell in self.unescaped_iter().rev() {
187            record.push_field_in_reverse(&cell);
188        }
189
190        record
191    }
192}
193
194impl fmt::Debug for ZeroCopyByteRecord<'_> {
195    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
196        write!(f, "ZeroCopyByteRecord(")?;
197        f.debug_list()
198            .entries(self.iter().map(debug::Bytes))
199            .finish()?;
200        write!(f, ")")?;
201        Ok(())
202    }
203}
204
/// Generates a double-ended, exact-size iterator over the fields of a
/// [`ZeroCopyByteRecord`].
///
/// * `$name` is the name of the generated iterator struct.
/// * `$method` is the [`ZeroCopyByteRecord`] accessor called with a field
///   index to produce each item.
/// * `$out_type` is the type of the yielded items; it may refer to the `'a`
///   lifetime of the borrowed record.
macro_rules! make_zero_copy_iterator {
    ($name:ident, $method: ident, $out_type: ty) => {
        pub struct $name<'a> {
            // Record whose fields are being iterated.
            record: &'a ZeroCopyByteRecord<'a>,
            // Index of the next field to yield from the front.
            current_forward: usize,
            // One past the index of the next field to yield from the back.
            current_backward: usize,
        }

        impl ExactSizeIterator for $name<'_> {}

        impl<'a> Iterator for $name<'a> {
            type Item = $out_type;

            #[inline]
            fn next(&mut self) -> Option<Self::Item> {
                // Both cursors meeting means every field was consumed.
                if self.current_forward == self.current_backward {
                    return None;
                }

                let index = self.current_forward;
                self.current_forward += 1;

                self.record.$method(index)
            }

            #[inline]
            fn size_hint(&self) -> (usize, Option<usize>) {
                let remaining = self.current_backward - self.current_forward;

                (remaining, Some(remaining))
            }

            #[inline]
            fn count(self) -> usize
            where
                Self: Sized,
            {
                // The remaining length is known, no need to walk the fields.
                self.len()
            }
        }

        impl DoubleEndedIterator for $name<'_> {
            #[inline]
            fn next_back(&mut self) -> Option<Self::Item> {
                if self.current_forward == self.current_backward {
                    return None;
                }

                self.current_backward -= 1;

                self.record.$method(self.current_backward)
            }
        }
    };
}
261
// Iterator over raw fields, as returned by `ZeroCopyByteRecord::get`.
make_zero_copy_iterator!(ZeroCopyByteRecordIter, get, &'a [u8]);
// Iterator over unquoted fields, as returned by `ZeroCopyByteRecord::unquote`.
make_zero_copy_iterator!(ZeroCopyByteRecordUnquotedIter, unquote, &'a [u8]);
// Iterator over unescaped fields, as returned by `ZeroCopyByteRecord::unescape`.
make_zero_copy_iterator!(ZeroCopyByteRecordUnescapedIter, unescape, Cow<'a, [u8]>);
265
impl Index<usize> for ZeroCopyByteRecord<'_> {
    type Output = [u8];

    /// Returns the raw bytes of the field at index `i`, exactly like
    /// [`ZeroCopyByteRecord::get`] would.
    ///
    /// # Panics
    ///
    /// Panics if `i` is out-of-bounds.
    #[inline]
    fn index(&self, i: usize) -> &[u8] {
        self.get(i).unwrap()
    }
}
274
/// An owned, unescaped representation of a CSV record.
#[derive(Default, Clone, Eq)]
pub struct ByteRecord {
    /// Buffer holding the bytes of the record's fields.
    data: Vec<u8>,
    /// `(start, end)` byte range of each field within `data`, `end` being
    /// exclusive.
    bounds: Vec<(usize, usize)>,
}
281
282impl ByteRecord {
283    pub fn new() -> Self {
284        Self::default()
285    }
286
287    #[inline]
288    pub fn len(&self) -> usize {
289        self.bounds.len()
290    }
291
292    #[inline]
293    pub fn is_empty(&self) -> bool {
294        self.len() == 0
295    }
296
297    #[inline]
298    pub fn clear(&mut self) {
299        self.data.clear();
300        self.bounds.clear();
301    }
302
303    #[inline]
304    pub fn truncate(&mut self, len: usize) {
305        self.bounds.truncate(len);
306
307        if let Some((_, end)) = self.bounds.last() {
308            self.data.truncate(*end);
309        } else {
310            self.data.clear();
311        }
312    }
313
314    #[inline]
315    pub fn as_slice(&self) -> &[u8] {
316        &self.data
317    }
318
319    #[inline]
320    pub fn iter(&self) -> ByteRecordIter<'_> {
321        ByteRecordIter {
322            record: self,
323            current_forward: 0,
324            current_backward: self.len(),
325        }
326    }
327
328    #[inline(always)]
329    pub fn push_field(&mut self, bytes: &[u8]) {
330        self.data.extend_from_slice(bytes);
331
332        let bounds_len = self.bounds.len();
333
334        let start = if bounds_len == 0 {
335            0
336        } else {
337            self.bounds[bounds_len - 1].1
338        };
339
340        self.bounds.push((start, self.data.len()));
341    }
342
343    #[inline]
344    fn push_field_in_reverse(&mut self, bytes: &[u8]) {
345        self.data.extend_from_slice(bytes);
346
347        let bounds_len = self.bounds.len();
348
349        let start = if bounds_len == 0 {
350            0
351        } else {
352            self.bounds[bounds_len - 1].1
353        };
354
355        let bounds = (start, self.data.len());
356        self.data[bounds.0..bounds.1].reverse();
357
358        self.bounds.push(bounds);
359    }
360
361    #[inline]
362    pub fn get(&self, index: usize) -> Option<&[u8]> {
363        self.bounds
364            .get(index)
365            .copied()
366            .map(|(start, end)| &self.data[start..end])
367    }
368
369    pub(crate) fn reverse(&mut self) {
370        self.data.reverse();
371        self.bounds.reverse();
372
373        let len = self.data.len();
374
375        for (start, end) in self.bounds.iter_mut() {
376            let new_end = len - *start;
377            let new_start = len - *end;
378
379            *start = new_start;
380            *end = new_end;
381        }
382    }
383}
384
385impl PartialEq for ByteRecord {
386    fn eq(&self, other: &Self) -> bool {
387        if self.bounds.len() != other.bounds.len() {
388            return false;
389        }
390
391        self.iter()
392            .zip(other.iter())
393            .all(|(self_cell, other_cell)| self_cell == other_cell)
394    }
395}
396
397impl Hash for ByteRecord {
398    #[inline]
399    fn hash<H: Hasher>(&self, state: &mut H) {
400        state.write_usize(self.len());
401
402        for cell in self.iter() {
403            state.write(cell);
404        }
405    }
406}
407
impl Index<usize> for ByteRecord {
    type Output = [u8];

    /// Returns the bytes of the field at index `i`, exactly like
    /// [`ByteRecord::get`] would.
    ///
    /// # Panics
    ///
    /// Panics if `i` is out-of-bounds.
    #[inline]
    fn index(&self, i: usize) -> &[u8] {
        self.get(i).unwrap()
    }
}
416
417impl<T: AsRef<[u8]>> Extend<T> for ByteRecord {
418    #[inline]
419    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
420        for x in iter {
421            self.push_field(x.as_ref());
422        }
423    }
424}
425
426impl<T: AsRef<[u8]>> FromIterator<T> for ByteRecord {
427    #[inline]
428    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
429        let mut record = Self::new();
430        record.extend(iter);
431        record
432    }
433}
434
435impl<I, T> From<I> for ByteRecord
436where
437    I: IntoIterator<Item = T>,
438    T: AsRef<[u8]>,
439{
440    fn from(value: I) -> Self {
441        let mut record = Self::new();
442
443        for cell in value.into_iter() {
444            record.push_field(cell.as_ref());
445        }
446
447        record
448    }
449}
450
451impl<'r> IntoIterator for &'r ByteRecord {
452    type IntoIter = ByteRecordIter<'r>;
453    type Item = &'r [u8];
454
455    #[inline]
456    fn into_iter(self) -> ByteRecordIter<'r> {
457        self.iter()
458    }
459}
460
461impl fmt::Debug for ByteRecord {
462    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
463        write!(f, "ByteRecord(")?;
464        f.debug_list()
465            .entries(self.iter().map(debug::Bytes))
466            .finish()?;
467        write!(f, ")")?;
468        Ok(())
469    }
470}
471
/// Iterator over the fields of a [`ByteRecord`], as created by
/// [`ByteRecord::iter`].
pub struct ByteRecordIter<'a> {
    // Record whose fields are being iterated.
    record: &'a ByteRecord,
    // Index of the next field to yield from the front.
    current_forward: usize,
    // One past the index of the next field to yield from the back.
    current_backward: usize,
}
477
478impl ExactSizeIterator for ByteRecordIter<'_> {}
479
480impl<'a> Iterator for ByteRecordIter<'a> {
481    type Item = &'a [u8];
482
483    #[inline]
484    fn next(&mut self) -> Option<Self::Item> {
485        if self.current_forward == self.current_backward {
486            None
487        } else {
488            let (start, end) = self.record.bounds[self.current_forward];
489
490            self.current_forward += 1;
491
492            Some(&self.record.data[start..end])
493        }
494    }
495
496    #[inline]
497    fn size_hint(&self) -> (usize, Option<usize>) {
498        let size = self.current_backward - self.current_forward;
499
500        (size, Some(size))
501    }
502
503    #[inline]
504    fn count(self) -> usize
505    where
506        Self: Sized,
507    {
508        self.len()
509    }
510}
511
512impl DoubleEndedIterator for ByteRecordIter<'_> {
513    #[inline]
514    fn next_back(&mut self) -> Option<Self::Item> {
515        if self.current_forward == self.current_backward {
516            None
517        } else {
518            self.current_backward -= 1;
519
520            let (start, end) = self.record.bounds[self.current_backward];
521
522            Some(&self.record.data[start..end])
523        }
524    }
525}
526
/// Helper used to fill a borrowed [`ByteRecord`] piece by piece: bytes are
/// appended to the record's buffer, then field boundaries are registered.
pub(crate) struct ByteRecordBuilder<'r> {
    // Record being filled.
    record: &'r mut ByteRecord,
    // Offset, in the record's buffer, where the field being built starts.
    start: usize,
}
531
impl<'r> ByteRecordBuilder<'r> {
    /// Wraps the given record, the first field being expected to start at
    /// offset 0.
    ///
    /// The record is not cleared here.
    // NOTE(review): presumably callers hand over an already cleared record,
    // since the start offset is reset to 0 regardless of existing data —
    // confirm against call sites.
    #[inline(always)]
    pub(crate) fn wrap(record: &'r mut ByteRecord) -> Self {
        Self { record, start: 0 }
    }

    /// Appends the given bytes to the record's buffer, without closing the
    /// field being built.
    #[inline(always)]
    pub(crate) fn extend_from_slice(&mut self, slice: &[u8]) {
        self.record.data.extend_from_slice(slice);
    }

    /// Appends a single byte to the record's buffer, without closing the
    /// field being built.
    #[inline(always)]
    pub(crate) fn push_byte(&mut self, byte: u8) {
        self.record.data.push(byte);
    }

    /// Closes the field being built: it spans from the current start offset
    /// to the end of the buffer, where the next field will then start.
    #[inline]
    pub(crate) fn finalize_field(&mut self) {
        let start = self.start;
        self.start = self.record.data.len();

        self.record.bounds.push((start, self.start));
    }

    /// Closes the last field of the record, after having dropped a trailing
    /// `\r` from the buffer if there is one.
    #[inline]
    pub(crate) fn finalize_record(&mut self) {
        if let Some(b'\r') = self.record.data.last() {
            self.record.data.pop();
        }

        self.finalize_field();
    }

    /// Closes the field being built as ending `offset` bytes past the current
    /// end of the buffer, then moves the start offset one byte further than
    /// this end.
    // NOTE(review): presumably the caller appends the corresponding bytes
    // afterwards and the skipped byte is the field delimiter — confirm
    // against the reader using this builder.
    #[inline]
    pub(crate) fn finalize_field_preemptively(&mut self, offset: usize) {
        let start = self.start;
        self.start = self.record.data.len() + offset;

        self.record.bounds.push((start, self.start));

        self.start += 1;
    }

    /// Moves the start offset one byte forward, unless it is equal to the
    /// start of the last registered field (or to 0 when no field was
    /// registered yet).
    // NOTE(review): the comparison is cast to `usize` so the increment is
    // branchless; the exact situation it guards against is not visible from
    // this file — confirm against call sites.
    #[inline(always)]
    pub(crate) fn bump(&mut self) {
        self.start +=
            (self.record.bounds.last().map(|(s, _)| *s).unwrap_or(0) != self.start) as usize;
    }
}
581
#[cfg(test)]
mod tests {
    use super::*;

    /// Fields of a zero copy record can be counted, iterated and accessed by
    /// index, and out-of-bounds accesses return `None`.
    #[test]
    fn test_zero_copy_byte_record() {
        let record = ZeroCopyByteRecord::new(b"name,surname,age", &[4, 12], b'"');

        assert_eq!(record.len(), 3);

        let expected: Vec<&[u8]> = vec![b"name", b"surname", b"age"];
        assert_eq!(record.iter().collect::<Vec<_>>(), expected);

        for (i, cell) in expected.iter().enumerate() {
            assert_eq!(record.get(i), Some(*cell));
        }

        // NOTE: 3 is the first out-of-bounds index, it must be checked so an
        // off-by-one in the bounds test cannot go unnoticed.
        assert_eq!(record.get(3), None);
        assert_eq!(record.get(4), None);
    }

    /// An owned record starts empty and gives back pushed fields in order.
    #[test]
    fn test_byte_record() {
        let mut record = ByteRecord::new();

        assert_eq!(record.len(), 0);
        assert!(record.is_empty());
        assert_eq!(record.get(0), None);

        record.push_field(b"name");
        record.push_field(b"surname");
        record.push_field(b"age");

        let expected: Vec<&[u8]> = vec![b"name", b"surname", b"age"];
        assert_eq!(record.iter().collect::<Vec<_>>(), expected);

        assert_eq!(record.get(0), Some::<&[u8]>(b"name"));
        assert_eq!(record.get(1), Some::<&[u8]>(b"surname"));
        assert_eq!(record.get(2), Some::<&[u8]>(b"age"));
        assert_eq!(record.get(3), None);
    }

    /// A record filled through the builder can still be extended afterwards,
    /// even when a trailing `\r` was dropped from its buffer.
    #[test]
    fn test_mutate_record_after_read() {
        let mut record = ByteRecord::new();
        let mut builder = ByteRecordBuilder::wrap(&mut record);
        builder.extend_from_slice(b"test\r");
        builder.finalize_record();

        assert_eq!(record.iter().collect::<Vec<_>>(), vec![b"test"]);

        record.push_field(b"next");

        assert_eq!(record.iter().collect::<Vec<_>>(), vec![b"test", b"next"]);
    }

    /// Reversing a record reverses both its fields and their bytes, and doing
    /// it twice gives back the original record.
    #[test]
    fn test_reverse_byte_record() {
        let record = brec!["name", "surname", "age"];
        let mut reversed = record.clone();
        reversed.reverse();

        assert_eq!(reversed, brec!["ega", "emanrus", "eman"]);
        reversed.reverse();
        assert_eq!(record, reversed);
    }
}
645        assert_eq!(record, reversed);
646    }
647}