simd_csv/
records.rs

1use std::borrow::Cow;
2use std::fmt;
3use std::hash::{Hash, Hasher};
4use std::ops::Index;
5
6use crate::debug;
7use crate::utils::{trim_trailing_crlf, unescape, unescape_to, unquoted};
8
/// A view of a CSV record into a [`ZeroCopyReader`](crate::ZeroCopyReader) buffer.
///
/// The record borrows both its bytes and its separator offsets, so it cannot
/// outlive the buffers it was created from.
pub struct ZeroCopyByteRecord<'a> {
    // Bytes of the record, as returned by `trim_trailing_crlf` in `new`.
    slice: &'a [u8],
    // Offsets, within `slice`, of the bytes separating consecutive fields.
    // A record with `n` separators therefore has `n + 1` fields.
    seps: &'a [usize],
    // Quote byte used when unquoting and unescaping fields.
    pub(crate) quote: u8,
}
15
16impl<'a> ZeroCopyByteRecord<'a> {
17    #[inline]
18    pub(crate) fn new(slice: &'a [u8], seps: &'a [usize], quote: u8) -> Self {
19        Self {
20            slice: trim_trailing_crlf(slice),
21            seps,
22            quote,
23        }
24    }
25
26    #[inline]
27    pub(crate) fn to_parts(&self) -> (Vec<usize>, Vec<u8>) {
28        (self.seps.to_vec(), self.slice.to_vec())
29    }
30
31    /// Number of fields of the record. Cannot be less than 1 since a CSV with no
32    /// columns does not make sense.
33    #[inline(always)]
34    pub fn len(&self) -> usize {
35        // NOTE: an empty zero copy record cannot be constructed,
36        // by definition.
37        self.seps.len() + 1
38    }
39
40    /// Returns whether the record has no fields.
41    #[inline(always)]
42    pub fn is_empty(&self) -> bool {
43        false
44    }
45
46    /// Returns the underlying byte slice, delimiters and all.
47    #[inline(always)]
48    pub fn as_slice(&self) -> &[u8] {
49        self.slice
50    }
51
52    /// Returns an iterator over the record's fields, as-is.
53    ///
54    /// This means fields might or might not be quoted and
55    /// field bytes have not been unescaped at all.
56    #[inline]
57    pub fn iter(&self) -> ZeroCopyByteRecordIter<'_> {
58        ZeroCopyByteRecordIter {
59            record: self,
60            current_forward: 0,
61            current_backward: self.len(),
62        }
63    }
64
65    /// Returns an iterator over the record's fields, unquoted.
66    ///
67    /// See [`Self::unquote`] for more detail.
68    #[inline]
69    pub fn unquoted_iter(&self) -> ZeroCopyByteRecordUnquotedIter<'_> {
70        ZeroCopyByteRecordUnquotedIter {
71            record: self,
72            current_forward: 0,
73            current_backward: self.len(),
74        }
75    }
76
77    /// Returns an iterator over the record's fields, unescaped.
78    ///
79    /// See [`Self::unescape`] for more detail.
80    #[inline]
81    pub fn unescaped_iter(&self) -> ZeroCopyByteRecordUnescapedIter<'_> {
82        ZeroCopyByteRecordUnescapedIter {
83            record: self,
84            current_forward: 0,
85            current_backward: self.len(),
86        }
87    }
88
89    /// Returns the nth field of the zero copy byte record, if it is not
90    /// out-of-bounds.
91    ///
92    /// The field's bytes will be given as-is, quoted or unquoted, and won't be
93    /// unescaped at all.
94    #[inline]
95    pub fn get(&self, index: usize) -> Option<&[u8]> {
96        let len = self.seps.len();
97
98        if index > len {
99            return None;
100        }
101
102        let start = if index == 0 {
103            0
104        } else {
105            self.seps[index - 1] + 1
106        };
107
108        let end = if index == len {
109            self.slice.len()
110        } else {
111            self.seps[index]
112        };
113
114        Some(&self.slice[start..end])
115    }
116
117    /// Returns the nth field of the zero copy byte record, if it is not
118    /// out-of-bounds.
119    ///
120    /// The field's bytes will be given unquoted (i.e. without surrounding
121    /// quotes), but not unescaped (i.e. doubled double quotes will still be
122    /// there).
123    ///
124    /// The overhead vs. [`Self::get`] is only constant (we trim a leading and
125    /// trailing quote if required).
126    #[inline]
127    pub fn unquote(&self, index: usize) -> Option<&[u8]> {
128        self.get(index)
129            .map(|cell| unquoted(cell, self.quote).unwrap_or(cell))
130    }
131
132    /// Returns the nth field of the zero copy byte record, if it is not
133    /// out-of-bounds.
134    ///
135    /// The field's bytes will be completely unescaped.
136    ///
137    /// The overhead vs. [`Self::get`] is linear in the field's number of bytes.
138    ///
139    /// A [`Cow::Owned`] will be returned if the field actually needed
140    /// unescaping, else a [`Cow::Borrowed`] will be returned.
141    #[inline]
142    pub fn unescape(&self, index: usize) -> Option<Cow<[u8]>> {
143        self.unquote(index).map(|cell| {
144            if let Some(trimmed) = unquoted(cell, self.quote) {
145                unescape(trimmed, self.quote)
146            } else {
147                Cow::Borrowed(cell)
148            }
149        })
150    }
151
152    fn read_byte_record(&self, record: &mut ByteRecord) {
153        record.clear();
154
155        for cell in self.iter() {
156            if let Some(trimmed) = unquoted(cell, self.quote) {
157                unescape_to(trimmed, self.quote, &mut record.data);
158
159                let bounds_len = record.bounds.len();
160
161                let start = if bounds_len == 0 {
162                    0
163                } else {
164                    record.bounds[bounds_len - 1].1
165                };
166
167                record.bounds.push((start, record.data.len()));
168            } else {
169                record.push_field(cell);
170            }
171        }
172    }
173
174    /// Converts the zero copy byte record into a proper, owned [`ByteRecord`].
175    #[inline]
176    pub fn to_byte_record(&self) -> ByteRecord {
177        let mut record = ByteRecord::new();
178        self.read_byte_record(&mut record);
179        record
180    }
181
182    #[inline]
183    pub(crate) fn to_byte_record_in_reverse(&self) -> ByteRecord {
184        let mut record = ByteRecord::new();
185
186        for cell in self.unescaped_iter().rev() {
187            record.push_field_in_reverse(&cell);
188        }
189
190        record
191    }
192}
193
194impl fmt::Debug for ZeroCopyByteRecord<'_> {
195    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
196        write!(f, "ZeroCopyByteRecord(")?;
197        f.debug_list()
198            .entries(self.iter().map(debug::Bytes))
199            .finish()?;
200        write!(f, ")")?;
201        Ok(())
202    }
203}
204
// Generates a double-ended, exact-size iterator named `$name` over the fields
// of a `ZeroCopyByteRecord`. Each item is obtained by calling `$method` on the
// record with the field index, and has type `$out_type`.
macro_rules! make_zero_copy_iterator {
    ($name:ident, $method:ident, $out_type:ty) => {
        pub struct $name<'a> {
            record: &'a ZeroCopyByteRecord<'a>,
            // Index of the next field yielded from the front.
            current_forward: usize,
            // One past the index of the next field yielded from the back.
            current_backward: usize,
        }

        impl ExactSizeIterator for $name<'_> {}

        impl<'a> Iterator for $name<'a> {
            type Item = $out_type;

            #[inline]
            fn next(&mut self) -> Option<Self::Item> {
                // Both cursors meeting means every field was yielded.
                if self.current_forward == self.current_backward {
                    return None;
                }

                let index = self.current_forward;
                self.current_forward = index + 1;

                self.record.$method(index)
            }

            #[inline]
            fn size_hint(&self) -> (usize, Option<usize>) {
                let remaining = self.current_backward - self.current_forward;

                (remaining, Some(remaining))
            }

            #[inline]
            fn count(self) -> usize
            where
                Self: Sized,
            {
                // The number of remaining items is known without iterating.
                self.len()
            }
        }

        impl DoubleEndedIterator for $name<'_> {
            #[inline]
            fn next_back(&mut self) -> Option<Self::Item> {
                if self.current_forward == self.current_backward {
                    return None;
                }

                self.current_backward -= 1;

                self.record.$method(self.current_backward)
            }
        }
    };
}
261
// Iterator over raw fields, backed by `ZeroCopyByteRecord::get`.
make_zero_copy_iterator!(ZeroCopyByteRecordIter, get, &'a [u8]);
// Iterator over unquoted fields, backed by `ZeroCopyByteRecord::unquote`.
make_zero_copy_iterator!(ZeroCopyByteRecordUnquotedIter, unquote, &'a [u8]);
// Iterator over unescaped fields, backed by `ZeroCopyByteRecord::unescape`.
make_zero_copy_iterator!(ZeroCopyByteRecordUnescapedIter, unescape, Cow<'a, [u8]>);
265
266impl Index<usize> for ZeroCopyByteRecord<'_> {
267    type Output = [u8];
268
269    #[inline]
270    fn index(&self, i: usize) -> &[u8] {
271        self.get(i).unwrap()
272    }
273}
274
/// An owned, unquoted/unescaped representation of a CSV record.
///
/// [`ByteRecord`] are typically used with a [`Reader`](crate::Reader).
///
/// *Creating a [`ByteRecord`]*:
/// ```
/// use simd_csv::ByteRecord;
///
/// let mut record = ByteRecord::new();
/// record.push_field(b"john");
/// record.push_field(b"landis");
/// ```
#[derive(Default, Clone, Eq)]
pub struct ByteRecord {
    // Backing storage holding the bytes of the fields.
    data: Vec<u8>,
    // `(start, end)` byte range of each field within `data`, `end` excluded.
    bounds: Vec<(usize, usize)>,
}
292
293impl ByteRecord {
294    /// Create a empty record.
295    pub fn new() -> Self {
296        Self::default()
297    }
298
299    /// Return the number of fields of the record.
300    #[inline]
301    pub fn len(&self) -> usize {
302        self.bounds.len()
303    }
304
305    /// Return whether the record is empty.
306    #[inline]
307    pub fn is_empty(&self) -> bool {
308        self.len() == 0
309    }
310
311    /// Clear the record completely.
312    #[inline]
313    pub fn clear(&mut self) {
314        self.data.clear();
315        self.bounds.clear();
316    }
317
318    /// Shortens the record, keeping the first `len` elements and dropping the
319    /// rest.
320    #[inline]
321    pub fn truncate(&mut self, len: usize) {
322        self.bounds.truncate(len);
323
324        if let Some((_, end)) = self.bounds.last() {
325            self.data.truncate(*end);
326        } else {
327            self.data.clear();
328        }
329    }
330
331    /// Return the underlying byte slice.
332    ///
333    /// **BEWARE**: the [`Reader`](crate::Reader) amortizes copies by sometimes
334    /// including spurious data such as quotes and delimiters. You will never
335    /// see those bytes while accessing fields because the field boundaries
336    /// remain correct, but you will see them in the underlying slice.
337    #[inline]
338    pub fn as_slice(&self) -> &[u8] {
339        &self.data
340    }
341
342    /// Return an iterator over the record's fields.
343    #[inline]
344    pub fn iter(&self) -> ByteRecordIter<'_> {
345        ByteRecordIter {
346            record: self,
347            current_forward: 0,
348            current_backward: self.len(),
349        }
350    }
351
352    /// Append a new field to the back of the record.
353    #[inline(always)]
354    pub fn push_field(&mut self, bytes: &[u8]) {
355        self.data.extend_from_slice(bytes);
356
357        let bounds_len = self.bounds.len();
358
359        let start = if bounds_len == 0 {
360            0
361        } else {
362            self.bounds[bounds_len - 1].1
363        };
364
365        self.bounds.push((start, self.data.len()));
366    }
367
368    #[inline]
369    fn push_field_in_reverse(&mut self, bytes: &[u8]) {
370        self.data.extend_from_slice(bytes);
371
372        let bounds_len = self.bounds.len();
373
374        let start = if bounds_len == 0 {
375            0
376        } else {
377            self.bounds[bounds_len - 1].1
378        };
379
380        let bounds = (start, self.data.len());
381        self.data[bounds.0..bounds.1].reverse();
382
383        self.bounds.push(bounds);
384    }
385
386    /// Return field at `index`. Will return `None` if `index` is out of bounds.
387    #[inline]
388    pub fn get(&self, index: usize) -> Option<&[u8]> {
389        self.bounds
390            .get(index)
391            .copied()
392            .map(|(start, end)| &self.data[start..end])
393    }
394
395    pub(crate) fn reverse(&mut self) {
396        self.data.reverse();
397        self.bounds.reverse();
398
399        let len = self.data.len();
400
401        for (start, end) in self.bounds.iter_mut() {
402            let new_end = len - *start;
403            let new_start = len - *end;
404
405            *start = new_start;
406            *end = new_end;
407        }
408    }
409}
410
411impl PartialEq for ByteRecord {
412    fn eq(&self, other: &Self) -> bool {
413        if self.bounds.len() != other.bounds.len() {
414            return false;
415        }
416
417        self.iter()
418            .zip(other.iter())
419            .all(|(self_cell, other_cell)| self_cell == other_cell)
420    }
421}
422
423impl Hash for ByteRecord {
424    #[inline]
425    fn hash<H: Hasher>(&self, state: &mut H) {
426        state.write_usize(self.len());
427
428        for cell in self.iter() {
429            state.write(cell);
430        }
431    }
432}
433
434impl Index<usize> for ByteRecord {
435    type Output = [u8];
436
437    #[inline]
438    fn index(&self, i: usize) -> &[u8] {
439        self.get(i).unwrap()
440    }
441}
442
443impl<T: AsRef<[u8]>> Extend<T> for ByteRecord {
444    #[inline]
445    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
446        for x in iter {
447            self.push_field(x.as_ref());
448        }
449    }
450}
451
452impl<T: AsRef<[u8]>> FromIterator<T> for ByteRecord {
453    #[inline]
454    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
455        let mut record = Self::new();
456        record.extend(iter);
457        record
458    }
459}
460
461impl<I, T> From<I> for ByteRecord
462where
463    I: IntoIterator<Item = T>,
464    T: AsRef<[u8]>,
465{
466    fn from(value: I) -> Self {
467        let mut record = Self::new();
468
469        for cell in value.into_iter() {
470            record.push_field(cell.as_ref());
471        }
472
473        record
474    }
475}
476
477impl<'r> IntoIterator for &'r ByteRecord {
478    type IntoIter = ByteRecordIter<'r>;
479    type Item = &'r [u8];
480
481    #[inline]
482    fn into_iter(self) -> ByteRecordIter<'r> {
483        self.iter()
484    }
485}
486
487impl fmt::Debug for ByteRecord {
488    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
489        write!(f, "ByteRecord(")?;
490        f.debug_list()
491            .entries(self.iter().map(debug::Bytes))
492            .finish()?;
493        write!(f, ")")?;
494        Ok(())
495    }
496}
497
/// Double-ended iterator over the fields of a [`ByteRecord`].
pub struct ByteRecordIter<'a> {
    // Record whose fields are being iterated.
    record: &'a ByteRecord,
    // Index of the next field yielded from the front.
    current_forward: usize,
    // One past the index of the next field yielded from the back.
    current_backward: usize,
}
503
// The number of remaining fields is always known exactly: `size_hint`
// returns the same value as lower and upper bound.
impl ExactSizeIterator for ByteRecordIter<'_> {}
505
506impl<'a> Iterator for ByteRecordIter<'a> {
507    type Item = &'a [u8];
508
509    #[inline]
510    fn next(&mut self) -> Option<Self::Item> {
511        if self.current_forward == self.current_backward {
512            None
513        } else {
514            let (start, end) = self.record.bounds[self.current_forward];
515
516            self.current_forward += 1;
517
518            Some(&self.record.data[start..end])
519        }
520    }
521
522    #[inline]
523    fn size_hint(&self) -> (usize, Option<usize>) {
524        let size = self.current_backward - self.current_forward;
525
526        (size, Some(size))
527    }
528
529    #[inline]
530    fn count(self) -> usize
531    where
532        Self: Sized,
533    {
534        self.len()
535    }
536}
537
538impl DoubleEndedIterator for ByteRecordIter<'_> {
539    #[inline]
540    fn next_back(&mut self) -> Option<Self::Item> {
541        if self.current_forward == self.current_backward {
542            None
543        } else {
544            self.current_backward -= 1;
545
546            let (start, end) = self.record.bounds[self.current_backward];
547
548            Some(&self.record.data[start..end])
549        }
550    }
551}
552
/// Helper used to fill a [`ByteRecord`] incrementally: bytes are appended to
/// the record's data and field boundaries are recorded as fields get
/// finalized.
pub(crate) struct ByteRecordBuilder<'r> {
    // Record being filled.
    record: &'r mut ByteRecord,
    // Start offset, in the record's data, of the field currently being built.
    start: usize,
}
557
impl<'r> ByteRecordBuilder<'r> {
    /// Wraps `record` in a builder whose first field starts at offset 0.
    ///
    /// The record itself is left untouched (it is not cleared here).
    #[inline(always)]
    pub(crate) fn wrap(record: &'r mut ByteRecord) -> Self {
        Self { record, start: 0 }
    }

    /// Appends `slice` to the record's data, without touching field
    /// boundaries.
    #[inline(always)]
    pub(crate) fn extend_from_slice(&mut self, slice: &[u8]) {
        self.record.data.extend_from_slice(slice);
    }

    /// Appends a single byte to the record's data, without touching field
    /// boundaries.
    #[inline(always)]
    pub(crate) fn push_byte(&mut self, byte: u8) {
        self.record.data.push(byte);
    }

    /// Closes the current field: it spans from the current start offset to
    /// the current end of the data, and the next field starts right there.
    #[inline]
    pub(crate) fn finalize_field(&mut self) {
        let start = self.start;
        self.start = self.record.data.len();

        self.record.bounds.push((start, self.start));
    }

    /// Closes the last field of a record, dropping a trailing `\r` byte from
    /// the data first if there is one.
    #[inline]
    pub(crate) fn finalize_record(&mut self) {
        // NOTE(review): presumably the carriage return of a CRLF line
        // terminator — confirm against the reader.
        if let Some(b'\r') = self.record.data.last() {
            self.record.data.pop();
        }

        self.finalize_field();
    }

    /// Closes the current field `offset` bytes past the current end of the
    /// data, then moves the start of the next field one byte further.
    ///
    /// The recorded end can therefore lie beyond the bytes currently stored.
    // NOTE(review): presumably the caller copies the missing bytes, along with
    // a one-byte delimiter, afterwards — confirm against the reader.
    #[inline]
    pub(crate) fn finalize_field_preemptively(&mut self, offset: usize) {
        let start = self.start;
        self.start = self.record.data.len() + offset;

        self.record.bounds.push((start, self.start));

        self.start += 1;
    }

    /// Advances the start offset by one, unless it is equal to the start of
    /// the last recorded field (or to 0 when no field was recorded yet).
    // NOTE(review): the intent looks like skipping a byte sitting between two
    // fields in the data — confirm against the reader.
    #[inline(always)]
    pub(crate) fn bump(&mut self) {
        self.start +=
            (self.record.bounds.last().map(|(s, _)| *s).unwrap_or(0) != self.start) as usize;
    }
}
607
#[cfg(test)]
mod tests {
    use super::*;

    /// Raw field access on a zero copy record, including out-of-bounds lookups.
    #[test]
    fn test_zero_copy_byte_record() {
        let record = ZeroCopyByteRecord::new(b"name,surname,age", &[4, 12], b'"');

        assert_eq!(record.len(), 3);

        let expected: Vec<&[u8]> = vec![b"name", b"surname", b"age"];
        assert_eq!(record.iter().collect::<Vec<_>>(), expected);

        for (i, field) in expected.iter().enumerate() {
            assert_eq!(record.get(i), Some(*field));
        }

        // Check the first out-of-bounds index too, so that an off-by-one in
        // the bound check cannot go unnoticed.
        assert_eq!(record.get(3), None);
        assert_eq!(record.get(4), None);
    }

    /// Pushing fields to an owned record and reading them back.
    #[test]
    fn test_byte_record() {
        let mut record = ByteRecord::new();

        assert_eq!(record.len(), 0);
        assert!(record.is_empty());
        assert_eq!(record.get(0), None);

        record.push_field(b"name");
        record.push_field(b"surname");
        record.push_field(b"age");

        let expected: Vec<&[u8]> = vec![b"name", b"surname", b"age"];
        assert_eq!(record.iter().collect::<Vec<_>>(), expected);

        assert_eq!(record.get(0), Some::<&[u8]>(b"name"));
        assert_eq!(record.get(1), Some::<&[u8]>(b"surname"));
        assert_eq!(record.get(2), Some::<&[u8]>(b"age"));
        assert_eq!(record.get(3), None);
    }

    /// A record filled through the builder can still be extended afterwards.
    #[test]
    fn test_mutate_record_after_read() {
        let mut record = ByteRecord::new();
        let mut builder = ByteRecordBuilder::wrap(&mut record);
        builder.extend_from_slice(b"test\r");
        builder.finalize_record();

        assert_eq!(record.iter().collect::<Vec<_>>(), vec![b"test"]);

        record.push_field(b"next");

        assert_eq!(record.iter().collect::<Vec<_>>(), vec![b"test", b"next"]);
    }

    /// Reversing twice gives back the original record.
    #[test]
    fn test_reverse_byte_record() {
        let record = brec!["name", "surname", "age"];
        let mut reversed = record.clone();
        reversed.reverse();

        assert_eq!(reversed, brec!["ega", "emanrus", "eman"]);
        reversed.reverse();
        assert_eq!(record, reversed);
    }
}