simd_csv/
records.rs

1use std::borrow::Cow;
2use std::fmt;
3use std::hash::{Hash, Hasher};
4use std::ops::Index;
5
6use crate::debug;
7use crate::utils::{trim_trailing_crlf, unescape, unescape_to, unquoted};
8
/// A borrowed view over one record: the record's raw bytes together with the
/// offsets of the separators found in those bytes.
///
/// Nothing is owned here; both the bytes and the separator offsets are
/// borrowed for `'a`.
pub struct ZeroCopyByteRecord<'a> {
    /// Raw bytes of the record, cells and separators included.
    slice: &'a [u8],
    /// Offset, within `slice`, of each separator byte. Cell `i` ends at
    /// `seps[i]` and cell `i + 1` starts right after it.
    seps: &'a [usize],
    /// Quote byte handed to the unquoting / unescaping helpers.
    pub(crate) quote: u8,
}
14
15impl<'a> ZeroCopyByteRecord<'a> {
16    #[inline]
17    pub(crate) fn new(slice: &'a [u8], seps: &'a [usize], quote: u8) -> Self {
18        Self {
19            slice: trim_trailing_crlf(slice),
20            seps,
21            quote,
22        }
23    }
24
25    #[inline]
26    pub(crate) fn to_parts(&self) -> (Vec<usize>, Vec<u8>) {
27        (self.seps.to_vec(), self.slice.to_vec())
28    }
29
30    #[inline(always)]
31    pub fn len(&self) -> usize {
32        // NOTE: an empty zero copy record cannot be constructed,
33        // by definition.
34        self.seps.len() + 1
35    }
36
37    #[inline(always)]
38    pub fn is_empty(&self) -> bool {
39        false
40    }
41
42    #[inline(always)]
43    pub fn as_slice(&self) -> &[u8] {
44        self.slice
45    }
46
47    #[inline]
48    pub fn iter(&self) -> ZeroCopyByteRecordIter<'_> {
49        ZeroCopyByteRecordIter {
50            record: self,
51            current_forward: 0,
52            current_backward: self.len(),
53        }
54    }
55
56    #[inline]
57    pub fn unquoted_iter(&self) -> ZeroCopyByteRecordUnquotedIter<'_> {
58        ZeroCopyByteRecordUnquotedIter {
59            record: self,
60            current_forward: 0,
61            current_backward: self.len(),
62        }
63    }
64
65    #[inline]
66    pub fn unescaped_iter(&self) -> ZeroCopyByteRecordUnescapedIter<'_> {
67        ZeroCopyByteRecordUnescapedIter {
68            record: self,
69            current_forward: 0,
70            current_backward: self.len(),
71        }
72    }
73
74    #[inline]
75    pub fn get(&self, index: usize) -> Option<&[u8]> {
76        let len = self.seps.len();
77
78        if index > len {
79            return None;
80        }
81
82        let start = if index == 0 {
83            0
84        } else {
85            self.seps[index - 1] + 1
86        };
87
88        let end = if index == len {
89            self.slice.len()
90        } else {
91            self.seps[index]
92        };
93
94        Some(&self.slice[start..end])
95    }
96
97    #[inline]
98    pub fn unquote(&self, index: usize) -> Option<&[u8]> {
99        self.get(index)
100            .map(|cell| unquoted(cell, self.quote).unwrap_or(cell))
101    }
102
103    #[inline]
104    pub fn unescape(&self, index: usize) -> Option<Cow<[u8]>> {
105        self.unquote(index).map(|cell| {
106            if let Some(trimmed) = unquoted(cell, self.quote) {
107                unescape(trimmed, self.quote)
108            } else {
109                Cow::Borrowed(cell)
110            }
111        })
112    }
113
114    fn read_byte_record(&self, record: &mut ByteRecord) {
115        record.clear();
116
117        for cell in self.iter() {
118            if let Some(trimmed) = unquoted(cell, self.quote) {
119                unescape_to(trimmed, self.quote, &mut record.data);
120
121                let bounds_len = record.bounds.len();
122
123                let start = if bounds_len == 0 {
124                    0
125                } else {
126                    record.bounds[bounds_len - 1].1
127                };
128
129                record.bounds.push((start, record.data.len()));
130            } else {
131                record.push_field(cell);
132            }
133        }
134    }
135
136    #[inline]
137    pub fn to_byte_record(&self) -> ByteRecord {
138        let mut record = ByteRecord::new();
139        self.read_byte_record(&mut record);
140        record
141    }
142
143    #[inline]
144    pub(crate) fn to_byte_record_in_reverse(&self) -> ByteRecord {
145        let mut record = ByteRecord::new();
146
147        for cell in self.unescaped_iter().rev() {
148            record.push_field_in_reverse(&cell);
149        }
150
151        record
152    }
153}
154
155impl fmt::Debug for ZeroCopyByteRecord<'_> {
156    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
157        write!(f, "ZeroCopyByteRecord(")?;
158        f.debug_list()
159            .entries(self.iter().map(debug::Bytes))
160            .finish()?;
161        write!(f, ")")?;
162        Ok(())
163    }
164}
165
166macro_rules! make_zero_copy_iterator {
167    ($name:ident, $method: ident, $out_type: ty) => {
168        pub struct $name<'a> {
169            record: &'a ZeroCopyByteRecord<'a>,
170            current_forward: usize,
171            current_backward: usize,
172        }
173
174        impl ExactSizeIterator for $name<'_> {}
175
176        impl<'a> Iterator for $name<'a> {
177            type Item = $out_type;
178
179            #[inline]
180            fn next(&mut self) -> Option<Self::Item> {
181                if self.current_forward == self.current_backward {
182                    None
183                } else {
184                    let cell = self.record.$method(self.current_forward);
185
186                    self.current_forward += 1;
187
188                    cell
189                }
190            }
191
192            #[inline]
193            fn size_hint(&self) -> (usize, Option<usize>) {
194                let size = self.current_backward - self.current_forward;
195
196                (size, Some(size))
197            }
198
199            #[inline]
200            fn count(self) -> usize
201            where
202                Self: Sized,
203            {
204                self.len()
205            }
206        }
207
208        impl DoubleEndedIterator for $name<'_> {
209            #[inline]
210            fn next_back(&mut self) -> Option<Self::Item> {
211                if self.current_forward == self.current_backward {
212                    None
213                } else {
214                    self.current_backward -= 1;
215
216                    self.record.$method(self.current_backward)
217                }
218            }
219        }
220    };
221}
222
223make_zero_copy_iterator!(ZeroCopyByteRecordIter, get, &'a [u8]);
224make_zero_copy_iterator!(ZeroCopyByteRecordUnquotedIter, unquote, &'a [u8]);
225make_zero_copy_iterator!(ZeroCopyByteRecordUnescapedIter, unescape, Cow<'a, [u8]>);
226
227impl Index<usize> for ZeroCopyByteRecord<'_> {
228    type Output = [u8];
229
230    #[inline]
231    fn index(&self, i: usize) -> &[u8] {
232        self.get(i).unwrap()
233    }
234}
235
236#[derive(Default, Clone, Eq)]
237pub struct ByteRecord {
238    data: Vec<u8>,
239    bounds: Vec<(usize, usize)>,
240}
241
242impl ByteRecord {
243    pub fn new() -> Self {
244        Self::default()
245    }
246
247    #[inline]
248    pub fn len(&self) -> usize {
249        self.bounds.len()
250    }
251
252    #[inline]
253    pub fn is_empty(&self) -> bool {
254        self.len() == 0
255    }
256
257    #[inline]
258    pub fn clear(&mut self) {
259        self.data.clear();
260        self.bounds.clear();
261    }
262
263    #[inline]
264    pub fn truncate(&mut self, len: usize) {
265        self.bounds.truncate(len);
266
267        if let Some((_, end)) = self.bounds.last() {
268            self.data.truncate(*end);
269        } else {
270            self.data.clear();
271        }
272    }
273
274    #[inline]
275    pub fn as_slice(&self) -> &[u8] {
276        &self.data
277    }
278
279    #[inline]
280    pub fn iter(&self) -> ByteRecordIter<'_> {
281        ByteRecordIter {
282            record: self,
283            current_forward: 0,
284            current_backward: self.len(),
285        }
286    }
287
288    #[inline(always)]
289    pub fn push_field(&mut self, bytes: &[u8]) {
290        self.data.extend_from_slice(bytes);
291
292        let bounds_len = self.bounds.len();
293
294        let start = if bounds_len == 0 {
295            0
296        } else {
297            self.bounds[bounds_len - 1].1
298        };
299
300        self.bounds.push((start, self.data.len()));
301    }
302
303    #[inline]
304    fn push_field_in_reverse(&mut self, bytes: &[u8]) {
305        self.data.extend_from_slice(bytes);
306
307        let bounds_len = self.bounds.len();
308
309        let start = if bounds_len == 0 {
310            0
311        } else {
312            self.bounds[bounds_len - 1].1
313        };
314
315        let bounds = (start, self.data.len());
316        self.data[bounds.0..bounds.1].reverse();
317
318        self.bounds.push(bounds);
319    }
320
321    #[inline]
322    pub fn get(&self, index: usize) -> Option<&[u8]> {
323        self.bounds
324            .get(index)
325            .copied()
326            .map(|(start, end)| &self.data[start..end])
327    }
328
329    pub(crate) fn reverse(&mut self) {
330        self.data.reverse();
331        self.bounds.reverse();
332
333        let len = self.data.len();
334
335        for (start, end) in self.bounds.iter_mut() {
336            let new_end = len - *start;
337            let new_start = len - *end;
338
339            *start = new_start;
340            *end = new_end;
341        }
342    }
343}
344
345impl PartialEq for ByteRecord {
346    fn eq(&self, other: &Self) -> bool {
347        if self.bounds.len() != other.bounds.len() {
348            return false;
349        }
350
351        self.iter()
352            .zip(other.iter())
353            .all(|(self_cell, other_cell)| self_cell == other_cell)
354    }
355}
356
357impl Hash for ByteRecord {
358    #[inline]
359    fn hash<H: Hasher>(&self, state: &mut H) {
360        state.write_usize(self.len());
361
362        for cell in self.iter() {
363            state.write(cell);
364        }
365    }
366}
367
368impl Index<usize> for ByteRecord {
369    type Output = [u8];
370
371    #[inline]
372    fn index(&self, i: usize) -> &[u8] {
373        self.get(i).unwrap()
374    }
375}
376
377impl<T: AsRef<[u8]>> Extend<T> for ByteRecord {
378    #[inline]
379    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
380        for x in iter {
381            self.push_field(x.as_ref());
382        }
383    }
384}
385
386impl<T: AsRef<[u8]>> FromIterator<T> for ByteRecord {
387    #[inline]
388    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
389        let mut record = Self::new();
390        record.extend(iter);
391        record
392    }
393}
394
395impl<I, T> From<I> for ByteRecord
396where
397    I: IntoIterator<Item = T>,
398    T: AsRef<[u8]>,
399{
400    fn from(value: I) -> Self {
401        let mut record = Self::new();
402
403        for cell in value.into_iter() {
404            record.push_field(cell.as_ref());
405        }
406
407        record
408    }
409}
410
411impl<'r> IntoIterator for &'r ByteRecord {
412    type IntoIter = ByteRecordIter<'r>;
413    type Item = &'r [u8];
414
415    #[inline]
416    fn into_iter(self) -> ByteRecordIter<'r> {
417        self.iter()
418    }
419}
420
421impl fmt::Debug for ByteRecord {
422    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
423        write!(f, "ByteRecord(")?;
424        f.debug_list()
425            .entries(self.iter().map(debug::Bytes))
426            .finish()?;
427        write!(f, ")")?;
428        Ok(())
429    }
430}
431
432pub struct ByteRecordIter<'a> {
433    record: &'a ByteRecord,
434    current_forward: usize,
435    current_backward: usize,
436}
437
438impl ExactSizeIterator for ByteRecordIter<'_> {}
439
440impl<'a> Iterator for ByteRecordIter<'a> {
441    type Item = &'a [u8];
442
443    #[inline]
444    fn next(&mut self) -> Option<Self::Item> {
445        if self.current_forward == self.current_backward {
446            None
447        } else {
448            let (start, end) = self.record.bounds[self.current_forward];
449
450            self.current_forward += 1;
451
452            Some(&self.record.data[start..end])
453        }
454    }
455
456    #[inline]
457    fn size_hint(&self) -> (usize, Option<usize>) {
458        let size = self.current_backward - self.current_forward;
459
460        (size, Some(size))
461    }
462
463    #[inline]
464    fn count(self) -> usize
465    where
466        Self: Sized,
467    {
468        self.len()
469    }
470}
471
472impl DoubleEndedIterator for ByteRecordIter<'_> {
473    #[inline]
474    fn next_back(&mut self) -> Option<Self::Item> {
475        if self.current_forward == self.current_backward {
476            None
477        } else {
478            self.current_backward -= 1;
479
480            let (start, end) = self.record.bounds[self.current_backward];
481
482            Some(&self.record.data[start..end])
483        }
484    }
485}
486
487pub(crate) struct ByteRecordBuilder<'r> {
488    record: &'r mut ByteRecord,
489    start: usize,
490}
491
492impl<'r> ByteRecordBuilder<'r> {
493    #[inline(always)]
494    pub(crate) fn wrap(record: &'r mut ByteRecord) -> Self {
495        Self { record, start: 0 }
496    }
497
498    #[inline(always)]
499    pub(crate) fn extend_from_slice(&mut self, slice: &[u8]) {
500        self.record.data.extend_from_slice(slice);
501    }
502
503    #[inline(always)]
504    pub(crate) fn push_byte(&mut self, byte: u8) {
505        self.record.data.push(byte);
506    }
507
508    #[inline]
509    pub(crate) fn finalize_field(&mut self) {
510        let start = self.start;
511        self.start = self.record.data.len();
512
513        self.record.bounds.push((start, self.start));
514    }
515
516    #[inline]
517    pub(crate) fn finalize_record(&mut self) {
518        if let Some(b'\r') = self.record.data.last() {
519            self.record.data.pop();
520        }
521
522        self.finalize_field();
523    }
524
525    #[inline]
526    pub(crate) fn finalize_field_preemptively(&mut self, offset: usize) {
527        let start = self.start;
528        self.start = self.record.data.len() + offset;
529
530        self.record.bounds.push((start, self.start));
531
532        self.start += 1;
533    }
534
535    #[inline(always)]
536    pub(crate) fn bump(&mut self) {
537        self.start +=
538            (self.record.bounds.last().map(|(s, _)| *s).unwrap_or(0) != self.start) as usize;
539    }
540}
541
/// Builds a `ByteRecord` from a comma-separated list of expressions, pushing
/// `expr.as_bytes()` as one field per expression, in order.
///
/// `brec![]` yields an empty record. A trailing comma is accepted, as with
/// `vec!`.
///
/// `ByteRecord` is referred to by its bare name, so it must be in scope
/// where the macro is invoked.
#[macro_export]
macro_rules! brec {
    () => {{
        ByteRecord::new()
    }};

    ($($x: expr),+ $(,)?) => {{
        let mut r = ByteRecord::new();

        $(
            r.push_field($x.as_bytes());
        )+

        r
    }};
}
558
#[cfg(test)]
mod tests {
    use super::*;

    /// Cells of a zero-copy record are reachable by iteration (both ways) and
    /// by index, and any index from `len()` onwards yields `None`.
    #[test]
    fn test_zero_copy_byte_record() {
        let record = ZeroCopyByteRecord::new(b"name,surname,age", &[4, 12], b'"');

        assert_eq!(record.len(), 3);

        let expected: Vec<&[u8]> = vec![b"name", b"surname", b"age"];
        assert_eq!(record.iter().collect::<Vec<_>>(), expected);

        let mut expected_backward = expected.clone();
        expected_backward.reverse();
        assert_eq!(record.iter().rev().collect::<Vec<_>>(), expected_backward);

        for (i, cell) in expected.iter().enumerate() {
            assert_eq!(record.get(i), Some(*cell));
        }

        // `len()` itself is the first invalid index: check it explicitly
        // rather than only an index further out.
        assert_eq!(record.get(3), None);
        assert_eq!(record.get(4), None);
    }

    /// Fields pushed into an owned record come back in order, by iteration
    /// and by index.
    #[test]
    fn test_byte_record() {
        let mut record = ByteRecord::new();

        assert_eq!(record.len(), 0);
        assert!(record.is_empty());
        assert_eq!(record.get(0), None);

        record.push_field(b"name");
        record.push_field(b"surname");
        record.push_field(b"age");

        let expected: Vec<&[u8]> = vec![b"name", b"surname", b"age"];
        assert_eq!(record.iter().collect::<Vec<_>>(), expected);

        assert_eq!(record.get(0), Some::<&[u8]>(b"name"));
        assert_eq!(record.get(1), Some::<&[u8]>(b"surname"));
        assert_eq!(record.get(2), Some::<&[u8]>(b"age"));
        assert_eq!(record.get(3), None);
    }

    /// A record filled through the builder can still be appended to with
    /// `push_field` afterwards.
    #[test]
    fn test_mutate_record_after_read() {
        let mut record = ByteRecord::new();
        let mut builder = ByteRecordBuilder::wrap(&mut record);
        builder.extend_from_slice(b"test\r");
        builder.finalize_record();

        assert_eq!(record.iter().collect::<Vec<_>>(), vec![b"test"]);

        record.push_field(b"next");

        assert_eq!(record.iter().collect::<Vec<_>>(), vec![b"test", b"next"]);
    }

    /// Reversing flips both the field order and the bytes of each field, and
    /// reversing twice gives the original record back.
    #[test]
    fn test_reverse_byte_record() {
        let record = brec!["name", "surname", "age"];
        let mut reversed = record.clone();
        reversed.reverse();

        assert_eq!(reversed, brec!["ega", "emanrus", "eman"]);
        reversed.reverse();
        assert_eq!(record, reversed);
    }
}
624}