simd_csv/
utils.rs

1use std::borrow::Cow;
2use std::io::{self, Read, Seek, SeekFrom};
3
4use memchr::memchr;
5
/// Strips at most one trailing line terminator from `slice`.
///
/// A final `\r\n` or a final `\n` is removed; anything else (including a
/// lone trailing `\r`) is returned untouched. Only the last terminator is
/// removed, so `b"a\n\n"` becomes `b"a\n"`.
#[inline]
pub fn trim_trailing_crlf(slice: &[u8]) -> &[u8] {
    match slice {
        // Check the two-byte terminator first so the `\r` goes with it.
        [head @ .., b'\r', b'\n'] => head,
        [head @ .., b'\n'] => head,
        _ => slice,
    }
}
17
/// Returns how many leading bytes of `slice` belong to a UTF-8 byte order
/// mark (`EF BB BF`).
///
/// Despite the name nothing is trimmed here: the result is `3` when the
/// slice starts with the mark and `0` otherwise, and the caller is expected
/// to skip that many bytes.
#[inline(always)]
pub fn trim_bom(slice: &[u8]) -> usize {
    const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";

    if slice.starts_with(UTF8_BOM) {
        UTF8_BOM.len()
    } else {
        0
    }
}
26
/// Returns the contents of `cell` without its surrounding `quote` bytes.
///
/// Yields `Some(inner)` only when `cell` is at least two bytes long and both
/// its first and last bytes equal `quote`; otherwise yields `None`. The inner
/// bytes are returned as-is, with no unescaping applied.
#[inline]
pub fn unquoted(cell: &[u8], quote: u8) -> Option<&[u8]> {
    match cell {
        // The pattern itself requires two distinct end bytes, i.e. len >= 2.
        [first, inner @ .., last] if *first == quote && *last == quote => Some(inner),
        _ => None,
    }
}
37
38pub fn unescape(cell: &[u8], quote: u8) -> Cow<[u8]> {
39    let len = cell.len();
40    let mut output = Vec::new();
41
42    let mut pos: usize = 0;
43
44    while pos < len {
45        if let Some(offset) = memchr(quote, &cell[pos..]) {
46            if output.is_empty() {
47                output.reserve_exact(len);
48            }
49
50            output.extend_from_slice(&cell[pos..pos + offset + 1]);
51
52            // NOTE: we assume, next character MUST be a quote
53            pos += offset + 2;
54        } else {
55            break;
56        }
57    }
58
59    if output.is_empty() {
60        Cow::Borrowed(cell)
61    } else {
62        output.extend_from_slice(&cell[pos..]);
63        Cow::Owned(output)
64    }
65}
66
67pub fn unescape_to(cell: &[u8], quote: u8, out: &mut Vec<u8>) {
68    let len = cell.len();
69    let mut pos: usize = 0;
70
71    while pos < len {
72        if let Some(offset) = memchr(quote, &cell[pos..]) {
73            out.extend_from_slice(&cell[pos..pos + offset + 1]);
74
75            // NOTE: we assume, next character MUST be a quote
76            pos += offset + 2;
77        } else {
78            break;
79        }
80    }
81
82    out.extend_from_slice(&cell[pos..]);
83}
84
/// Reader that yields a byte range of a seekable input in reverse order.
///
/// The range spans from `offset` (inclusive) up to the `filesize` given to
/// [`ReverseReader::new`] (exclusive). The first byte produced is the one
/// just before `filesize`; the last one is the byte at `offset`.
pub struct ReverseReader<R> {
    /// Underlying seekable input.
    input: R,
    /// Absolute position at which reverse reading stops.
    offset: u64,
    /// Absolute position just past the next byte to yield; it only moves
    /// toward `offset`.
    ptr: u64,
}

impl<R: Seek + Read> ReverseReader<R> {
    /// Creates a reader over `input` that starts at absolute position
    /// `filesize` and walks backwards down to `offset`.
    ///
    /// If `offset` is not below `filesize` the range is empty and every read
    /// reports end of stream.
    pub fn new(input: R, filesize: u64, offset: u64) -> Self {
        Self {
            input,
            offset,
            ptr: filesize,
        }
    }
}

impl<R: Seek + Read> Read for ReverseReader<R> {
    /// Fills the front of `buf` with the next chunk of the range, reversed.
    ///
    /// Returns the number of bytes written, or `Ok(0)` once `offset` has been
    /// reached. Errors from the underlying `seek` / `read_exact` calls are
    /// propagated, and in that case the reader's position is left unchanged.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // Saturating: an `offset` beyond the current position means an empty
        // range, not an arithmetic underflow.
        let remaining = self.ptr.saturating_sub(self.offset);

        if remaining == 0 {
            return Ok(0);
        }

        // Take as much as fits in `buf`, but never step below `offset`.
        let chunk_len = remaining.min(buf.len() as u64);
        let start = self.ptr - chunk_len;

        // `chunk_len <= buf.len()`, so the cast back to `usize` is lossless.
        let chunk = &mut buf[..chunk_len as usize];

        self.input.seek(SeekFrom::Start(start))?;
        self.input.read_exact(chunk)?;
        chunk.reverse();

        // Only commit the new position once the chunk was read successfully.
        self.ptr = start;

        Ok(chunk.len())
    }
}
133
#[cfg(test)]
mod tests {
    use super::*;

    /// Quote byte used by every case below.
    const QUOTE: u8 = b'"';

    /// `(escaped input, expected unescaped output)` pairs shared by both tests.
    const CASES: [(&[u8], &[u8]); 3] = [
        (b"test", b"test"),
        (b"\"\"hello\"\"", b"\"hello\""),
        (b"this is \"\"hello\"\" then?", b"this is \"hello\" then?"),
    ];

    #[test]
    fn test_unescape() {
        for &(escaped, expected) in CASES.iter() {
            assert_eq!(unescape(escaped, QUOTE), Cow::Borrowed(expected));
        }
    }

    #[test]
    fn test_unescape_to() {
        // One scratch buffer reused across cases, cleared before each call.
        let mut scratch = Vec::new();

        for &(escaped, expected) in CASES.iter() {
            scratch.clear();
            unescape_to(escaped, QUOTE, &mut scratch);
            assert_eq!(scratch, expected);
        }
    }
}