Skip to main content

simd_csv/
utils.rs

1use std::borrow::Cow;
2use std::io::{self, Read, Seek, SeekFrom, Write};
3
4use memchr::memchr;
5
/// Strip one trailing line terminator (`\n` or `\r\n`) from `slice`.
///
/// At most a single terminator is removed. A trailing `\r` that is not
/// followed by `\n` is left in place.
#[inline]
pub fn trim_trailing_crlf(slice: &[u8]) -> &[u8] {
    match slice.strip_suffix(b"\n") {
        // The `\r` is only dropped when it directly precedes the `\n`.
        Some(without_lf) => without_lf.strip_suffix(b"\r").unwrap_or(without_lf),
        None => slice,
    }
}
17
/// Return how many leading bytes of `slice` are taken up by a UTF-8 byte
/// order mark: `3` when the slice starts with `EF BB BF`, `0` otherwise.
///
/// Despite its name, this function does not slice anything itself; the caller
/// is expected to skip the returned number of bytes.
#[inline(always)]
pub fn trim_bom(slice: &[u8]) -> usize {
    const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";

    if slice.starts_with(UTF8_BOM) {
        UTF8_BOM.len()
    } else {
        0
    }
}
26
/// Return the content of `cell` without its surrounding quotes.
///
/// Yields `Some(inner)` only when `cell` is at least two bytes long and both
/// its first and last bytes equal `quote`; otherwise yields `None`. The inner
/// content is returned as-is, i.e. doubled quotes are not unescaped here.
#[inline]
pub fn unquoted(cell: &[u8], quote: u8) -> Option<&[u8]> {
    match cell {
        [first, inner @ .., last] if *first == quote && *last == quote => Some(inner),
        _ => None,
    }
}
37
38/// Unescape a potentially escaped but unquoted (no leading/trailing quotes) CSV
39/// cell.
40///
41/// Returns a [`Cow::Borrowed`] if nothing needed unescaping, and a
42/// [`Cow::Owned`] if something was actually unescaped.
43///
44/// This function will therefore not allocate if this is not actually required.
45pub fn unescape(cell: &[u8], quote: u8) -> Cow<'_, [u8]> {
46    let len = cell.len();
47    let mut output = Vec::new();
48
49    let mut pos: usize = 0;
50
51    while pos < len {
52        if let Some(offset) = memchr(quote, &cell[pos..]) {
53            if output.is_empty() {
54                output.reserve_exact(len);
55            }
56
57            let limit = pos + offset + 1;
58
59            output.extend_from_slice(&cell[pos..limit]);
60
61            if limit < len && cell[limit] == quote {
62                pos = limit + 1;
63            } else {
64                pos = limit;
65                break;
66            }
67        } else {
68            break;
69        }
70    }
71
72    if output.is_empty() {
73        Cow::Borrowed(cell)
74    } else {
75        output.extend_from_slice(&cell[pos..]);
76        Cow::Owned(output)
77    }
78}
79
80pub fn unescape_to(cell: &[u8], quote: u8, out: &mut Vec<u8>) {
81    let len = cell.len();
82    let mut pos: usize = 0;
83
84    while pos < len {
85        if let Some(offset) = memchr(quote, &cell[pos..]) {
86            let limit = pos + offset + 1;
87
88            out.extend_from_slice(&cell[pos..limit]);
89
90            if limit < len && cell[limit] == quote {
91                pos = limit + 1;
92            } else {
93                pos = limit;
94                break;
95            }
96        } else {
97            break;
98        }
99    }
100
101    out.extend_from_slice(&cell[pos..]);
102}
103
/// A reader yielding the bytes of a seekable input backwards.
///
/// It covers the byte range `offset..filesize` of the wrapped input, starting
/// from the end of that range and moving towards its start. Each call to
/// [`Read::read`] fills the given buffer with the next chunk, in reversed
/// byte order.
pub struct ReverseReader<R> {
    /// Wrapped seekable input.
    input: R,
    /// Lower bound (inclusive) of the range to read; reading stops there.
    offset: u64,
    /// Upper bound (exclusive) of what remains to be read. It starts at the
    /// given file size and decreases towards `offset`.
    ptr: u64,
}

impl<R: Seek + Read> ReverseReader<R> {
    /// Create a reader over bytes `offset..filesize` of `input`, to be read
    /// from `filesize` down to `offset`.
    ///
    /// If `filesize` is not greater than `offset`, the reader is immediately
    /// at end of stream.
    pub fn new(input: R, filesize: u64, offset: u64) -> Self {
        Self {
            input,
            offset,
            ptr: filesize,
        }
    }
}

impl<R: Seek + Read> Read for ReverseReader<R> {
    /// Read the next chunk (moving backwards) into `buf`, reversed.
    ///
    /// Returns `Ok(0)` once `offset` has been reached. The buffer is always
    /// filled as much as the remaining range allows, because the underlying
    /// read uses [`Read::read_exact`].
    ///
    /// # Errors
    ///
    /// Forwards any error raised when seeking or reading the wrapped input.
    /// The reader position is left untouched in that case.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // `<=` rather than `==`: a start position below `offset` would
        // otherwise make the subtraction below underflow.
        if self.ptr <= self.offset {
            return Ok(0);
        }

        let remaining = self.ptr - self.offset;

        // Bounded by `buf.len()`, so converting back to `usize` is lossless.
        // Comparing against `remaining` also avoids computing
        // `offset + buf.len()`, which could overflow.
        let chunk_len = remaining.min(buf.len() as u64) as usize;
        let start = self.ptr - chunk_len as u64;

        let chunk = &mut buf[..chunk_len];

        self.input.seek(SeekFrom::Start(start))?;
        self.input.read_exact(chunk)?;
        chunk.reverse();

        // Only move once the chunk was successfully read.
        self.ptr = start;

        Ok(chunk_len)
    }
}
152
/// A wrapper around a mutable [`Vec`] reference that only exposes operations
/// appending to the vector (or reserving capacity for it).
///
/// Handing out this view instead of the vector itself lets the owner keep
/// data structure invariants that rely on already-written items not being
/// altered or removed.
///
/// As a convenience, it also implements [`Write`] whenever the underlying
/// vector does.
pub struct AppendOnlyView<'v, T> {
    inner: &'v mut Vec<T>,
}

impl<'v, T> AppendOnlyView<'v, T> {
    /// Wrap the given vector for the duration of the borrow.
    #[inline(always)]
    pub(crate) fn new(inner: &'v mut Vec<T>) -> Self {
        Self { inner }
    }

    /// Append a single item, see [`Vec::push`].
    #[inline(always)]
    pub fn push(&mut self, value: T) {
        self.inner.push(value);
    }

    /// Append clones of all the items of `other`, see
    /// [`Vec::extend_from_slice`].
    #[inline(always)]
    pub fn extend_from_slice(&mut self, other: &[T])
    where
        T: Clone,
    {
        self.inner.extend_from_slice(other);
    }

    /// Reserve room for at least `additional` more items, see
    /// [`Vec::reserve`].
    #[inline(always)]
    pub fn reserve(&mut self, additional: usize) {
        self.inner.reserve(additional);
    }

    /// Reserve room for `additional` more items, see [`Vec::reserve_exact`].
    #[inline(always)]
    pub fn reserve_exact(&mut self, additional: usize) {
        self.inner.reserve_exact(additional);
    }
}

impl<T> Extend<T> for AppendOnlyView<'_, T> {
    /// Append every item produced by `iter` to the underlying vector.
    #[inline(always)]
    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
        self.inner.extend(iter);
    }
}

/// Every method is forwarded to the [`Write`] implementation of the
/// underlying vector.
impl<T> Write for AppendOnlyView<'_, T>
where
    Vec<T>: Write,
{
    #[inline(always)]
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.inner.write(buf)
    }

    #[inline(always)]
    fn write_vectored(&mut self, bufs: &[io::IoSlice<'_>]) -> io::Result<usize> {
        self.inner.write_vectored(bufs)
    }

    #[inline(always)]
    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
        self.inner.write_all(buf)
    }

    #[inline(always)]
    fn write_fmt(&mut self, fmt: std::fmt::Arguments<'_>) -> io::Result<()> {
        self.inner.write_fmt(fmt)
    }

    #[inline(always)]
    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}
227
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that unescaping `input` (with `"` as quote) yields `expected`.
    fn assert_unescaped(input: &[u8], expected: &[u8]) {
        assert_eq!(unescape(input, b'"'), Cow::Borrowed(expected));
    }

    #[test]
    fn test_unquoted() {
        let cell = b"\"\"\"98469465\"\",\"";
        let expected: &[u8] = b"\"\"98469465\"\",";

        assert_eq!(unquoted(cell, b'"'), Some(expected));
    }

    #[test]
    fn test_unescape() {
        assert_unescaped(b"test", b"test");
        assert_unescaped(b"\"\"hello\"\"", b"\"hello\"");
        assert_unescaped(b"this is \"\"hello\"\" then?", b"this is \"hello\" then?");

        // It should remain safe with incomplete/invalid data
        assert_unescaped(b"goettigen\"\"", b"goettigen\"");
        assert_unescaped(b"goettigen\"", b"goettigen\"");
        assert_unescaped(b"goettigen\"whatever", b"goettigen\"whatever");

        assert_unescaped(b"\"\"98469465\"\",", b"\"98469465\",");
    }

    #[test]
    fn test_unescape_to() {
        // The same scratch buffer is reused (and cleared) across all cases.
        let mut scratch = Vec::new();

        let mut check = |input: &[u8], expected: &[u8]| {
            scratch.clear();
            unescape_to(input, b'"', &mut scratch);
            assert_eq!(scratch, expected);
        };

        check(b"test", b"test");
        check(b"\"\"hello\"\"", b"\"hello\"");
        check(b"this is \"\"hello\"\" then?", b"this is \"hello\" then?");

        // It should remain safe with incomplete/invalid data
        check(b"goettigen\"\"", b"goettigen\"");
        check(b"goettigen\"", b"goettigen\"");
        check(b"goettigen\"whatever", b"goettigen\"whatever");
    }
}