Skip to main content

simd_csv/
writer.rs

1use std::io::{self, BufWriter, IntoInnerError, Write};
2
3use memchr::memchr;
4
5use crate::error::{self, Error, ErrorKind};
6use crate::records::{ByteRecord, ZeroCopyByteRecord};
7
/// Builds a [`Writer`] with given configuration.
///
/// Every option starts from the value given by the [`Default`]
/// implementation and can be overridden through the setter methods before
/// calling [`WriterBuilder::from_writer`].
#[derive(Debug, Clone)]
pub struct WriterBuilder {
    /// Byte written between two cells of a record.
    delimiter: u8,
    /// Byte wrapped around cells that need quoting (doubled to escape itself).
    quote: u8,
    /// Capacity, in bytes, handed to the writer's internal [`BufWriter`].
    buffer_capacity: usize,
    /// Whether records may have a different number of fields than the first one.
    flexible: bool,
    /// Whether records are terminated by `\r\n` rather than `\n`.
    crlf: bool,
}
16
17impl Default for WriterBuilder {
18    fn default() -> Self {
19        Self {
20            delimiter: b',',
21            quote: b'"',
22            buffer_capacity: 8192,
23            flexible: false,
24            crlf: false,
25        }
26    }
27}
28
29impl WriterBuilder {
30    /// Create a new [`WriterBuilder`] with default configuration.
31    pub fn new() -> Self {
32        Self::default()
33    }
34
35    /// Create a new [`WriterBuilder`] with provided `capacity`.
36    pub fn with_capacity(capacity: usize) -> Self {
37        let mut builder = Self::default();
38        builder.buffer_capacity(capacity);
39        builder
40    }
41
42    /// Set the delimiter to be used by the created [`Writer`].
43    ///
44    /// This delimiter must be a single byte.
45    ///
46    /// Will default to a comma.
47    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
48        self.delimiter = delimiter;
49        self
50    }
51
52    /// Set the quote char to be used by the created [`Writer`].
53    ///
54    /// This char must be a single byte.
55    ///
56    /// Will default to a double quote.
57    pub fn quote(&mut self, quote: u8) -> &mut Self {
58        self.quote = quote;
59        self
60    }
61
62    /// Indicate that the created [`Writer`] should use CRLF newlines.
63    pub fn crlf_newlines(&mut self, yes: bool) -> &mut Self {
64        self.crlf = yes;
65        self
66    }
67
68    /// Set the capacity of the created [`Writer`]'s buffered writer.
69    pub fn buffer_capacity(&mut self, capacity: usize) -> &mut Self {
70        self.buffer_capacity = capacity;
71        self
72    }
73
74    /// Indicate whether the created [`Writer`] should be "flexible", i.e.
75    /// whether it should allow writing records having different number of
76    /// fields than the first one.
77    ///
78    /// Will default to `false`.
79    pub fn flexible(&mut self, yes: bool) -> &mut Self {
80        self.flexible = yes;
81        self
82    }
83
84    /// Create a new [`Writer`] using the provided writer implementing
85    /// [`std::io::Write`].
86    pub fn from_writer<W: Write>(&self, writer: W) -> Writer<W> {
87        let mut must_quote = [false; 256];
88        must_quote[b'\r' as usize] = true;
89        must_quote[b'\n' as usize] = true;
90        must_quote[self.delimiter as usize] = true;
91        must_quote[self.quote as usize] = true;
92
93        Writer {
94            delimiter: self.delimiter,
95            quote: self.quote,
96            line_terminator: if self.crlf { b"\r\n" } else { b"\n" },
97            buf_writer: BufWriter::with_capacity(self.buffer_capacity, writer),
98            flexible: self.flexible,
99            field_count: None,
100            must_quote,
101        }
102    }
103}
104
/// An already configured CSV writer.
///
/// # Configuration
///
/// To configure a [`Writer`] (for instance if you need a custom delimiter, or
/// if you want to tweak the size of the inner buffer), check out the
/// [`WriterBuilder`].
#[derive(Debug)]
pub struct Writer<W: Write> {
    /// Byte written between two cells of a record.
    delimiter: u8,
    /// Byte wrapped around quoted cells and doubled to escape itself.
    quote: u8,
    /// Bytes appended after each record (`\n` or `\r\n`).
    line_terminator: &'static [u8],
    /// Buffered wrapper around the destination writer.
    buf_writer: BufWriter<W>,
    /// When `true`, the number of fields per record is not checked.
    flexible: bool,
    /// Number of fields of the first checked record, once known.
    field_count: Option<usize>,
    /// Lookup table indexed by byte value: `true` if the byte forces quoting.
    must_quote: [bool; 256],
}
121
122impl<W: Write> Writer<W> {
123    /// Create a new writer with default configuration using the provided writer
124    /// implementing [`std::io::Write`].
125    ///
126    /// Avoid providing a buffered writer because buffering will be handled for
127    /// you by the [`Writer`].
128    pub fn from_writer(writer: W) -> Self {
129        WriterBuilder::new().from_writer(writer)
130    }
131
132    /// Flush the underlying [`BufWriter`].
133    #[inline(always)]
134    pub fn flush(&mut self) -> io::Result<()> {
135        self.buf_writer.flush()
136    }
137
138    #[inline]
139    fn check_field_count(&mut self, written: usize) -> error::Result<()> {
140        if self.flexible {
141            return Ok(());
142        }
143
144        match self.field_count {
145            Some(expected) => {
146                if written != expected {
147                    return Err(Error::new(ErrorKind::UnequalLengths {
148                        expected_len: expected,
149                        len: written,
150                        pos: None,
151                    }));
152                }
153            }
154            None => {
155                self.field_count = Some(written);
156            }
157        }
158
159        Ok(())
160    }
161
162    /// Write the given "record" while foregoing any quoting/escaping.
163    ///
164    /// This method accepts any item implementing [`IntoIterator`] and yielding
165    /// references to byte slices.
166    ///
167    /// **BEWARE**: if written data needed escaping, invalid CSV will be
168    /// written!
169    ///
170    /// Only use this method when you can guarantee you are doing the right
171    /// thing and want the extra performance.
172    pub fn write_record_no_quoting<I, T>(&mut self, record: I) -> error::Result<()>
173    where
174        I: IntoIterator<Item = T>,
175        T: AsRef<[u8]>,
176    {
177        let mut first = true;
178        let mut written: usize = 0;
179        let mut empty = false;
180
181        for cell in record.into_iter() {
182            if first {
183                first = false;
184            } else {
185                self.buf_writer.write_all(&[self.delimiter])?;
186            }
187
188            let cell = cell.as_ref();
189
190            if cell.is_empty() {
191                empty = true;
192            }
193
194            self.buf_writer.write_all(cell)?;
195
196            written += 1;
197        }
198
199        if written == 1 && empty {
200            self.buf_writer.write_all(&[self.quote, self.quote])?;
201        }
202
203        self.check_field_count(written)?;
204
205        self.buf_writer.write_all(self.line_terminator)?;
206
207        Ok(())
208    }
209
210    /// Write the given [`ByteRecord`] while foregoing any quoting/escaping.
211    ///
212    /// **BEWARE**: if written data needed escaping, invalid CSV will be
213    /// written!
214    ///
215    /// Only use this method when you can guarantee you are doing the right
216    /// thing and want the extra performance.
217    #[inline(always)]
218    pub fn write_byte_record_no_quoting(&mut self, record: &ByteRecord) -> error::Result<()> {
219        self.write_record_no_quoting(record.iter())
220    }
221
222    #[inline]
223    fn should_quote(&self, mut cell: &[u8]) -> bool {
224        // This strategy comes directly from `rust-csv`
225        let mut yes = false;
226        while !yes && cell.len() >= 8 {
227            yes = self.must_quote[cell[0] as usize]
228                || self.must_quote[cell[1] as usize]
229                || self.must_quote[cell[2] as usize]
230                || self.must_quote[cell[3] as usize]
231                || self.must_quote[cell[4] as usize]
232                || self.must_quote[cell[5] as usize]
233                || self.must_quote[cell[6] as usize]
234                || self.must_quote[cell[7] as usize];
235            cell = &cell[8..];
236        }
237        yes || cell.iter().any(|&b| self.must_quote[b as usize])
238    }
239
240    fn write_quoted_cell(&mut self, cell: &[u8]) -> error::Result<()> {
241        self.buf_writer.write_all(&[self.quote])?;
242
243        let mut i: usize = 0;
244
245        if cell.len() < 8 {
246            while i < cell.len() {
247                match cell[i..].iter().copied().position(|b| b == self.quote) {
248                    None => {
249                        self.buf_writer.write_all(&cell[i..])?;
250                        break;
251                    }
252                    Some(offset) => {
253                        self.buf_writer.write_all(&cell[i..i + offset + 1])?;
254                        self.buf_writer.write_all(&[self.quote])?;
255                        i += offset + 1;
256                    }
257                }
258            }
259        } else {
260            while i < cell.len() {
261                match memchr(self.quote, &cell[i..]) {
262                    None => {
263                        self.buf_writer.write_all(&cell[i..])?;
264                        break;
265                    }
266                    Some(offset) => {
267                        self.buf_writer.write_all(&cell[i..i + offset + 1])?;
268                        self.buf_writer.write_all(&[self.quote])?;
269                        i += offset + 1;
270                    }
271                };
272            }
273        }
274
275        self.buf_writer.write_all(&[self.quote])?;
276
277        Ok(())
278    }
279
280    /// Write the given "record".
281    ///
282    /// This method accepts any item implementing [`IntoIterator`] and yielding
283    /// references to byte slices.
284    pub fn write_record<I, T>(&mut self, record: I) -> error::Result<()>
285    where
286        I: IntoIterator<Item = T>,
287        T: AsRef<[u8]>,
288    {
289        let mut first = true;
290        let mut written: usize = 0;
291        let mut empty = false;
292
293        for cell in record.into_iter() {
294            if first {
295                first = false;
296            } else {
297                self.buf_writer.write_all(&[self.delimiter])?;
298            }
299
300            let cell = cell.as_ref();
301
302            if cell.is_empty() {
303                empty = true;
304            }
305
306            if self.should_quote(cell) {
307                self.write_quoted_cell(cell)?;
308            } else {
309                self.buf_writer.write_all(cell)?;
310            }
311
312            written += 1;
313        }
314
315        if written == 1 && empty {
316            self.buf_writer.write_all(&[self.quote, self.quote])?;
317        }
318
319        self.check_field_count(written)?;
320
321        self.buf_writer.write_all(self.line_terminator)?;
322
323        Ok(())
324    }
325
326    /// Write the given [`ByteRecord`].
327    #[inline(always)]
328    pub fn write_byte_record(&mut self, record: &ByteRecord) -> error::Result<()> {
329        self.write_record(record.iter())
330    }
331
332    /// Write the given [`ZeroCopyByteRecord`] using a fast path if reader & writer
333    /// have matching delimiter & quote.
334    ///
335    /// A [`ZeroCopyByteRecord`] keeps track of its quote, but not the delimiter so
336    /// this one must be provided to the method.
337    #[inline]
338    pub fn write_zero_copy_byte_record(
339        &mut self,
340        delimiter: u8,
341        record: &ZeroCopyByteRecord,
342    ) -> error::Result<()> {
343        if self.delimiter == delimiter && record.quote == self.quote {
344            self.buf_writer.write_all(record.as_slice())?;
345            self.buf_writer.write_all(self.line_terminator)?;
346        } else {
347            self.write_record(record.unescaped_iter())?;
348        }
349
350        Ok(())
351    }
352
353    /// Same as [`Self::write_zero_copy_byte_record`], but only write the
354    /// given selection of cell indices.
355    #[inline]
356    pub fn write_zero_copy_byte_record_indices(
357        &mut self,
358        delimiter: u8,
359        record: &ZeroCopyByteRecord,
360        indices: &[usize],
361    ) -> error::Result<()> {
362        if self.delimiter == delimiter && record.quote == self.quote {
363            self.write_record_no_quoting(indices.iter().copied().map(|i| &record[i]))?;
364        } else {
365            self.write_record(indices.iter().copied().map(|i| record.unescape(i).unwrap()))?;
366        }
367
368        Ok(())
369    }
370
371    /// Write the given byte slice, as-is, without quoting/escaping, with an
372    /// added newline.
373    ///
374    /// **BEWARE**: if written data needed escaping, invalid CSV will be
375    /// written!
376    ///
377    /// This method can typically be used with slices yielded by
378    /// [`Splitter.split_record`](crate::Splitter::split_record).
379    #[inline(always)]
380    pub fn write_splitted_record(&mut self, record: &[u8]) -> error::Result<()> {
381        self.buf_writer.write_all(record)?;
382        self.buf_writer.write_all(self.line_terminator)?;
383
384        Ok(())
385    }
386
387    /// Attempt to unwrap the underlying [`BufWriter`] by flusing it and
388    /// returning the original writer.
389    #[inline]
390    pub fn into_inner(self) -> Result<W, IntoInnerError<BufWriter<W>>> {
391        self.buf_writer.into_inner()
392    }
393}
394
#[cfg(test)]
mod tests {
    use std::io::{self, Cursor};

    use super::*;

    // Mixes unquoted and quoted writes and checks the exact bytes produced,
    // including delimiter, newline and quote escaping.
    #[test]
    fn test_write_byte_record() -> io::Result<()> {
        let sink = Cursor::new(Vec::<u8>::new());
        let mut writer = WriterBuilder::with_capacity(32).from_writer(sink);

        writer.write_byte_record_no_quoting(&brec!["name", "surname", "age"])?;
        writer.write_byte_record(&brec!["john,", "landis", "45"])?;
        writer.write_byte_record(&brec!["lucy", "get\ngot", "\"te,\"st\""])?;

        let sink = writer.into_inner()?;
        let written = std::str::from_utf8(sink.get_ref()).unwrap();

        assert_eq!(
            written,
            "name,surname,age\n\"john,\",landis,45\nlucy,\"get\ngot\",\"\"\"te,\"\"st\"\"\"\n",
        );

        Ok(())
    }

    // Empty cells are written bare, except for a record made of a single
    // empty cell, which is written as a pair of quotes.
    #[test]
    fn test_write_empty_cells() {
        // Serialize one record with a default writer and return the text.
        fn render(record: &ByteRecord) -> String {
            let mut writer = Writer::from_writer(Cursor::new(Vec::<u8>::new()));
            writer.write_byte_record(record).unwrap();

            let bytes = writer.into_inner().unwrap().into_inner();
            String::from_utf8_lossy(&bytes).into_owned()
        }

        assert_eq!(render(&brec![]), "\n");
        assert_eq!(render(&brec![""]), "\"\"\n");
        assert_eq!(render(&brec!["", "", ""]), ",,\n");
        assert_eq!(render(&brec!["name", "", "age"]), "name,,age\n");
        assert_eq!(render(&brec!["name", ""]), "name,\n");
    }

    // Covers the short-cell scan, the 8-byte block scan and its remainder.
    #[test]
    fn should_quote() {
        let writer = Writer::from_writer(Cursor::new(Vec::<u8>::new()));

        assert!(!writer.should_quote(b"test"));
        assert!(writer.should_quote(b"test,"));
        assert!(writer.should_quote(b"te\"st"));
        assert!(writer.should_quote(b"te\nst"));
        assert!(writer.should_quote(b"testtesttesttesttesttesttesttest\n"));
        assert!(writer.should_quote(b"te\rst"));
    }
}