simd_csv/
writer.rs

1use std::io::{self, BufWriter, IntoInnerError, Write};
2
3use memchr::memchr;
4
5use crate::error::{self, Error, ErrorKind};
6use crate::records::ByteRecord;
7
/// Configuration holder used to create a [`Writer`].
///
/// Tweak the settings through the setter methods, then call
/// `from_writer` to obtain a ready-to-use [`Writer`].
pub struct WriterBuilder {
    /// Byte written between two consecutive fields of a record.
    delimiter: u8,
    /// Byte used to wrap fields needing quoting, and doubled to escape itself.
    quote: u8,
    /// Capacity, in bytes, of the buffered writer wrapping the output.
    buffer_capacity: usize,
    /// Whether records may have a different number of fields than the first
    /// written one.
    flexible: bool,
}
15
16impl Default for WriterBuilder {
17    fn default() -> Self {
18        Self {
19            delimiter: b',',
20            quote: b'"',
21            buffer_capacity: 8192,
22            flexible: false,
23        }
24    }
25}
26
27impl WriterBuilder {
28    /// Create a new [`WriterBuilder`] with default configuration.
29    pub fn new() -> Self {
30        Self::default()
31    }
32
33    /// Create a new [`WriterBuilder`] with provided `capacity`.
34    pub fn with_capacity(capacity: usize) -> Self {
35        let mut builder = Self::default();
36        builder.buffer_capacity(capacity);
37        builder
38    }
39
40    /// Set the delimiter to be used by the created [`Writer`].
41    ///
42    /// This delimiter must be a single byte.
43    ///
44    /// Will default to a comma.
45    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
46        self.delimiter = delimiter;
47        self
48    }
49
50    /// Set the quote char to be used by the created [`Writer`].
51    ///
52    /// This char must be a single byte.
53    ///
54    /// Will default to a double quote.
55    pub fn quote(&mut self, quote: u8) -> &mut Self {
56        self.quote = quote;
57        self
58    }
59
60    /// Set the capacity of the created [`Writer`]'s buffered writer.
61    pub fn buffer_capacity(&mut self, capacity: usize) -> &mut Self {
62        self.buffer_capacity = capacity;
63        self
64    }
65
66    /// Indicate whether the created [`Writer`] should be "flexible", i.e.
67    /// whether it should allow writing records having different number of
68    /// fields than the first one.
69    ///
70    /// Will default to `false`.
71    pub fn flexible(&mut self, yes: bool) -> &mut Self {
72        self.flexible = yes;
73        self
74    }
75
76    /// Create a new [`Writer`] using the provided writer implementing
77    /// [`std::io::Write`].
78    pub fn from_writer<W: Write>(&self, writer: W) -> Writer<W> {
79        let mut must_quote = [false; 256];
80        must_quote[b'\r' as usize] = true;
81        must_quote[b'\n' as usize] = true;
82        must_quote[self.delimiter as usize] = true;
83        must_quote[self.quote as usize] = true;
84
85        Writer {
86            delimiter: self.delimiter,
87            quote: self.quote,
88            buffer: BufWriter::with_capacity(self.buffer_capacity, writer),
89            flexible: self.flexible,
90            field_count: None,
91            must_quote,
92        }
93    }
94}
95
/// An already configured CSV writer.
///
/// # Configuration
///
/// If you need to configure a [`Writer`], for instance to use a custom
/// delimiter or to tweak the size of the inner buffer, check out the
/// [`WriterBuilder`].
pub struct Writer<W: Write> {
    /// Byte written between two consecutive fields of a record.
    delimiter: u8,
    /// Byte wrapping quoted fields, doubled when found inside one.
    quote: u8,
    /// Buffered writer wrapping the output.
    buffer: BufWriter<W>,
    /// When `true`, the number of fields per record is not enforced.
    flexible: bool,
    /// Number of fields of the first checked record, `None` until then.
    field_count: Option<usize>,
    /// Lookup table indexed by byte value: `true` if a field containing this
    /// byte must be quoted.
    must_quote: [bool; 256],
}
111
112impl<W: Write> Writer<W> {
113    /// Create a new writer with default configuration using the provided writer
114    /// implementing [`std::io::Write`].
115    ///
116    /// Avoid providing a buffered writer because buffering will be handled for
117    /// you by the [`Writer`].
118    pub fn from_writer(writer: W) -> Self {
119        WriterBuilder::new().from_writer(writer)
120    }
121
122    /// Flush the underlying [`BufWriter`].
123    #[inline(always)]
124    pub fn flush(&mut self) -> io::Result<()> {
125        self.buffer.flush()
126    }
127
128    #[inline]
129    fn check_field_count(&mut self, written: usize) -> error::Result<()> {
130        if self.flexible {
131            return Ok(());
132        }
133
134        match self.field_count {
135            Some(expected) => {
136                if written != expected {
137                    return Err(Error::new(ErrorKind::UnequalLengths {
138                        expected_len: expected,
139                        len: written,
140                        pos: None,
141                    }));
142                }
143            }
144            None => {
145                self.field_count = Some(written);
146            }
147        }
148
149        Ok(())
150    }
151
152    /// Write the given "record" while foregoing any quoting/escaping.
153    ///
154    /// This method accepts any item implementing [`IntoIterator`] and yielding
155    /// references to byte slices.
156    ///
157    /// **BEWARE**: if written data needed escaping, invalid CSV will be
158    /// written!
159    ///
160    /// Only use this method when you can guarantee you are doing the right
161    /// thing and want the extra performance.
162    pub fn write_record_no_quoting<I, T>(&mut self, record: I) -> error::Result<()>
163    where
164        I: IntoIterator<Item = T>,
165        T: AsRef<[u8]>,
166    {
167        let mut first = true;
168        let mut written: usize = 0;
169        let mut empty = false;
170
171        for cell in record.into_iter() {
172            if first {
173                first = false;
174            } else {
175                self.buffer.write_all(&[self.delimiter])?;
176            }
177
178            let cell = cell.as_ref();
179
180            if cell.is_empty() {
181                empty = true;
182            }
183
184            self.buffer.write_all(cell)?;
185
186            written += 1;
187        }
188
189        if written == 1 && empty {
190            self.buffer.write_all(&[self.quote, self.quote])?;
191        }
192
193        self.check_field_count(written)?;
194
195        self.buffer.write_all(b"\n")?;
196
197        Ok(())
198    }
199
200    /// Write the given [`ByteRecord`] while foregoing any quoting/escaping.
201    ///
202    /// **BEWARE**: if written data needed escaping, invalid CSV will be
203    /// written!
204    ///
205    /// Only use this method when you can guarantee you are doing the right
206    /// thing and want the extra performance.
207    #[inline(always)]
208    pub fn write_byte_record_no_quoting(&mut self, record: &ByteRecord) -> error::Result<()> {
209        self.write_record_no_quoting(record.iter())
210    }
211
212    #[inline]
213    fn should_quote(&self, mut cell: &[u8]) -> bool {
214        // This strategy comes directly from `rust-csv`
215        let mut yes = false;
216        while !yes && cell.len() >= 8 {
217            yes = self.must_quote[cell[0] as usize]
218                || self.must_quote[cell[1] as usize]
219                || self.must_quote[cell[2] as usize]
220                || self.must_quote[cell[3] as usize]
221                || self.must_quote[cell[4] as usize]
222                || self.must_quote[cell[5] as usize]
223                || self.must_quote[cell[6] as usize]
224                || self.must_quote[cell[7] as usize];
225            cell = &cell[8..];
226        }
227        yes || cell.iter().any(|&b| self.must_quote[b as usize])
228    }
229
230    fn write_quoted_cell(&mut self, cell: &[u8]) -> error::Result<()> {
231        self.buffer.write_all(&[self.quote])?;
232
233        let mut i: usize = 0;
234
235        if cell.len() < 8 {
236            while i < cell.len() {
237                match cell[i..].iter().copied().position(|b| b == self.quote) {
238                    None => {
239                        self.buffer.write_all(&cell[i..])?;
240                        break;
241                    }
242                    Some(offset) => {
243                        self.buffer.write_all(&cell[i..i + offset + 1])?;
244                        self.buffer.write_all(&[self.quote])?;
245                        i += offset + 1;
246                    }
247                }
248            }
249        } else {
250            while i < cell.len() {
251                match memchr(self.quote, &cell[i..]) {
252                    None => {
253                        self.buffer.write_all(&cell[i..])?;
254                        break;
255                    }
256                    Some(offset) => {
257                        self.buffer.write_all(&cell[i..i + offset + 1])?;
258                        self.buffer.write_all(&[self.quote])?;
259                        i += offset + 1;
260                    }
261                };
262            }
263        }
264
265        self.buffer.write_all(&[self.quote])?;
266
267        Ok(())
268    }
269
270    /// Write the given "record".
271    ///
272    /// This method accepts any item implementing [`IntoIterator`] and yielding
273    /// references to byte slices.
274    pub fn write_record<I, T>(&mut self, record: I) -> error::Result<()>
275    where
276        I: IntoIterator<Item = T>,
277        T: AsRef<[u8]>,
278    {
279        let mut first = true;
280        let mut written: usize = 0;
281        let mut empty = false;
282
283        for cell in record.into_iter() {
284            if first {
285                first = false;
286            } else {
287                self.buffer.write_all(&[self.delimiter])?;
288            }
289
290            let cell = cell.as_ref();
291
292            if cell.is_empty() {
293                empty = true;
294            }
295
296            if self.should_quote(cell) {
297                self.write_quoted_cell(cell)?;
298            } else {
299                self.buffer.write_all(cell)?;
300            }
301
302            written += 1;
303        }
304
305        if written == 1 && empty {
306            self.buffer.write_all(&[self.quote, self.quote])?;
307        }
308
309        self.check_field_count(written)?;
310
311        self.buffer.write_all(b"\n")?;
312
313        Ok(())
314    }
315
316    /// Write the given [`ByteRecord`].
317    #[inline(always)]
318    pub fn write_byte_record(&mut self, record: &ByteRecord) -> error::Result<()> {
319        self.write_record(record.iter())
320    }
321
322    /// Write the given byte slice, as-is, without quoting/escaping, with an
323    /// added newline.
324    ///
325    /// **BEWARE**: if written data needed escaping, invalid CSV will be
326    /// written!
327    ///
328    /// This method can typically be used with slices yielded by
329    /// [`Splitter.split_record`](crate::Splitter::split_record).
330    #[inline(always)]
331    pub fn write_splitted_record(&mut self, record: &[u8]) -> error::Result<()> {
332        self.buffer.write_all(record)?;
333        self.buffer.write_all(b"\n")?;
334
335        Ok(())
336    }
337
338    /// Attempt to unwrap the underlying [`BufWriter`] by flusing it and
339    /// returning the original writer.
340    #[inline]
341    pub fn into_inner(self) -> Result<W, IntoInnerError<BufWriter<W>>> {
342        self.buffer.into_inner()
343    }
344}
345
#[cfg(test)]
mod tests {
    use std::io::{self, Cursor};

    use super::*;

    // Mixes unquoted and quoted writes through a deliberately small buffer.
    #[test]
    fn test_write_byte_record() -> io::Result<()> {
        let sink = Cursor::new(Vec::<u8>::new());
        let mut writer = WriterBuilder::with_capacity(32).from_writer(sink);

        writer.write_byte_record_no_quoting(&brec!["name", "surname", "age"])?;
        writer.write_byte_record(&brec!["john,", "landis", "45"])?;
        writer.write_byte_record(&brec!["lucy", "get\ngot", "\"te,\"st\""])?;

        let cursor = writer.into_inner()?;
        let written = std::str::from_utf8(cursor.get_ref()).unwrap();

        assert_eq!(
            written,
            "name,surname,age\n\"john,\",landis,45\nlucy,\"get\ngot\",\"\"\"te,\"\"st\"\"\"\n",
        );

        Ok(())
    }

    // Empty cells are written bare, except when a record is made of a single
    // empty cell.
    #[test]
    fn test_write_empty_cells() {
        // Serialize a single record with a default writer.
        fn render(record: &ByteRecord) -> String {
            let mut writer = Writer::from_writer(Cursor::new(Vec::<u8>::new()));
            writer.write_byte_record(record).unwrap();

            let bytes = writer.into_inner().unwrap().into_inner();
            String::from_utf8_lossy(&bytes).into_owned()
        }

        assert_eq!(render(&brec![]), "\n");
        assert_eq!(render(&brec![""]), "\"\"\n");
        assert_eq!(render(&brec!["", "", ""]), ",,\n");
        assert_eq!(render(&brec!["name", "", "age"]), "name,,age\n");
        assert_eq!(render(&brec!["name", ""]), "name,\n");
    }

    // Covers cells shorter and longer than 8 bytes.
    #[test]
    fn should_quote() {
        let writer = Writer::from_writer(Cursor::new(Vec::<u8>::new()));

        assert!(!writer.should_quote(b"test"));
        assert!(writer.should_quote(b"test,"));
        assert!(writer.should_quote(b"te\"st"));
        assert!(writer.should_quote(b"te\nst"));
        assert!(writer.should_quote(b"testtesttesttesttesttesttesttest\n"));
        assert!(writer.should_quote(b"te\rst"));
    }
}