simd_csv/
splitter.rs

1use std::io::Read;
2
3use crate::buffer::ScratchBuffer;
4use crate::core::{CoreReader, ReadResult};
5use crate::error;
6use crate::utils::trim_trailing_crlf;
7
/// Configuration for building a [`Splitter`].
///
/// Holds the delimiter byte, the quote byte and an optional capacity for the
/// splitter's internal buffer. The `Default` implementation uses `,` as the
/// delimiter, `"` as the quote and leaves the capacity unset.
#[derive(Debug, Clone)]
pub struct SplitterBuilder {
    /// Byte separating cells within a record.
    delimiter: u8,
    /// Byte used to quote cells.
    quote: u8,
    /// Requested capacity of the internal buffer; `None` lets the buffer
    /// pick its own.
    buffer_capacity: Option<usize>,
}
13
14impl Default for SplitterBuilder {
15    fn default() -> Self {
16        Self {
17            delimiter: b',',
18            quote: b'"',
19            buffer_capacity: None,
20        }
21    }
22}
23
24impl SplitterBuilder {
25    pub fn new() -> Self {
26        Self::default()
27    }
28
29    pub fn with_capacity(capacity: usize) -> Self {
30        let mut splitter = Self::default();
31        splitter.buffer_capacity(capacity);
32        splitter
33    }
34
35    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
36        self.delimiter = delimiter;
37        self
38    }
39
40    pub fn quote(&mut self, quote: u8) -> &mut Self {
41        self.quote = quote;
42        self
43    }
44
45    pub fn buffer_capacity(&mut self, capacity: usize) -> &mut Self {
46        self.buffer_capacity = Some(capacity);
47        self
48    }
49
50    pub fn from_reader<R: Read>(&self, reader: R) -> Splitter<R> {
51        Splitter {
52            buffer: ScratchBuffer::with_optional_capacity(self.buffer_capacity, reader),
53            inner: CoreReader::new(self.delimiter, self.quote),
54        }
55    }
56}
57
/// Splits input read from `R` into raw records.
///
/// Pairs a [`ScratchBuffer`] wrapping the reader with a [`CoreReader`]
/// configured with the delimiter and quote bytes.
pub struct Splitter<R> {
    // Buffer wrapping the underlying reader; supplies the bytes to scan.
    buffer: ScratchBuffer<R>,
    // Core reader whose `split_record` reports a result kind and a position
    // for each chunk of buffered bytes.
    inner: CoreReader,
}
62
63impl<R: Read> Splitter<R> {
64    pub fn from_reader(reader: R) -> Self {
65        SplitterBuilder::new().from_reader(reader)
66    }
67
68    #[inline(always)]
69    pub fn strip_bom(&mut self) -> error::Result<()> {
70        self.buffer.strip_bom()?;
71        Ok(())
72    }
73
74    pub fn count_records(&mut self) -> error::Result<u64> {
75        use ReadResult::*;
76
77        let mut count: u64 = 0;
78
79        loop {
80            let input = self.buffer.fill_buf()?;
81
82            let (result, pos) = self.inner.split_record(input);
83
84            self.buffer.consume(pos);
85
86            match result {
87                End => break,
88                InputEmpty | Cr | Lf => continue,
89                Record => {
90                    count += 1;
91                }
92            };
93        }
94
95        Ok(count)
96    }
97
98    pub fn split_record(&mut self) -> error::Result<Option<&[u8]>> {
99        use ReadResult::*;
100
101        self.buffer.reset();
102
103        loop {
104            let input = self.buffer.fill_buf()?;
105
106            let (result, pos) = self.inner.split_record(input);
107
108            match result {
109                End => {
110                    self.buffer.consume(pos);
111                    return Ok(None);
112                }
113                Cr | Lf => {
114                    self.buffer.consume(pos);
115                }
116                InputEmpty => {
117                    self.buffer.save();
118                }
119                Record => {
120                    return Ok(Some(trim_trailing_crlf(self.buffer.flush(pos))));
121                }
122            };
123        }
124    }
125}
126
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use super::*;

    /// Builds a splitter over `data` with the given buffer capacity and
    /// delimiter byte.
    fn make_splitter(data: &str, capacity: usize, delimiter: u8) -> Splitter<Cursor<&str>> {
        let mut builder = SplitterBuilder::with_capacity(capacity);
        builder.delimiter(delimiter);
        builder.from_reader(Cursor::new(data))
    }

    /// Counts records through `Splitter::count_records` with an explicit
    /// delimiter.
    fn count_records_with(data: &str, capacity: usize, delimiter: u8) -> u64 {
        make_splitter(data, capacity, delimiter)
            .count_records()
            .unwrap()
    }

    /// Counts records by draining `Splitter::split_record` with an explicit
    /// delimiter.
    fn split_records_with(data: &str, capacity: usize, delimiter: u8) -> u64 {
        let mut splitter = make_splitter(data, capacity, delimiter);
        let mut count: u64 = 0;

        while splitter.split_record().unwrap().is_some() {
            count += 1;
        }

        count
    }

    /// Comma-delimited shorthand for `count_records_with`.
    fn count_records(data: &str, capacity: usize) -> u64 {
        count_records_with(data, capacity, b',')
    }

    /// Comma-delimited shorthand for `split_records_with`.
    fn split_records(data: &str, capacity: usize) -> u64 {
        split_records_with(data, capacity, b',')
    }

    #[test]
    fn test_count() {
        // Empty
        assert_eq!(count_records("", 1024), 0);

        // Single cells with various empty lines
        let tests = [
            "name\njohn\nlucy",
            "name\njohn\nlucy\n",
            "name\n\njohn\r\nlucy\n",
            "name\n\njohn\r\nlucy\n\n",
            "name\n\n\njohn\r\n\r\nlucy\n\n\n",
            "\nname\njohn\nlucy",
            "\n\nname\njohn\nlucy",
            "\r\n\r\nname\njohn\nlucy",
            "name\njohn\nlucy\r\n",
            "name\njohn\nlucy\r\n\r\n",
        ];

        // Small capacities force records to straddle buffer refills.
        for capacity in [32usize, 4, 3, 2, 1] {
            for test in tests.iter() {
                assert_eq!(
                    count_records(test, capacity),
                    3,
                    "capacity={} string={:?}",
                    capacity,
                    test
                );
            }
        }

        // Multiple cells
        let data = "name,surname,age\njohn,landy,45\nlucy,rose,67";
        assert_eq!(count_records(data, 1024), 3);
        assert_eq!(split_records(data, 1024), 3);

        // Quoting
        for capacity in [1024usize, 32usize, 4, 3, 2, 1] {
            let data = "name,surname,age\n\"john\",\"landy, the \"\"everlasting\"\" bastard\",45\nlucy,rose,\"67\"\njermaine,jackson,\"89\"\n\nkarine,loucan,\"52\"\r\n";

            assert_eq!(count_records(data, capacity), 5, "capacity={}", capacity);
            assert_eq!(split_records(data, capacity), 5, "capacity={}", capacity);
        }

        // Different separator: the splitter must actually be configured with
        // the tab delimiter, otherwise this case only re-tests the default.
        let data = "name\tsurname\tage\njohn\tlandy\t45\nlucy\trose\t67";
        assert_eq!(count_records_with(data, 1024, b'\t'), 3);
        assert_eq!(split_records_with(data, 1024, b'\t'), 3);
    }

    #[test]
    fn test_empty_row() -> error::Result<()> {
        // A quoted empty cell (`""`) is a record of its own, unlike a bare
        // empty line.
        let data = "name\n\"\"\nlucy\n\"\"";

        // Counting
        let mut reader = Splitter::from_reader(Cursor::new(data));

        assert_eq!(reader.count_records()?, 4);

        Ok(())
    }
}