rsv_lib/utils/
row_split.rs

// CSV row split module that supports:
// 1. double-quoted fields
// 2. commas inside a double-quoted field
// 3. double quotes in a field escaped by a backslash (\)
// 4. double quotes in a field escaped by a preceding double quote, as discussed in
// https://stackoverflow.com/questions/17808511/how-to-properly-escape-a-double-quote-in-csv

// Works for rows such as:
// v1,v2,v3
// "v1","v2","v3"
// "v1",v2,v3
// "Charles \"Pretty Boy\" Floyd","1 Short St, Smallville"
// "Charles ""Pretty Boy"" Floyd","1 Short St, Smallville"
14
15use std::{iter::Peekable, str::CharIndices};
16
/// Zero-copy iterator over the fields of a single CSV row.
///
/// Every yielded item is a sub-slice of the input row; escape sequences are
/// skipped while scanning but are never rewritten, so they appear verbatim in
/// the output.
///
/// Quote handling:
/// * a quoted field without an embedded separator is yielded without its
///   surrounding quotes (`"v1"` -> `v1`);
/// * a quoted field that contains a separator is yielded verbatim, quotes
///   included (`"a,b"` -> `"a,b"`);
/// * a field that opens with a quote but does not end with one (malformed
///   input) is yielded verbatim.
///
/// An empty row yields exactly one empty field.
#[derive(Debug)]
pub struct CsvRowSplitter<'a> {
    /// The complete row; all yielded fields borrow from it.
    row: &'a str,
    /// Cursor over `row`; peekable so a quote can inspect the following char.
    char_indices: Peekable<CharIndices<'a>>,
    /// Field separator.
    sep: char,
    /// Quote character.
    quote: char,
    /// Set once the last field has been yielded.
    parse_done: bool,
    /// Byte offset in `row` where the current field starts.
    field_start_index: usize,
    /// Whether the current field opened with a quote.
    field_is_quoted: bool,
    /// Whether the current field has a separator inside its quotes.
    field_has_separator: bool,
    /// Whether the cursor is currently inside an open quoted section.
    cur_in_quoted_field: bool,
    /// Whether the cursor sits on the first char of a field.
    cur_is_field_start: bool,
}

impl<'a> CsvRowSplitter<'a> {
    /// Creates a splitter over `row` using `sep` as the field separator and
    /// `quote` as the quote character.
    pub fn new(row: &'a str, sep: char, quote: char) -> CsvRowSplitter<'a> {
        CsvRowSplitter {
            row,
            char_indices: row.char_indices().peekable(),
            sep,
            quote,
            parse_done: false,
            field_start_index: 0,
            field_is_quoted: false,
            field_has_separator: false,
            cur_in_quoted_field: false,
            cur_is_field_start: true,
        }
    }

    /// Resets the per-field state so the next field starts at byte offset
    /// `start_index`.
    fn field_start_set(&mut self, start_index: usize) {
        self.field_start_index = start_index;
        self.field_is_quoted = false;
        self.field_has_separator = false;
        self.cur_in_quoted_field = false;
        self.cur_is_field_start = true;
    }

    /// Returns the current field, which spans from the recorded field start
    /// up to (not including) byte offset `end_index`.
    ///
    /// Surrounding quotes are removed only when the field opened with a
    /// quote, holds no separator, and really ends with a quote. In every
    /// other case the raw text is returned, so malformed input such as a lone
    /// `"` or an unterminated `"abc` can neither be truncated nor produce an
    /// invalid slice.
    fn get_field(&self, end_index: usize) -> &'a str {
        let row: &'a str = self.row;
        // Both offsets come from `char_indices` (or `row.len()`), so they are
        // char boundaries with start <= end; checked slicing keeps that an
        // enforced invariant rather than an unchecked assumption.
        let raw = &row[self.field_start_index..end_index];

        if self.field_is_quoted && !self.field_has_separator {
            let quote = self.quote;
            // `strip_*` works on whole chars, so multi-byte quote characters
            // are handled correctly.
            if let Some(inner) = raw
                .strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote))
            {
                return inner;
            }
        }
        raw
    }

    /// Returns `true` when the cursor is at the end of the row or the next
    /// char is the separator. Does not advance the cursor.
    fn next_char_is_none_or_sep(&mut self) -> bool {
        let sep = self.sep;
        self.char_indices
            .peek()
            .map_or(true, |&(_, upcoming)| upcoming == sep)
    }
}

impl<'a> Iterator for CsvRowSplitter<'a> {
    type Item = &'a str;

    /// Scans forward to the next unquoted separator (or the end of the row)
    /// and yields the field that ends there.
    fn next(&mut self) -> Option<Self::Item> {
        if self.parse_done {
            return None;
        }

        loop {
            let Some((index, c)) = self.char_indices.next() else {
                // End of row: whatever is left is the last field.
                self.parse_done = true;
                return Some(self.get_field(self.row.len()));
            };

            if c == '\\' {
                // skip \ escape, e.g., v1,v2\",v3 is parsed into ["v1", "v2\"", "v3"]
                self.char_indices.next();
            } else if c == self.sep {
                if self.cur_in_quoted_field {
                    self.field_has_separator = true;
                } else {
                    let field = self.get_field(index);
                    // Advance by the separator's encoded width, not by one
                    // byte, so multi-byte separators leave the next field
                    // starting on a char boundary.
                    self.field_start_set(index + c.len_utf8());
                    return Some(field);
                }
            } else if c == self.quote {
                if self.cur_is_field_start {
                    // Opening quote.
                    self.field_is_quoted = true;
                    self.cur_in_quoted_field = true;
                } else if self.next_char_is_none_or_sep() {
                    // A quote directly before a separator or the end of the
                    // row closes the quoted section.
                    self.cur_in_quoted_field = false;
                } else {
                    // skip double-quotes escape, e.g., v1,v2"",v3 is parsed into ["v1", "v2""", "v3"]
                    self.char_indices.next();
                }
            }

            self.cur_is_field_start = false;
        }
    }
}
113
#[cfg(test)]
mod tests {
    // Pull in the splitter from the enclosing module.
    use super::*;

    /// Splits `row` on commas, treating `"` as the quote character.
    fn split(row: &str) -> Vec<&str> {
        CsvRowSplitter::new(row, ',', '"').collect()
    }

    #[test]
    fn test_csv_row_split() {
        // Each entry pairs an input row with the fields it must split into.
        let cases: Vec<(&str, Vec<&str>)> = vec![
            // plain fields, including multi-byte text and empty trailing fields
            ("我们abc,def,12", vec!["我们abc", "def", "12"]),
            ("1,2,3,", vec!["1", "2", "3", ""]),
            (r#"1,2,3,"""#, vec!["1", "2", "3", ""]),
            (r#"1,2,3,"",4"#, vec!["1", "2", "3", "", "4"]),
            (r#"1,2,3,"","4""#, vec!["1", "2", "3", "", "4"]),
            // quoted field
            (r#""1",2,3,"#, vec!["1", "2", "3", ""]),
            // comma in quoted field: the field keeps its quotes
            (
                r#"first,second,"third,fourth",fifth"#,
                vec!["first", "second", r#""third,fourth""#, "fifth"],
            ),
            (
                r#"first,second,"third,fourth","fifth""#,
                vec!["first", "second", r#""third,fourth""#, "fifth"],
            ),
            (
                r#""third,fourth","fifth""#,
                vec![r#""third,fourth""#, "fifth"],
            ),
            // double quote in a field, escaped by a preceding backslash
            (
                r#"third\",fourth,"fifth""#,
                vec![r#"third\""#, "fourth", "fifth"],
            ),
            // double quote in a field, escaped by a preceding double quote
            (
                r#""Charles ""Pretty Boy"" Floyd","1 Short St, Smallville""#,
                vec![
                    r#"Charles ""Pretty Boy"" Floyd"#,
                    r#""1 Short St, Smallville""#,
                ],
            ),
        ];

        for (row, expected) in cases {
            assert_eq!(split(row), expected);
        }
    }
}