// csv_lib/models/in_row_iter.rs

1use memchr::{memchr, memchr2};
2
/// ## Struct InRowIter
/// - A cursor-based view over the raw bytes of a single row, used to walk
///   through (or look up) the fields of that row.
#[derive(Debug)]
pub struct InRowIter<'mmap> {
    /// Raw bytes of the row being processed.
    line: &'mmap [u8],
    /// Byte that separates one field from the next.
    field_separator: u8,
    /// Byte that encloses quoted fields; `0` turns quote handling off.
    string_separator: u8,
    /// Byte offset into `line` at which the next field starts.
    cursor: usize,
}
12
13impl<'mmap> InRowIter<'mmap> {
14    #[inline(always)]
15    /// Creates a new instance of the struct `InRowIter<'a>`
16    pub fn new(line: &'mmap [u8], field_separator: u8, string_separator: u8) -> Self {
17        Self {
18            line,
19            field_separator,
20            string_separator,
21            cursor: 0,
22        }
23    }
24    #[inline(always)]
25    /// ## Inner library Cursor Setter
26    /// - Set a new value for the cursor.
27    pub(crate) fn set_cursor(&mut self, new_index :usize){
28        self.cursor = new_index;
29    }
30    #[inline(always)]
31    /// ## Inner library Cursor Getter
32    /// - Gets the current value of the cursor
33    pub(crate) fn get_cursor(&self) -> usize { self.cursor }
34    #[inline(always)]
35    /// Count the number of fields, that a line haves.
36    pub fn count_fields(&self, delimiter: u8, string_separator: u8) -> usize {
37        let mut count = 0;
38        let mut pos = 0;
39        let len = self.line.len();
40        let check_string = string_separator != 0;
41        let mut in_string = false;
42
43        while pos < len {
44            let slice = &self.line[pos..];
45
46            // Find next interesting byte (delimiter or string_separator)
47            if let Some(next_pos) = if check_string {
48                memchr2(delimiter, string_separator, slice)
49            } else {
50                memchr(delimiter, slice)
51            } {
52                let byte = slice[next_pos];
53
54                if check_string && byte == string_separator {
55                    in_string = !in_string;
56                    pos += next_pos + 1;
57                } else if byte == delimiter && !in_string {
58                    count += 1;
59                    pos += next_pos + 1;
60                } else {
61                    pos += next_pos + 1;
62                }
63            } else {
64                break;
65            }
66        }
67
68        count + 1 // Number of fields = number of delimiters + 1
69    }
70    #[inline(always)]
71    /// Extract the content of a field in raw format.
72    pub fn get_field_index(&mut self, target: usize) -> Option<&'mmap [u8]> {
73        let mut count = 0;
74        //Store the cursor
75        let actual = self.get_cursor();
76        //Put the cursor at the beginning
77        self.set_cursor(0_usize);
78        while let Some(el) = self.next(){
79            if count == target {
80                //restore cursor
81                self.set_cursor(actual);
82                return Some(el);
83            }
84            count = count + 1;
85        }
86        //restore cursor
87        self.set_cursor(actual);
88        None
89    }
90    #[inline(always)]
91    /// ## Peek Field At Index
92    /// - Returns the field at a specific index without modifying the internal cursor.
93    #[allow(unused_assignments)]
94    pub fn peek_field_index(&self, target: usize) -> Option<&'mmap [u8]> {
95        let mut count = 0;
96        let mut pos = 0usize;
97        let slice = self.line;
98        let mut in_string = false;
99        let check_string = self.string_separator != 0;
100
101        let mut field_start = 0usize;
102        let mut field_end = 0usize;
103
104        while pos < slice.len() {
105            let mut start_offset = 0usize;
106            let mut end_offset = 0usize;
107            let start = pos;
108
109            if check_string && slice[pos] == self.string_separator {
110                in_string = true;
111                pos += 1;
112                start_offset = 1;
113            }
114
115            while pos < slice.len() {
116                let byte = slice[pos];
117
118                if check_string && byte == self.string_separator {
119                    if in_string {
120                        if pos + 1 < slice.len() && slice[pos + 1] == self.string_separator {
121                            pos += 2; // Escaped quote
122                            continue;
123                        } else {
124                            in_string = false;
125                            end_offset = 1;
126                            pos += 1;
127                            continue;
128                        }
129                    }
130                }
131
132                if byte == self.field_separator && !in_string {
133                    if count == target {
134                        field_start = start + start_offset;
135                        field_end = pos - end_offset;
136                        return Some(&slice[field_start..field_end]);
137                    } else {
138                        count += 1;
139                        pos += 1;
140                        break;
141                    }
142                }
143
144                pos += 1;
145            }
146        }
147
148        if count == target {
149            let start_offset = if check_string && slice[pos.saturating_sub(1)] == self.string_separator { 1 } else { 0 };
150            let end_offset = if check_string && slice[slice.len().saturating_sub(1)] == self.string_separator { 1 } else { 0 };
151
152            field_start = pos + start_offset;
153            field_end = slice.len() - end_offset;
154            return Some(&slice[field_start.min(field_end)..field_end]);
155        }
156
157        None
158    }
159
160}
161
162impl<'mmap> Iterator for InRowIter<'mmap> {
163    type Item = &'mmap [u8];
164    #[inline(always)]
165    fn next(&mut self) -> Option<Self::Item> {
166        if self.cursor >= self.line.len() {
167            return None;
168        }
169
170        let slice = &self.line[self.cursor..];
171        let mut pos = 0usize;
172        let mut in_string = false;
173        let check_string = self.string_separator != 0;
174        let mut start_offset = 0;
175        let mut end_offset = 0;
176
177        // If the field starts with a string separator, skip it
178        if check_string && !slice.is_empty() && slice[0] == self.string_separator {
179            in_string = true;
180            pos = 1;
181            start_offset = 1;
182        }
183
184        while pos < slice.len() {
185            let byte = slice[pos];
186
187            if check_string && byte == self.string_separator {
188                if in_string {
189                    // Check if this is an escaped double quote ("")
190                    if pos + 1 < slice.len() && slice[pos + 1] == self.string_separator {
191                        // Skip escaped quote
192                        pos += 2;
193                        continue;
194                    } else {
195                        in_string = false;
196                        end_offset = 1; // Potential quote at the end
197                        pos += 1;
198                        continue;
199                    }
200                } else {
201                    // Unexpected string separator if outside string, treat normal
202                    pos += 1;
203                    continue;
204                }
205            }
206
207            if byte == self.field_separator && !in_string {
208                let field = &slice[start_offset..pos - end_offset];
209                self.cursor += pos + 1;
210                return Some(field);
211            }
212
213            pos += 1;
214        }
215
216        // Last field
217        let field = &slice[start_offset..slice.len() - end_offset];
218        self.cursor = self.line.len();
219        Some(field)
220    }
221}
222
223
224
225
#[cfg(test)]
mod tests {
    use crate::decoders::decoders::Encoding::Windows1252;
    use crate::models::in_row_iter::InRowIter;

    /// Looks a field up by index, checks that the lookup leaves the cursor
    /// where it was, and then iterates over the whole row.
    #[test]
    fn test_iter_next() {
        let csv_data = b"uno;dos;3;cuatro;cinco;6;siete;ocho;9";
        let mut row = InRowIter::new(csv_data, b';', 0u8);

        let f = row
            .get_field_index(2)
            .expect("the row has more than three fields");
        assert_eq!(f, &b"3"[..]);
        // The lookup must not disturb an iteration in progress.
        assert_eq!(row.get_cursor(), 0);

        let dec = Windows1252.decode(f);
        println!("Field Data: {}", dec.as_ref());

        let fields: Vec<&[u8]> = row.collect();
        assert_eq!(fields.len(), 9);
        assert_eq!(fields[0], &b"uno"[..]);
        assert_eq!(fields[8], &b"9"[..]);
    }
}