1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
use crate::decoders::decoders::Encoding;
#[cfg(target_arch = "x86_64")]
use crate::helpers::bytes_helper::locate_line_break_avx2;
use crate::helpers::bytes_helper::locate_line_break_memchr3;
use crate::models::field::Field;
use crate::models::in_row_iter::InRowIter;
use std::borrow::Cow;
/// A single row of delimited data, viewed as a borrowed byte slice.
///
/// The row can be consumed field by field with `next_field`, or a field can be
/// looked up by position with `get_index`.
/// NOTE(review): the `'mmap` lifetime name suggests the bytes come from a
/// memory-mapped file — confirm against the caller.
#[derive(Debug)]
#[allow(dead_code)]
pub struct Row<'mmap> {
/// Raw bytes of the whole row.
slice: &'mmap [u8],
/// Byte offset into `slice` where the next sequential field read starts;
/// set back to 0 once the sequential readers report the end of the row.
cursor: usize,
/// Byte that separates one field from the next.
field_separator: u8,
/// Byte handed to every `Field` built from this row as its string delimiter.
string_delimiter: u8,
/// When `true`, `next_field` always uses the memchr3-based scanner and
/// skips the architecture-specific (AVX2 / NEON) paths.
force_mem_cacher: bool,
/// Iterator built over the same slice and separators; used by `get_index`
/// to peek a field by position.
iter: InRowIter<'mmap>,
}
impl<'mmap> Row<'mmap> {
/// ## Constructor
/// - Builds a `Row` over `slice`, with the sequential cursor at offset 0.
/// - `field_separator`: byte that separates fields.
/// - `string_delimiter`: byte passed on to every `Field` created from this row.
/// - `force_mem_cacher`: when `true`, `next_field` always uses the memchr3 scanner.
/// #### `returns` : a new `Row<'mmap>`
pub fn new(
    slice: &'mmap [u8],
    field_separator: u8,
    string_delimiter: u8,
    force_mem_cacher: bool,
) -> Self {
    Self {
        slice,
        // Sequential reads always start at the first byte of the row.
        cursor: 0,
        field_separator,
        string_delimiter,
        force_mem_cacher,
        // The positional iterator works on the same bytes and separators.
        iter: InRowIter::new(slice, field_separator, string_delimiter),
    }
}
/// ## Creates a new Empty row
pub fn new_empty() -> Self {
Self {
slice: &[],
cursor: 0,
field_separator: 0,
string_delimiter: 0,
force_mem_cacher: false,
iter: InRowIter::new(&[], 0, 0),
}
}
/// ## Is Empty
/// - Tells whether the row holds no bytes at all.
/// #### `returns` : `true` when the underlying slice has length zero.
pub fn is_empty(&self) -> bool {
self.slice.is_empty()
}
/// ## Get the raw bytes of the row
/// - Hands out the byte slice this row was built from, without copying it.
/// - The returned reference carries the `'mmap` lifetime, so it may outlive
///   the borrow of the `Row` itself.
/// #### `returns` : `&'mmap [u8]`
pub fn get_slice(&self) -> &'mmap [u8] {
    // Shared references are `Copy`; returning the field directly keeps `'mmap`.
    self.slice
}
#[allow(dead_code)]
/// ## Next Field
/// - Finds the next slice between the current cursor position and the field separator.
/// - Wraps that slice in a `Field` (together with the row's string delimiter)
///   so the field functions can be used on it.
/// - Successive calls walk the fields in order.
/// - The raw scanner is chosen as follows:
///   - `force_mem_cacher == true`: always the memchr3 scanner.
///   - x86_64: the AVX2 scanner when the CPU supports AVX2, otherwise memchr3.
///   - aarch64: the NEON scanner.
///   - any other architecture: the memchr3 scanner.
/// #### `returns` : `Some(Field)` for the next field, or `None` when the
/// selected scanner reports the end of the row (the scanners also rewind the
/// cursor to the start of the row in that case).
pub fn next_field(&mut self) -> Option<Field<'mmap>> {
    // Copy the delimiter up front so the closure below does not borrow `self`.
    let string_delimiter = self.string_delimiter;

    let raw = if self.force_mem_cacher {
        self.next_raw_memchr3()
    } else {
        // Exactly one of the blocks below survives conditional compilation
        // and becomes the value of this branch.
        #[cfg(target_arch = "x86_64")]
        {
            // On x86_64, use AVX2 when the running CPU supports it.
            if is_x86_feature_detected!("avx2") {
                // SAFETY: `new_raw_avx2` is compiled with
                // `#[target_feature(enable = "avx2")]`; the runtime check
                // above guarantees the CPU actually provides AVX2.
                unsafe { self.new_raw_avx2() }
            } else {
                // On x86_64 without AVX2, fall back to memchr3.
                self.next_raw_memchr3()
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            self.new_raw_neon()
        }
        // Without this fallback the function would have no value (and fail to
        // compile) on every architecture other than x86_64 and aarch64.
        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
        {
            self.next_raw_memchr3()
        }
    };

    raw.map(|bytes| Field::new(bytes, string_delimiter))
}
#[allow(dead_code)]
/// # Get Field by Index
/// - Receives a `usize` (zero-based index) and asks the row's internal iterator
///   to peek the field at that position.
/// - Does not move the sequential cursor used by `next_field`.
/// #### `returns`: the `Field` at `index`, or an empty `Field` (carrying the
/// row's string delimiter) when the iterator has no field at that position.
pub fn get_index(&self, index: usize) -> Field<'mmap> {
    let delimiter = self.string_delimiter;
    let peeked = &self.iter.peek_field_index(index);
    if let Some(bytes) = peeked {
        Field::new(bytes, delimiter)
    } else {
        // No field at that position: hand back an empty placeholder instead of failing.
        Field::new_empty(delimiter)
    }
}
/// ## Decodes the full line
/// - Runs the whole row (all of its bytes, not just one field) through the
///   given encoding's `decode`.
/// - `enc`: the encoding to decode the bytes with.
/// #### `returns` : the decoded text as a `Cow<str>`
pub fn decode_line(&mut self, enc: Encoding) -> Cow<'_, str> {
    let raw_line = self.slice;
    enc.decode(raw_line)
}
//------------------------------------------------------------//
//--------------------- PRIVATE ------------------------------//
//------------------------------------------------------------//
/// Extracts the next raw field with the NEON separator scanner (aarch64 only).
///
/// The scanner is run on the bytes from the cursor to the end of the row and
/// its result is used as the number of bytes to consume:
/// - `0` is treated as the end of the row: the cursor is rewound and `None` is returned.
/// - otherwise the consumed chunk is returned with one trailing `\r\n`, `\r`
///   or `\n` removed, and the cursor advances by the consumed length.
#[cfg(target_arch = "aarch64")]
fn new_raw_neon(&mut self) -> Option<&'mmap [u8]> {
    let start = self.cursor;
    // Only the unread tail of the row is scanned.
    let remaining = &self.slice[start..];

    // SAFETY: NOTE(review) — presumably the helper only requires NEON, which
    // is available on aarch64 targets; confirm against the helper's contract.
    let advance = unsafe {
        crate::helpers::bytes_helper::locate_line_break_neon(remaining, self.field_separator)
    };

    if advance == 0 {
        // End of the row: rewind so the row can be walked again.
        self.reset_cursor();
        return None;
    }

    let chunk = &self.slice[start..start + advance];

    // Number of line-break bytes to drop from the end of the chunk.
    let trailing = match chunk {
        [.., b'\r', b'\n'] => 2,
        [.., b'\n'] | [.., b'\r'] => 1,
        // Custom separator, or no separator at all.
        _ => 0,
    };

    // Step over everything the scanner consumed.
    self.cursor = start + advance;
    Some(&chunk[..chunk.len() - trailing])
}
/// Extracts the next raw field with the AVX2 separator scanner (x86_64 only).
///
/// The scanner is run on the bytes from the cursor to the end of the row and
/// its result is used as the number of bytes to consume:
/// - `0` is treated as the end of the row: the cursor is rewound and `None` is returned.
/// - otherwise the consumed chunk is returned with one trailing `\r\n`, `\r`
///   or `\n` removed, and the cursor advances by the consumed length.
///
/// # Safety
/// The function is compiled with AVX2 enabled, so the caller must make sure
/// the running CPU supports AVX2 before calling it.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn new_raw_avx2(&mut self) -> Option<&'mmap [u8]> {
    let start = self.cursor;

    // SAFETY: AVX2 availability is this function's own precondition, so the
    // AVX2 helper may be called here.
    let advance = unsafe { locate_line_break_avx2(&self.slice[start..], self.field_separator) };

    match advance {
        0 => {
            // End of the row: rewind so the row can be walked again.
            self.reset_cursor();
            None
        }
        consumed => {
            let chunk = &self.slice[start..start + consumed];
            // Drop a single trailing line break; `\r\n` is tried first so it
            // is removed as one unit.
            let field = chunk
                .strip_suffix(b"\r\n")
                .or_else(|| chunk.strip_suffix(b"\r"))
                .or_else(|| chunk.strip_suffix(b"\n"))
                .unwrap_or(chunk);
            self.cursor = start + consumed;
            Some(field)
        }
    }
}
/// Extracts the next raw field with the memchr3-based scanner.
///
/// The scanner receives the unread tail of the row, the current cursor and the
/// field separator. Its result is used as an index into the whole row:
/// - `0` is treated as the end of the row: the cursor is rewound and `None` is returned.
/// - otherwise the bytes from the cursor up to (not including) that index are
///   returned and the cursor is moved to that index.
fn next_raw_memchr3(&mut self) -> Option<&'mmap [u8]> {
    let start = self.cursor;
    // Only the unread tail of the row is handed to the scanner.
    let remaining = &self.slice[start..];

    let end = locate_line_break_memchr3(remaining, start, self.field_separator);
    if end == 0 {
        // End of the row: rewind so the row can be walked again.
        self.reset_cursor();
        return None;
    }

    // `end` is relative to the start of the row, not to `remaining`.
    let field = &self.slice[start..end];
    self.cursor = end;
    Some(field)
}
/// Rewinds the internal cursor to offset 0, so the next sequential read
/// starts again from the first byte of the row.
fn reset_cursor(&mut self) {
self.cursor = 0;
}
}