1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#![allow(dead_code)]
// Adapted from https://github.com/BurntSushi/memchr/blob/master/src/arch/generic/memchr.rs
use crate::{
Escapes, Vector,
ext::Pointer,
vector::MoveMask,
writer::{Writer, write, write_slice},
};
/// A generic structure for handling escape sequences in a vectorized manner.
///
/// # Type Parameters
/// - `E`: The escape type implementing the `Escapes` trait.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Generic<E> {
escapes: E,
}
impl<E> Generic<E>
where
E: Escapes,
{
/// The number of bytes processed per iteration in the search loop.
const LOOP_SIZE: usize = 4 * E::Vector::BYTES;
/// Creates a new `Generic` instance with the given escape handler.
///
/// # Parameters
/// - `escapes`: The escape handler to be used.
#[inline(always)]
pub(crate) fn new(escapes: E) -> Generic<E> {
Generic { escapes }
}
/// Escapes the input string by applying the escape sequences defined in the `Escapes` trait.
///
/// # Parameters
/// - `haystack`: The input string to be processed for escape sequences.
/// - `writer`: The function to write the escaped output.
///
/// # Returns
/// A `Result` indicating the success or failure of the escape operation.
#[inline(always)]
pub(crate) fn escape<R>(
&mut self,
haystack: &str,
mut writer: impl Writer<R>,
) -> Result<(), R> {
let len = haystack.len();
let cur = haystack.as_ptr();
unsafe { self.escape_raw(cur, cur.add(len), &mut writer) }
}
/// Escapes the input data between the `start` and `end` pointers.
///
/// # Parameters
/// - `start`: The starting pointer of the data to be escaped.
/// - `end`: The ending pointer of the data to be escaped.
/// - `writer`: The function to write the escaped output.
///
/// # Returns
/// A `Result` indicating the success or failure of the escape operation.
///
/// # Safety
/// This function is unsafe because it operates on raw pointers and assumes
/// that the memory between `start` and `end` is valid and properly aligned.
#[inline(always)]
pub(crate) unsafe fn escape_raw<R>(
&mut self,
start: *const u8,
end: *const u8,
writer: &mut impl Writer<R>,
) -> Result<(), R> {
unsafe {
let len = end.distance(start);
let mut written = start;
debug_assert!(
len >= E::Vector::BYTES,
"haystack has length {}, but must be at least {}",
len,
E::Vector::BYTES
);
let align = E::Vector::BYTES - (start.to_usize() & E::Vector::ALIGN);
if align > 0 {
let x = E::Vector::load_unaligned(start);
let mask = self.escapes.masking(x).movemask();
self.write_mask_unaligned(mask, start, align, &mut written, writer)?;
}
// Set `cur` to the first V-aligned pointer greater than `start`.
let mut cur = start.add(align);
debug_assert!(cur > start && end.sub(E::Vector::BYTES) >= start);
if len >= Self::LOOP_SIZE {
while cur <= end.sub(Self::LOOP_SIZE) {
debug_assert_eq!(0, cur.to_usize() % E::Vector::BYTES);
let a = E::Vector::load_aligned(cur);
let b = E::Vector::load_aligned(cur.add(E::Vector::BYTES));
let c = E::Vector::load_aligned(cur.add(2 * E::Vector::BYTES));
let d = E::Vector::load_aligned(cur.add(3 * E::Vector::BYTES));
let eqa = self.escapes.masking(a);
let eqb = self.escapes.masking(b);
let eqc = self.escapes.masking(c);
let eqd = self.escapes.masking(d);
let or1 = eqa.or(eqb);
let or2 = eqc.or(eqd);
let or3 = or1.or(or2);
if or3.movemask_will_have_non_zero() {
self.write_mask(eqa.movemask(), cur, &mut written, writer)?;
self.write_mask(
eqb.movemask(),
cur.add(E::Vector::BYTES),
&mut written,
writer,
)?;
self.write_mask(
eqc.movemask(),
cur.add(E::Vector::BYTES * 2),
&mut written,
writer,
)?;
self.write_mask(
eqd.movemask(),
cur.add(E::Vector::BYTES * 3),
&mut written,
writer,
)?;
}
cur = cur.add(Self::LOOP_SIZE);
}
}
// Handle any leftovers after the aligned loop above.
while cur <= end.sub(E::Vector::BYTES) {
debug_assert!(end.distance(cur) >= E::Vector::BYTES);
let v = E::Vector::load_aligned(cur);
let mask = self.escapes.masking(v).movemask();
self.write_mask(mask, cur, &mut written, writer)?;
cur = cur.add(E::Vector::BYTES);
}
// Handle any remaining bytes that are less than a full vector's worth.
if cur < end {
debug_assert!(end.distance(cur) < E::Vector::BYTES);
let rest = (E::Vector::BYTES - end.distance(cur)) as u32;
let start = cur.sub(E::Vector::BYTES - end.distance(cur));
debug_assert_eq!(end.distance(start), E::Vector::BYTES);
let x = E::Vector::load_unaligned(start);
let mask = self.escapes.masking(x).movemask().shr(rest);
self.write_mask(mask, cur, &mut written, writer)?;
}
if written < end {
write_slice(written, end, writer)?;
}
Ok(())
}
}
/// Writes a single step of the escape process, handling any necessary escapes.
///
/// # Parameters
/// - `mask`: The mask indicating which bytes need to be escaped.
/// - `cur`: The current pointer in the data.
/// - `offset`: The offset from the current pointer.
/// - `written`: A mutable reference to the pointer indicating the last written position.
/// - `writer`: The function to write the escaped output.
///
/// # Returns
/// A `Result` containing the updated mask after clearing the least significant bit.
///
/// # Safety
/// This function is unsafe because it operates on raw pointers and assumes
/// that the memory is valid.
#[inline(always)]
unsafe fn write_step<R>(
mask: <<E as Escapes>::Vector as Vector>::Mask,
cur: *const u8,
offset: usize,
written: &mut *const u8,
writer: &mut impl Writer<R>,
) -> Result<<<E as Escapes>::Vector as Vector>::Mask, R> {
unsafe {
let c = E::position(*cur.add(offset));
if !E::FALSE_POSITIVE || c < E::ESCAPE_LEN {
let at = cur.add(offset);
if *written < at {
write_slice(*written, at, writer)?;
}
write(E::escape(c), writer)?;
*written = at.add(1);
}
Ok(mask.clear_least_significant_bit())
}
}
/// A helper function to write the escape mask, handling both aligned and unaligned data.
///
/// # Parameters
/// - `mask`: The mask indicating which bytes need to be escaped.
/// - `cur`: The current pointer in the data.
/// - `limit`: The limit up to which the mask should be processed.
/// - `written`: A mutable reference to the pointer indicating the last written position.
/// - `writer`: The function to write the escaped output.
///
/// # Returns
/// A `Result` indicating the success or failure of the write operation.
///
/// # Safety
/// This function is unsafe because it operates on raw pointers and assumes
/// that the memory is valid.
#[inline(always)]
unsafe fn write_mask_helper<R>(
&mut self,
mut mask: <<E as Escapes>::Vector as Vector>::Mask,
cur: *const u8,
limit: usize,
written: &mut *const u8,
writer: &mut impl Writer<R>,
) -> Result<(), R> {
unsafe {
if mask.has_non_zero() {
let mut offset = mask.first_offset();
while offset < limit {
mask = Self::write_step(mask, cur, offset, written, writer)?;
if !mask.has_non_zero() {
break;
}
offset = mask.first_offset();
}
}
Ok(())
}
}
/// Writes the escape mask for unaligned data.
///
/// # Parameters
/// - `mask`: The mask indicating which bytes need to be escaped.
/// - `cur`: The current pointer in the data.
/// - `align`: The alignment offset.
/// - `written`: A mutable reference to the pointer indicating the last written position.
/// - `writer`: The function to write the escaped output.
///
/// # Returns
/// A `Result` indicating the success or failure of the write operation.
///
/// # Safety
/// This function is unsafe because it operates on raw pointers and assumes
/// that the memory is valid.
#[inline(always)]
unsafe fn write_mask_unaligned<R>(
&mut self,
mask: <<E as Escapes>::Vector as Vector>::Mask,
cur: *const u8,
align: usize,
written: &mut *const u8,
writer: &mut impl Writer<R>,
) -> Result<(), R> {
unsafe { self.write_mask_helper(mask, cur, align, written, writer) }
}
/// Writes the escape mask for aligned data.
///
/// # Parameters
/// - `mask`: The mask indicating which bytes need to be escaped.
/// - `cur`: The current pointer in the data.
/// - `written`: A mutable reference to the pointer indicating the last written position.
/// - `writer`: The function to write the escaped output.
///
/// # Returns
/// A `Result` indicating the success or failure of the write operation.
///
/// # Safety
/// This function is unsafe because it operates on raw pointers and assumes
/// that the memory is valid.
#[inline(always)]
unsafe fn write_mask<R>(
&mut self,
mask: <<E as Escapes>::Vector as Vector>::Mask,
cur: *const u8,
written: &mut *const u8,
writer: &mut impl Writer<R>,
) -> Result<(), R> {
unsafe { self.write_mask_helper(mask, cur, usize::MAX, written, writer) }
}
}