rsonpath/input.rs
1//! Input structures that can be fed into an [`Engine`](crate::engine::Engine).
2//!
3//! The engine itself is generic in the [`Input`] trait declared here.
4//! There are a couple of different built-in implementations, each
5//! suitable for a different scenario. Consult the module-level
6//! documentation of each type to determine which to use. Here's a quick
7//! cheat-sheet:
8//!
9//! | Input scenario | Type to use |
10//! |:------------------------------|:------------------|
11//! | memory based | [`BorrowedBytes`] |
12//! | memory based, take ownership | [`OwnedBytes`] |
13//! | file based | [`MmapInput`] |
14//! | [`Read`](std::io::Read) based | [`BufferedInput`] |
15
16pub mod borrowed;
17pub mod buffered;
18pub mod error;
19pub mod mmap;
20pub mod owned;
21mod padding;
22mod slice;
23pub use borrowed::BorrowedBytes;
24pub use buffered::BufferedInput;
25pub use mmap::MmapInput;
26pub use owned::OwnedBytes;
27
28use self::error::InputError;
29use crate::{result::InputRecorder, string_pattern::StringPattern};
30use std::ops::Deref;
31
32/// Make the struct repr(C) with alignment equal to [`MAX_BLOCK_SIZE`].
33macro_rules! repr_align_block_size {
34 ($it:item) => {
35 #[repr(C, align(128))]
36 $it
37 };
38}
39pub(crate) use repr_align_block_size;
40
41/// Global padding guarantee for all [`Input`] implementations.
42/// Iterating over blocks of at most this size is guaranteed
43/// to produce only full blocks.
44///
45/// # Remarks
46/// This is set to `128` and unlikely to change.
47/// Widest available SIMD is AVX512, which has 64-byte blocks.
48/// The engine processes blocks in pairs, thus 128 is the highest possible request made to a block iterator.
49/// For this value to change a new, wider SIMD implementation would have to appear.
50pub const MAX_BLOCK_SIZE: usize = 128;
51
52/// UTF-8 encoded bytes representing a JSON document that support
53/// block-by-block iteration and basic seeking procedures.
54pub trait Input: Sized {
55 /// Type of the iterator used by [`iter_blocks`](Input::iter_blocks), parameterized
56 /// by the lifetime of source input and the size of the block.
57 type BlockIterator<'i, 'r, R, const N: usize>: InputBlockIterator<
58 'i,
59 N,
60 Block = Self::Block<'i, N>,
61 Error = Self::Error,
62 >
63 where
64 Self: 'i,
65 R: InputRecorder<Self::Block<'i, N>> + 'r;
66
67 /// Type of errors that can occur when operating on this [`Input`].
68 type Error: Into<InputError>;
69
70 /// Type of the blocks returned by the `BlockIterator`.
71 type Block<'i, const N: usize>: InputBlock<'i, N>
72 where
73 Self: 'i;
74
75 /// Return the length of the entire input, if known.
76 ///
77 /// This is meant to be used merely as a hint.
78 /// There are [`Input`] implementations that may not be able to know the entire
79 /// length a priori, and they should return [`None`].
80 #[inline(always)]
81 #[must_use]
82 fn len_hint(&self) -> Option<usize> {
83 None
84 }
85
86 /// Return the length of the padding added at the start of the input.
87 ///
88 /// This depends on the particular [`Input`] implementation, and may be zero.
89 /// In any case the length of the entire input should be equivalent to the length
90 /// of the source plus [`leading_padding_len`](`Input::leading_padding_len`) plus
91 /// [`trailing_padding_len`](`Input::trailing_padding_len`).
92 #[must_use]
93 fn leading_padding_len(&self) -> usize;
94
95 /// Return the length of the padding added at the end of the input.
96 ///
97 /// This depends on the particular [`Input`] implementation, and may be zero.
98 /// In any case the length of the entire input should be equivalent to the length
99 /// of the source plus [`leading_padding_len`](`Input::leading_padding_len`) plus
100 /// [`trailing_padding_len`](`Input::trailing_padding_len`).
101 #[must_use]
102 fn trailing_padding_len(&self) -> usize;
103
104 /// Iterate over blocks of size `N` of the input.
105 /// `N` has to be a power of two larger than 1.
106 #[must_use]
107 fn iter_blocks<'i, 'r, R, const N: usize>(&'i self, recorder: &'r R) -> Self::BlockIterator<'i, 'r, R, N>
108 where
109 R: InputRecorder<Self::Block<'i, N>>;
110
111 /// Search for an occurrence of `needle` in the input,
112 /// starting from `from` and looking back. Returns the index
113 /// of the first occurrence or `None` if the `needle` was not found.
114 #[must_use]
115 fn seek_backward(&self, from: usize, needle: u8) -> Option<usize>;
116
117 /// Search for an occurrence of any of the `needles` in the input,
118 /// starting from `from` and looking forward. Returns the index
119 /// of the first occurrence and the needle found, or `None` if none of the `needles` were not found.
120 ///
121 /// # Errors
122 /// This function can read more data from the input if no relevant characters are found
123 /// in the current buffer, which can fail.
124 fn seek_forward<const N: usize>(&self, from: usize, needles: [u8; N]) -> Result<Option<(usize, u8)>, Self::Error>;
125
126 /// Search for the first byte in the input that is not ASCII whitespace
127 /// starting from `from`. Returns a pair: the index of first such byte,
128 /// and the byte itself; or `None` if no non-whitespace characters
129 /// were found.
130 ///
131 /// # Errors
132 /// This function can read more data from the input if no relevant characters are found
133 /// in the current buffer, which can fail.
134 fn seek_non_whitespace_forward(&self, from: usize) -> Result<Option<(usize, u8)>, Self::Error>;
135
136 /// Search for the first byte in the input that is not ASCII whitespace
137 /// starting from `from` and looking back. Returns a pair:
138 /// the index of first such byte, and the byte itself;
139 /// or `None` if no non-whitespace characters were found.
140 #[must_use]
141 fn seek_non_whitespace_backward(&self, from: usize) -> Option<(usize, u8)>;
142
143 /// Decide whether the slice of input between `from` (inclusive)
144 /// and `to` (exclusive) matches the `member` (comparing bitwise,
145 /// including double quotes delimiters).
146 ///
147 /// This will also check if the leading double quote is not
148 /// escaped by a backslash character.
149 ///
150 /// # Errors
151 /// This function can read more data from the input if `to` falls beyond
152 /// the range that was already read, and the read operation can fail.
153 fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> Result<bool, Self::Error>;
154}
155
156/// An iterator over blocks of input of size `N`.
157/// Implementations MUST guarantee that the blocks returned from `next`
158/// are *exactly* of size `N`.
159pub trait InputBlockIterator<'i, const N: usize> {
160 /// The type of blocks returned.
161 type Block: InputBlock<'i, N>;
162
163 /// Type of errors that can occur when reading from this iterator.
164 type Error: Into<InputError>;
165
166 /// Advances the iterator and returns the next value.
167 ///
168 /// # Errors
169 /// May fail depending on the implementation.
170 fn next(&mut self) -> Result<Option<Self::Block>, Self::Error>;
171
172 /// Get the offset of the iterator in the input.
173 ///
174 /// The offset is the starting point of the block that will be returned next
175 /// from this iterator, if any. It starts as 0 and increases by `N` on every
176 /// block retrieved.
177 fn get_offset(&self) -> usize;
178
179 /// Offset the iterator by `count` full blocks forward.
180 ///
181 /// The `count` parameter must be greater than 0.
182 fn offset(&mut self, count: isize);
183}
184
185/// A block of bytes of size `N` returned from [`InputBlockIterator`].
186pub trait InputBlock<'i, const N: usize>: Deref<Target = [u8]> {
187 /// Split the block in half, giving two slices of size `N`/2.
188 #[must_use]
189 fn halves(&self) -> (&[u8], &[u8]);
190
191 /// Split the block in four, giving four slices of size `N`/4.
192 #[inline]
193 #[must_use]
194 fn quarters(&self) -> (&[u8], &[u8], &[u8], &[u8]) {
195 assert_eq!(N % 4, 0);
196 let (half1, half2) = self.halves();
197 let (q1, q2) = (&half1[..N / 4], &half1[N / 4..]);
198 let (q3, q4) = (&half2[..N / 4], &half2[N / 4..]);
199
200 (q1, q2, q3, q4)
201 }
202}
203
204impl<'i, const N: usize> InputBlock<'i, N> for &'i [u8] {
205 #[inline(always)]
206 fn halves(&self) -> (&[u8], &[u8]) {
207 assert_eq!(N % 2, 0);
208 (&self[..N / 2], &self[N / 2..])
209 }
210}
211
212pub(super) trait SliceSeekable {
213 fn is_member_match(&self, from: usize, to: usize, member: &StringPattern) -> bool;
214
215 fn seek_backward(&self, from: usize, needle: u8) -> Option<usize>;
216
217 fn seek_forward<const N: usize>(&self, from: usize, needles: [u8; N]) -> Option<(usize, u8)>;
218
219 fn seek_non_whitespace_forward(&self, from: usize) -> Option<(usize, u8)>;
220
221 fn seek_non_whitespace_backward(&self, from: usize) -> Option<(usize, u8)>;
222}
223
224// This is mostly adapted from [slice::align_to](https://doc.rust-lang.org/std/primitive.slice.html#method.align_to).
225fn align_to<const N: usize>(bytes: &[u8]) -> (&[u8], &[u8], &[u8]) {
226 let ptr = bytes.as_ptr();
227 let offset = ptr.align_offset(N);
228 if offset > bytes.len() {
229 (bytes, &[], &[])
230 } else {
231 let (left, rest) = bytes.split_at(offset);
232 let middle_len = (rest.len() / N) * N;
233 let (middle, right) = rest.split_at(middle_len);
234
235 (left, middle, right)
236 }
237}
238
239#[cfg(test)]
240mod tests {
241 use super::align_to;
242 use crate::input::MAX_BLOCK_SIZE;
243
244 // Run all tests for the actual alignment we use.
245 const N: usize = MAX_BLOCK_SIZE;
246 const SIZE: usize = 1024;
247
248 #[repr(align(128))]
249 struct Aligned {
250 bytes: [u8; SIZE],
251 }
252
253 #[test]
254 fn test_all_alignments() {
255 // We construct a byte array that is already aligned,
256 // and then take all suffixes for all possible misalignments
257 // and small sizes.
258 let aligned = Aligned { bytes: get_bytes() };
259 let slice = &aligned.bytes;
260
261 for i in 0..slice.len() {
262 let misalignment = i % N;
263 test_with_misalignment(misalignment, &slice[i..]);
264 }
265 }
266
267 fn get_bytes() -> [u8; SIZE] {
268 let mut bytes = [0; SIZE];
269
270 for (i, b) in bytes.iter_mut().enumerate() {
271 let x = i % (u8::MAX as usize);
272 *b = x as u8;
273 }
274
275 bytes
276 }
277
278 fn test_with_misalignment(misalignment: usize, slice: &[u8]) {
279 let expected_left_len = (N - misalignment) % N;
280 let expected_rem_len = slice.len() - expected_left_len;
281 let expected_middle_len = (expected_rem_len / N) * N;
282 let expected_right_len = expected_rem_len - expected_middle_len;
283
284 let (left, middle, right) = align_to::<N>(slice);
285 let glued_back: Vec<_> = [left, middle, right].into_iter().flatten().copied().collect();
286
287 assert_eq!(left.len(), expected_left_len, "misalignment = {misalignment}");
288 assert_eq!(middle.len(), expected_middle_len, "misalignment = {misalignment}");
289 assert_eq!(right.len(), expected_right_len, "misalignment = {misalignment}");
290 assert_eq!(glued_back, slice, "misalignment = {misalignment}");
291 }
292}